diff --git a/.github/workflows/os-check.yml b/.github/workflows/os-check.yml
index a6b9b945940..f5bf407efad 100644
--- a/.github/workflows/os-check.yml
+++ b/.github/workflows/os-check.yml
@@ -542,6 +542,12 @@ jobs:
fail-fast: false
matrix:
arch: [ x64, Win32, ARM64 ]
+ asm: [ false ]
+ include:
+ # Intel assembly build (x64 only): assembles the crypto .asm files
+ # and enables the matching USE_INTEL_SPEEDUP code paths.
+ - arch: x64
+ asm: true
# This should be a safe limit for the tests to run.
timeout-minutes: 6
env:
@@ -566,7 +572,7 @@ jobs:
working-directory: ${{env.GITHUB_WORKSPACE}}
# Add additional options to the MSBuild command line here (like platform or verbosity level).
# See https://docs.microsoft.com/visualstudio/msbuild/msbuild-command-line-reference
- run: msbuild /m /p:PlatformToolset=v142 /p:Platform=${{matrix.arch}} /p:Configuration=${{env.BUILD_CONFIGURATION}} ${{env.SOLUTION_FILE_PATH}}
+ run: msbuild /m /p:PlatformToolset=v142 /p:Platform=${{matrix.arch}} /p:Configuration=${{env.BUILD_CONFIGURATION}} /p:WolfSSLIntelAsm=${{matrix.asm}} ${{env.SOLUTION_FILE_PATH}}
- if: ${{ matrix.arch != 'ARM64' }}
name: Run Test
diff --git a/.github/workflows/win-csharp-test.yml b/.github/workflows/win-csharp-test.yml
index d37637e566e..001ac0fd96a 100644
--- a/.github/workflows/win-csharp-test.yml
+++ b/.github/workflows/win-csharp-test.yml
@@ -13,6 +13,13 @@ jobs:
if: ${{ (github.repository_owner == 'wolfssl') && (github.event_name != 'pull_request' || github.event.pull_request.draft == false) }}
runs-on: windows-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ # asm=false builds pure C; asm=true assembles the crypto .asm files and
+ # enables the USE_INTEL_SPEEDUP code paths (x64).
+ asm: [ false, true ]
+
# This should be a safe limit for the tests to run.
timeout-minutes: 6
@@ -48,7 +55,7 @@ jobs:
working-directory: ${{env.GITHUB_WORKSPACE}}
# Add additional options to the MSBuild command line here (like platform or verbosity level).
# See https://docs.microsoft.com/visualstudio/msbuild/msbuild-command-line-reference
- run: msbuild /m /p:PlatformToolset=v142 /p:Platform=${{env.BUILD_PLATFORM}} /p:Configuration=${{env.BUILD_CONFIGURATION}} ${{env.SOLUTION_FILE_PATH}}
+ run: msbuild /m /p:PlatformToolset=v142 /p:Platform=${{env.BUILD_PLATFORM}} /p:Configuration=${{env.BUILD_CONFIGURATION}} /p:WolfSSLIntelAsm=${{matrix.asm}} ${{env.SOLUTION_FILE_PATH}}
- name: Run wolfCrypt test
working-directory: ${{env.GITHUB_WORKSPACE}}wolfssl\wrapper\CSharp\Debug\x64\
diff --git a/examples/client/client.vcxproj b/examples/client/client.vcxproj
index 0843627d584..d6a21467c1c 100644
--- a/examples/client/client.vcxproj
+++ b/examples/client/client.vcxproj
@@ -478,6 +478,14 @@
+
+
+
+ USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions)
+
+
diff --git a/examples/echoclient/echoclient.vcxproj b/examples/echoclient/echoclient.vcxproj
index 68eb81b1d5c..233b1cdbd28 100644
--- a/examples/echoclient/echoclient.vcxproj
+++ b/examples/echoclient/echoclient.vcxproj
@@ -478,6 +478,14 @@
+
+
+
+ USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions)
+
+
diff --git a/examples/echoserver/echoserver.vcxproj b/examples/echoserver/echoserver.vcxproj
index 68c4f16800a..29f440f56ce 100644
--- a/examples/echoserver/echoserver.vcxproj
+++ b/examples/echoserver/echoserver.vcxproj
@@ -478,6 +478,14 @@
+
+
+
+ USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions)
+
+
diff --git a/examples/server/server.vcxproj b/examples/server/server.vcxproj
index 3695fc1eb6b..9343976a6d7 100644
--- a/examples/server/server.vcxproj
+++ b/examples/server/server.vcxproj
@@ -478,6 +478,14 @@
+
+
+
+ USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions)
+
+
diff --git a/sslSniffer/sslSniffer.vcxproj b/sslSniffer/sslSniffer.vcxproj
index 88bbc963fe4..4925b99b832 100644
--- a/sslSniffer/sslSniffer.vcxproj
+++ b/sslSniffer/sslSniffer.vcxproj
@@ -256,6 +256,14 @@
false
+
+
+
+ USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions)
+
+
diff --git a/sslSniffer/sslSnifferTest/sslSniffTest.vcxproj b/sslSniffer/sslSnifferTest/sslSniffTest.vcxproj
index 8d4cb32aca1..f98f33cc1fd 100644
--- a/sslSniffer/sslSnifferTest/sslSniffTest.vcxproj
+++ b/sslSniffer/sslSnifferTest/sslSniffTest.vcxproj
@@ -1,263 +1,271 @@
-
-
-
-
- Debug
- Win32
-
-
- Debug
- x64
-
-
- Debug
- ARM64
-
-
- Release
- Win32
-
-
- Release
- x64
-
-
- Release
- ARM64
-
-
-
- {8C89E16E-9C36-45EF-A491-F4EBD4A8D8F1}
- sslSniffTest
- Win32Proj
- 10.0
-
-
-
- Application
- v141
- Unicode
- true
-
-
- Application
- v141
- Unicode
- true
-
-
- Application
- v141
- Unicode
- true
-
-
- Application
- v141
- Unicode
-
-
- Application
- v141
- Unicode
-
-
- Application
- v141
- Unicode
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- <_ProjectFileVersion>15.0.28307.799
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
- true
- snifftest
-
-
- true
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
- $(SolutionDir)$(Configuration)\$(Platform)\
- snifftest
-
-
- true
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
- $(SolutionDir)$(Configuration)\$(Platform)\
- snifftest
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
- false
- snifftest
-
-
- false
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
- $(SolutionDir)$(Configuration)\$(Platform)\
- snifftest
-
-
- false
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
- $(SolutionDir)$(Configuration)\$(Platform)\
- snifftest
-
-
-
- Disabled
- ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
- WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
- true
- EnableFastChecks
- MultiThreadedDebugDLL
-
- Level3
- EditAndContinue
-
-
- wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
- ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
- true
- Console
- MachineX86
-
-
-
-
- Disabled
- ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
- WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
- EnableFastChecks
- MultiThreadedDebugDLL
-
-
- Level3
- ProgramDatabase
-
-
- wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
- ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
- true
- Console
-
-
-
-
- Disabled
- ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
- WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
- EnableFastChecks
- MultiThreadedDebugDLL
-
-
- Level3
- ProgramDatabase
-
-
- wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
- ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
- true
- Console
-
-
-
-
- MaxSpeed
- true
- ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
- WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
- MultiThreadedDLL
- true
-
- Level3
- ProgramDatabase
-
-
- wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
- ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
- true
- Console
- true
- true
- MachineX86
-
-
-
-
- MaxSpeed
- true
- ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
- WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
- MultiThreadedDLL
- true
-
-
- Level3
- ProgramDatabase
-
-
- wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
- ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
- true
- Console
- true
- true
-
-
-
-
- MaxSpeed
- true
- ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
- WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
- MultiThreadedDLL
- true
-
-
- Level3
- ProgramDatabase
-
-
- wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
- ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
- true
- Console
- true
- true
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+ Debug
+ Win32
+
+
+ Debug
+ x64
+
+
+ Debug
+ ARM64
+
+
+ Release
+ Win32
+
+
+ Release
+ x64
+
+
+ Release
+ ARM64
+
+
+
+ {8C89E16E-9C36-45EF-A491-F4EBD4A8D8F1}
+ sslSniffTest
+ Win32Proj
+ 10.0
+
+
+
+ Application
+ v141
+ Unicode
+ true
+
+
+ Application
+ v141
+ Unicode
+ true
+
+
+ Application
+ v141
+ Unicode
+ true
+
+
+ Application
+ v141
+ Unicode
+
+
+ Application
+ v141
+ Unicode
+
+
+ Application
+ v141
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <_ProjectFileVersion>15.0.28307.799
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+ true
+ snifftest
+
+
+ true
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ snifftest
+
+
+ true
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ snifftest
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+ false
+ snifftest
+
+
+ false
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ snifftest
+
+
+ false
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ snifftest
+
+
+
+ Disabled
+ ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
+ WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
+ true
+ EnableFastChecks
+ MultiThreadedDebugDLL
+
+ Level3
+ EditAndContinue
+
+
+ wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
+ ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
+ true
+ Console
+ MachineX86
+
+
+
+
+ Disabled
+ ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
+ WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
+ EnableFastChecks
+ MultiThreadedDebugDLL
+
+
+ Level3
+ ProgramDatabase
+
+
+ wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
+ ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
+ true
+ Console
+
+
+
+
+ Disabled
+ ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
+ WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
+ EnableFastChecks
+ MultiThreadedDebugDLL
+
+
+ Level3
+ ProgramDatabase
+
+
+ wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
+ ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
+ true
+ Console
+
+
+
+
+ MaxSpeed
+ true
+ ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
+ WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
+ MultiThreadedDLL
+ true
+
+ Level3
+ ProgramDatabase
+
+
+ wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
+ ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
+ true
+ Console
+ true
+ true
+ MachineX86
+
+
+
+
+ MaxSpeed
+ true
+ ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
+ WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
+ MultiThreadedDLL
+ true
+
+
+ Level3
+ ProgramDatabase
+
+
+ wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
+ ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
+ true
+ Console
+ true
+ true
+
+
+
+
+ MaxSpeed
+ true
+ ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories)
+ WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
+ MultiThreadedDLL
+ true
+
+
+ Level3
+ ProgramDatabase
+
+
+ wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies)
+ ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories)
+ true
+ Console
+ true
+ true
+
+
+
+
+
+
+
+
+
+
+
+ USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions)
+
+
+
+
+
\ No newline at end of file
diff --git a/testsuite/testsuite.vcxproj b/testsuite/testsuite.vcxproj
index baa2760f72c..8bc4242f0aa 100644
--- a/testsuite/testsuite.vcxproj
+++ b/testsuite/testsuite.vcxproj
@@ -484,6 +484,14 @@
+
+
+
+ USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions)
+
+
diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm
index 34f68476310..d7947b63a0d 100644
--- a/wolfcrypt/src/aes_gcm_asm.asm
+++ b/wolfcrypt/src/aes_gcm_asm.asm
@@ -43,14 +43,12 @@ ENDIF
_DATA SEGMENT
ALIGN 16
-L_GCM_generate_m0_aesni_rev8 QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_GCM_generate_m0_aesni_rev8 QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_GCM_generate_m0_aesni_rev8 QWORD L_GCM_generate_m0_aesni_rev8
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_GCM_generate_m0_aesni_mod2_128 QWORD \
- 0000000000000000h, 0e100000000000000h
+L_GCM_generate_m0_aesni_mod2_128 QWORD 0000000000000000h, 0e100000000000000h
ptr_L_GCM_generate_m0_aesni_mod2_128 QWORD L_GCM_generate_m0_aesni_mod2_128
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -298,68 +296,57 @@ GCM_generate_m0_aesni ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_gcm_one QWORD \
- 0000000000000000h, 0000000000000001h
+L_aes_gcm_one QWORD 0000000000000000h, 0000000000000001h
ptr_L_aes_gcm_one QWORD L_aes_gcm_one
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_gcm_two QWORD \
- 0000000000000000h, 0000000000000002h
+L_aes_gcm_two QWORD 0000000000000000h, 0000000000000002h
ptr_L_aes_gcm_two QWORD L_aes_gcm_two
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_gcm_three QWORD \
- 0000000000000000h, 0000000000000003h
+L_aes_gcm_three QWORD 0000000000000000h, 0000000000000003h
ptr_L_aes_gcm_three QWORD L_aes_gcm_three
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_gcm_four QWORD \
- 0000000000000000h, 0000000000000004h
+L_aes_gcm_four QWORD 0000000000000000h, 0000000000000004h
ptr_L_aes_gcm_four QWORD L_aes_gcm_four
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_gcm_five QWORD \
- 0000000000000000h, 0000000000000005h
+L_aes_gcm_five QWORD 0000000000000000h, 0000000000000005h
ptr_L_aes_gcm_five QWORD L_aes_gcm_five
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_gcm_six QWORD \
- 0000000000000000h, 0000000000000006h
+L_aes_gcm_six QWORD 0000000000000000h, 0000000000000006h
ptr_L_aes_gcm_six QWORD L_aes_gcm_six
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_gcm_seven QWORD \
- 0000000000000000h, 0000000000000007h
+L_aes_gcm_seven QWORD 0000000000000000h, 0000000000000007h
ptr_L_aes_gcm_seven QWORD L_aes_gcm_seven
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_gcm_eight QWORD \
- 0000000000000000h, 0000000000000008h
+L_aes_gcm_eight QWORD 0000000000000000h, 0000000000000008h
ptr_L_aes_gcm_eight QWORD L_aes_gcm_eight
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_gcm_bswap_epi64 QWORD \
- 0001020304050607h, 08090a0b0c0d0e0fh
+L_aes_gcm_bswap_epi64 QWORD 0001020304050607h, 08090a0b0c0d0e0fh
ptr_L_aes_gcm_bswap_epi64 QWORD L_aes_gcm_bswap_epi64
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_gcm_bswap_mask QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_aes_gcm_bswap_mask QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_aes_gcm_bswap_mask QWORD L_aes_gcm_bswap_mask
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_gcm_mod2_128 QWORD \
- 0000000000000001h, 0c200000000000000h
+L_aes_gcm_mod2_128 QWORD 0000000000000001h, 0c200000000000000h
ptr_L_aes_gcm_mod2_128 QWORD L_aes_gcm_mod2_128
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -6472,14 +6459,12 @@ _TEXT ENDS
IFDEF HAVE_INTEL_AVX1
_DATA SEGMENT
ALIGN 16
-L_GCM_generate_m0_avx1_rev8 QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_GCM_generate_m0_avx1_rev8 QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_GCM_generate_m0_avx1_rev8 QWORD L_GCM_generate_m0_avx1_rev8
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_GCM_generate_m0_avx1_mod2_128 QWORD \
- 0000000000000000h, 0e100000000000000h
+L_GCM_generate_m0_avx1_mod2_128 QWORD 0000000000000000h, 0e100000000000000h
ptr_L_GCM_generate_m0_avx1_mod2_128 QWORD L_GCM_generate_m0_avx1_mod2_128
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -6693,68 +6678,57 @@ GCM_generate_m0_avx1 ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_gcm_one QWORD \
- 0000000000000000h, 0000000000000001h
+L_avx1_aes_gcm_one QWORD 0000000000000000h, 0000000000000001h
ptr_L_avx1_aes_gcm_one QWORD L_avx1_aes_gcm_one
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_gcm_two QWORD \
- 0000000000000000h, 0000000000000002h
+L_avx1_aes_gcm_two QWORD 0000000000000000h, 0000000000000002h
ptr_L_avx1_aes_gcm_two QWORD L_avx1_aes_gcm_two
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_gcm_three QWORD \
- 0000000000000000h, 0000000000000003h
+L_avx1_aes_gcm_three QWORD 0000000000000000h, 0000000000000003h
ptr_L_avx1_aes_gcm_three QWORD L_avx1_aes_gcm_three
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_gcm_four QWORD \
- 0000000000000000h, 0000000000000004h
+L_avx1_aes_gcm_four QWORD 0000000000000000h, 0000000000000004h
ptr_L_avx1_aes_gcm_four QWORD L_avx1_aes_gcm_four
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_gcm_five QWORD \
- 0000000000000000h, 0000000000000005h
+L_avx1_aes_gcm_five QWORD 0000000000000000h, 0000000000000005h
ptr_L_avx1_aes_gcm_five QWORD L_avx1_aes_gcm_five
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_gcm_six QWORD \
- 0000000000000000h, 0000000000000006h
+L_avx1_aes_gcm_six QWORD 0000000000000000h, 0000000000000006h
ptr_L_avx1_aes_gcm_six QWORD L_avx1_aes_gcm_six
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_gcm_seven QWORD \
- 0000000000000000h, 0000000000000007h
+L_avx1_aes_gcm_seven QWORD 0000000000000000h, 0000000000000007h
ptr_L_avx1_aes_gcm_seven QWORD L_avx1_aes_gcm_seven
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_gcm_eight QWORD \
- 0000000000000000h, 0000000000000008h
+L_avx1_aes_gcm_eight QWORD 0000000000000000h, 0000000000000008h
ptr_L_avx1_aes_gcm_eight QWORD L_avx1_aes_gcm_eight
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_gcm_bswap_epi64 QWORD \
- 0001020304050607h, 08090a0b0c0d0e0fh
+L_avx1_aes_gcm_bswap_epi64 QWORD 0001020304050607h, 08090a0b0c0d0e0fh
ptr_L_avx1_aes_gcm_bswap_epi64 QWORD L_avx1_aes_gcm_bswap_epi64
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_gcm_bswap_mask QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_avx1_aes_gcm_bswap_mask QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_avx1_aes_gcm_bswap_mask QWORD L_avx1_aes_gcm_bswap_mask
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_gcm_mod2_128 QWORD \
- 0000000000000001h, 0c200000000000000h
+L_avx1_aes_gcm_mod2_128 QWORD 0000000000000001h, 0c200000000000000h
ptr_L_avx1_aes_gcm_mod2_128 QWORD L_avx1_aes_gcm_mod2_128
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -11933,14 +11907,12 @@ ENDIF
IFDEF HAVE_INTEL_AVX2
_DATA SEGMENT
ALIGN 16
-L_GCM_generate_m0_avx2_rev8 QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_GCM_generate_m0_avx2_rev8 QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_GCM_generate_m0_avx2_rev8 QWORD L_GCM_generate_m0_avx2_rev8
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_GCM_generate_m0_avx2_mod2_128 QWORD \
- 0000000000000000h, 0e100000000000000h
+L_GCM_generate_m0_avx2_mod2_128 QWORD 0000000000000000h, 0e100000000000000h
ptr_L_GCM_generate_m0_avx2_mod2_128 QWORD L_GCM_generate_m0_avx2_mod2_128
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -12154,74 +12126,62 @@ GCM_generate_m0_avx2 ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_one QWORD \
- 0000000000000000h, 0000000000000001h
+L_avx2_aes_gcm_one QWORD 0000000000000000h, 0000000000000001h
ptr_L_avx2_aes_gcm_one QWORD L_avx2_aes_gcm_one
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_two QWORD \
- 0000000000000000h, 0000000000000002h
+L_avx2_aes_gcm_two QWORD 0000000000000000h, 0000000000000002h
ptr_L_avx2_aes_gcm_two QWORD L_avx2_aes_gcm_two
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_three QWORD \
- 0000000000000000h, 0000000000000003h
+L_avx2_aes_gcm_three QWORD 0000000000000000h, 0000000000000003h
ptr_L_avx2_aes_gcm_three QWORD L_avx2_aes_gcm_three
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_four QWORD \
- 0000000000000000h, 0000000000000004h
+L_avx2_aes_gcm_four QWORD 0000000000000000h, 0000000000000004h
ptr_L_avx2_aes_gcm_four QWORD L_avx2_aes_gcm_four
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_five QWORD \
- 0000000000000000h, 0000000000000005h
+L_avx2_aes_gcm_five QWORD 0000000000000000h, 0000000000000005h
ptr_L_avx2_aes_gcm_five QWORD L_avx2_aes_gcm_five
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_six QWORD \
- 0000000000000000h, 0000000000000006h
+L_avx2_aes_gcm_six QWORD 0000000000000000h, 0000000000000006h
ptr_L_avx2_aes_gcm_six QWORD L_avx2_aes_gcm_six
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_seven QWORD \
- 0000000000000000h, 0000000000000007h
+L_avx2_aes_gcm_seven QWORD 0000000000000000h, 0000000000000007h
ptr_L_avx2_aes_gcm_seven QWORD L_avx2_aes_gcm_seven
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_eight QWORD \
- 0000000000000000h, 0000000000000008h
+L_avx2_aes_gcm_eight QWORD 0000000000000000h, 0000000000000008h
ptr_L_avx2_aes_gcm_eight QWORD L_avx2_aes_gcm_eight
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_bswap_one QWORD \
- 0000000000000000h, 0100000000000000h
+L_avx2_aes_gcm_bswap_one QWORD 0000000000000000h, 0100000000000000h
ptr_L_avx2_aes_gcm_bswap_one QWORD L_avx2_aes_gcm_bswap_one
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_bswap_epi64 QWORD \
- 0001020304050607h, 08090a0b0c0d0e0fh
+L_avx2_aes_gcm_bswap_epi64 QWORD 0001020304050607h, 08090a0b0c0d0e0fh
ptr_L_avx2_aes_gcm_bswap_epi64 QWORD L_avx2_aes_gcm_bswap_epi64
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_bswap_mask QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_avx2_aes_gcm_bswap_mask QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_avx2_aes_gcm_bswap_mask QWORD L_avx2_aes_gcm_bswap_mask
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx2_aes_gcm_mod2_128 QWORD \
- 0000000000000001h, 0c200000000000000h
+L_avx2_aes_gcm_mod2_128 QWORD 0000000000000001h, 0c200000000000000h
ptr_L_avx2_aes_gcm_mod2_128 QWORD L_avx2_aes_gcm_mod2_128
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -16521,42 +16481,36 @@ ENDIF
IFDEF HAVE_INTEL_VAES
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_inc_y0 QWORD \
- 0000000000000000h, 0000000000000000h,
- 0000000000000000h, 0000000000000001h
+L_vaes_aes_gcm_inc_y0 QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000001h
ptr_L_vaes_aes_gcm_inc_y0 QWORD L_vaes_aes_gcm_inc_y0
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_inc_y1 QWORD \
- 0000000000000000h, 0000000000000002h,
- 0000000000000000h, 0000000000000003h
+L_vaes_aes_gcm_inc_y1 QWORD 0000000000000000h, 0000000000000002h
+ QWORD 0000000000000000h, 0000000000000003h
ptr_L_vaes_aes_gcm_inc_y1 QWORD L_vaes_aes_gcm_inc_y1
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_inc_y2 QWORD \
- 0000000000000000h, 0000000000000004h,
- 0000000000000000h, 0000000000000005h
+L_vaes_aes_gcm_inc_y2 QWORD 0000000000000000h, 0000000000000004h
+ QWORD 0000000000000000h, 0000000000000005h
ptr_L_vaes_aes_gcm_inc_y2 QWORD L_vaes_aes_gcm_inc_y2
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_inc_y3 QWORD \
- 0000000000000000h, 0000000000000006h,
- 0000000000000000h, 0000000000000007h
+L_vaes_aes_gcm_inc_y3 QWORD 0000000000000000h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000007h
ptr_L_vaes_aes_gcm_inc_y3 QWORD L_vaes_aes_gcm_inc_y3
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_GCM_generate_m0_vaes_rev8 QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_GCM_generate_m0_vaes_rev8 QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_GCM_generate_m0_vaes_rev8 QWORD L_GCM_generate_m0_vaes_rev8
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_GCM_generate_m0_vaes_mod2_128 QWORD \
- 0000000000000000h, 0e100000000000000h
+L_GCM_generate_m0_vaes_mod2_128 QWORD 0000000000000000h, 0e100000000000000h
ptr_L_GCM_generate_m0_vaes_mod2_128 QWORD L_GCM_generate_m0_vaes_mod2_128
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -16770,68 +16724,57 @@ GCM_generate_m0_vaes ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_one QWORD \
- 0000000000000000h, 0000000000000001h
+L_vaes_aes_gcm_one QWORD 0000000000000000h, 0000000000000001h
ptr_L_vaes_aes_gcm_one QWORD L_vaes_aes_gcm_one
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_two QWORD \
- 0000000000000000h, 0000000000000002h
+L_vaes_aes_gcm_two QWORD 0000000000000000h, 0000000000000002h
ptr_L_vaes_aes_gcm_two QWORD L_vaes_aes_gcm_two
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_three QWORD \
- 0000000000000000h, 0000000000000003h
+L_vaes_aes_gcm_three QWORD 0000000000000000h, 0000000000000003h
ptr_L_vaes_aes_gcm_three QWORD L_vaes_aes_gcm_three
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_four QWORD \
- 0000000000000000h, 0000000000000004h
+L_vaes_aes_gcm_four QWORD 0000000000000000h, 0000000000000004h
ptr_L_vaes_aes_gcm_four QWORD L_vaes_aes_gcm_four
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_five QWORD \
- 0000000000000000h, 0000000000000005h
+L_vaes_aes_gcm_five QWORD 0000000000000000h, 0000000000000005h
ptr_L_vaes_aes_gcm_five QWORD L_vaes_aes_gcm_five
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_six QWORD \
- 0000000000000000h, 0000000000000006h
+L_vaes_aes_gcm_six QWORD 0000000000000000h, 0000000000000006h
ptr_L_vaes_aes_gcm_six QWORD L_vaes_aes_gcm_six
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_seven QWORD \
- 0000000000000000h, 0000000000000007h
+L_vaes_aes_gcm_seven QWORD 0000000000000000h, 0000000000000007h
ptr_L_vaes_aes_gcm_seven QWORD L_vaes_aes_gcm_seven
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_eight QWORD \
- 0000000000000000h, 0000000000000008h
+L_vaes_aes_gcm_eight QWORD 0000000000000000h, 0000000000000008h
ptr_L_vaes_aes_gcm_eight QWORD L_vaes_aes_gcm_eight
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_bswap_epi64 QWORD \
- 0001020304050607h, 08090a0b0c0d0e0fh
+L_vaes_aes_gcm_bswap_epi64 QWORD 0001020304050607h, 08090a0b0c0d0e0fh
ptr_L_vaes_aes_gcm_bswap_epi64 QWORD L_vaes_aes_gcm_bswap_epi64
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_bswap_mask QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_vaes_aes_gcm_bswap_mask QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_vaes_aes_gcm_bswap_mask QWORD L_vaes_aes_gcm_bswap_mask
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_gcm_mod2_128 QWORD \
- 0000000000000001h, 0c200000000000000h
+L_vaes_aes_gcm_mod2_128 QWORD 0000000000000001h, 0c200000000000000h
ptr_L_vaes_aes_gcm_mod2_128 QWORD L_vaes_aes_gcm_mod2_128
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -17587,7 +17530,7 @@ L_AES_GCM_encrypt_vaes_loop_256:
lea rcx, QWORD PTR [rsi+rbx]
mov QWORD PTR [rsp+544], rcx
vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
- vbroadcasti128 ymm4, [rsp+512]
+ vbroadcasti128 ymm4, OWORD PTR [rsp+512]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -17599,81 +17542,81 @@ L_AES_GCM_encrypt_vaes_loop_256:
vmovdqu xmm7, OWORD PTR [rsp+512]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [rsp+512], xmm7
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+16]
+ vbroadcasti128 ymm4, OWORD PTR [r15+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+32]
+ vbroadcasti128 ymm4, OWORD PTR [r15+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+48]
+ vbroadcasti128 ymm4, OWORD PTR [r15+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+64]
+ vbroadcasti128 ymm4, OWORD PTR [r15+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+80]
+ vbroadcasti128 ymm4, OWORD PTR [r15+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+96]
+ vbroadcasti128 ymm4, OWORD PTR [r15+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+112]
+ vbroadcasti128 ymm4, OWORD PTR [r15+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+128]
+ vbroadcasti128 ymm4, OWORD PTR [r15+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+144]
+ vbroadcasti128 ymm4, OWORD PTR [r15+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 11
- vbroadcasti128 ymm4, [r15+160]
+ vbroadcasti128 ymm4, OWORD PTR [r15+160]
jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+176]
+ vbroadcasti128 ymm4, OWORD PTR [r15+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 13
- vbroadcasti128 ymm4, [r15+192]
+ vbroadcasti128 ymm4, OWORD PTR [r15+192]
jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+208]
+ vbroadcasti128 ymm4, OWORD PTR [r15+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+224]
+ vbroadcasti128 ymm4, OWORD PTR [r15+224]
L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -17694,7 +17637,7 @@ L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last:
vpxor ymm3, ymm3, ymm5
vmovdqu YMMWORD PTR [rdx+96], ymm3
add ebx, 128
- vbroadcasti128 ymm4, [rsp+512]
+ vbroadcasti128 ymm4, OWORD PTR [rsp+512]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -17706,81 +17649,81 @@ L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last:
vmovdqu xmm7, OWORD PTR [rsp+512]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [rsp+512], xmm7
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+16]
+ vbroadcasti128 ymm4, OWORD PTR [r15+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+32]
+ vbroadcasti128 ymm4, OWORD PTR [r15+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+48]
+ vbroadcasti128 ymm4, OWORD PTR [r15+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+64]
+ vbroadcasti128 ymm4, OWORD PTR [r15+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+80]
+ vbroadcasti128 ymm4, OWORD PTR [r15+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+96]
+ vbroadcasti128 ymm4, OWORD PTR [r15+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+112]
+ vbroadcasti128 ymm4, OWORD PTR [r15+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+128]
+ vbroadcasti128 ymm4, OWORD PTR [r15+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+144]
+ vbroadcasti128 ymm4, OWORD PTR [r15+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 11
- vbroadcasti128 ymm4, [r15+160]
+ vbroadcasti128 ymm4, OWORD PTR [r15+160]
jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+176]
+ vbroadcasti128 ymm4, OWORD PTR [r15+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 13
- vbroadcasti128 ymm4, [r15+192]
+ vbroadcasti128 ymm4, OWORD PTR [r15+192]
jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+208]
+ vbroadcasti128 ymm4, OWORD PTR [r15+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+224]
+ vbroadcasti128 ymm4, OWORD PTR [r15+224]
L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -17914,7 +17857,7 @@ L_AES_GCM_encrypt_vaes_after_256:
lea rcx, QWORD PTR [rsi+rbx]
mov QWORD PTR [rsp+544], rcx
vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
- vbroadcasti128 ymm4, [rsp+512]
+ vbroadcasti128 ymm4, OWORD PTR [rsp+512]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -17926,81 +17869,81 @@ L_AES_GCM_encrypt_vaes_after_256:
vmovdqu xmm7, OWORD PTR [rsp+512]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [rsp+512], xmm7
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+16]
+ vbroadcasti128 ymm4, OWORD PTR [r15+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+32]
+ vbroadcasti128 ymm4, OWORD PTR [r15+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+48]
+ vbroadcasti128 ymm4, OWORD PTR [r15+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+64]
+ vbroadcasti128 ymm4, OWORD PTR [r15+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+80]
+ vbroadcasti128 ymm4, OWORD PTR [r15+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+96]
+ vbroadcasti128 ymm4, OWORD PTR [r15+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+112]
+ vbroadcasti128 ymm4, OWORD PTR [r15+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+128]
+ vbroadcasti128 ymm4, OWORD PTR [r15+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+144]
+ vbroadcasti128 ymm4, OWORD PTR [r15+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 11
- vbroadcasti128 ymm4, [r15+160]
+ vbroadcasti128 ymm4, OWORD PTR [r15+160]
jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+176]
+ vbroadcasti128 ymm4, OWORD PTR [r15+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 13
- vbroadcasti128 ymm4, [r15+192]
+ vbroadcasti128 ymm4, OWORD PTR [r15+192]
jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+208]
+ vbroadcasti128 ymm4, OWORD PTR [r15+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+224]
+ vbroadcasti128 ymm4, OWORD PTR [r15+224]
L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -19190,7 +19133,7 @@ L_AES_GCM_decrypt_vaes_loop_256:
vextracti128 xmm0, ymm13, 1
vpxor xmm15, xmm13, xmm0
vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
- vbroadcasti128 ymm4, [rsp+512]
+ vbroadcasti128 ymm4, OWORD PTR [rsp+512]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -19202,81 +19145,81 @@ L_AES_GCM_decrypt_vaes_loop_256:
vmovdqu xmm7, OWORD PTR [rsp+512]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [rsp+512], xmm7
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+16]
+ vbroadcasti128 ymm4, OWORD PTR [r15+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+32]
+ vbroadcasti128 ymm4, OWORD PTR [r15+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+48]
+ vbroadcasti128 ymm4, OWORD PTR [r15+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+64]
+ vbroadcasti128 ymm4, OWORD PTR [r15+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+80]
+ vbroadcasti128 ymm4, OWORD PTR [r15+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+96]
+ vbroadcasti128 ymm4, OWORD PTR [r15+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+112]
+ vbroadcasti128 ymm4, OWORD PTR [r15+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+128]
+ vbroadcasti128 ymm4, OWORD PTR [r15+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+144]
+ vbroadcasti128 ymm4, OWORD PTR [r15+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 11
- vbroadcasti128 ymm4, [r15+160]
+ vbroadcasti128 ymm4, OWORD PTR [r15+160]
jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+176]
+ vbroadcasti128 ymm4, OWORD PTR [r15+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 13
- vbroadcasti128 ymm4, [r15+192]
+ vbroadcasti128 ymm4, OWORD PTR [r15+192]
jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+208]
+ vbroadcasti128 ymm4, OWORD PTR [r15+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+224]
+ vbroadcasti128 ymm4, OWORD PTR [r15+224]
L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -19297,7 +19240,7 @@ L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last:
vpxor ymm3, ymm3, ymm5
vmovdqu YMMWORD PTR [rdx+96], ymm3
add ebx, 128
- vbroadcasti128 ymm4, [rsp+512]
+ vbroadcasti128 ymm4, OWORD PTR [rsp+512]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -19309,81 +19252,81 @@ L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last:
vmovdqu xmm7, OWORD PTR [rsp+512]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [rsp+512], xmm7
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+16]
+ vbroadcasti128 ymm4, OWORD PTR [r15+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+32]
+ vbroadcasti128 ymm4, OWORD PTR [r15+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+48]
+ vbroadcasti128 ymm4, OWORD PTR [r15+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+64]
+ vbroadcasti128 ymm4, OWORD PTR [r15+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+80]
+ vbroadcasti128 ymm4, OWORD PTR [r15+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+96]
+ vbroadcasti128 ymm4, OWORD PTR [r15+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+112]
+ vbroadcasti128 ymm4, OWORD PTR [r15+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+128]
+ vbroadcasti128 ymm4, OWORD PTR [r15+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+144]
+ vbroadcasti128 ymm4, OWORD PTR [r15+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 11
- vbroadcasti128 ymm4, [r15+160]
+ vbroadcasti128 ymm4, OWORD PTR [r15+160]
jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+176]
+ vbroadcasti128 ymm4, OWORD PTR [r15+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 13
- vbroadcasti128 ymm4, [r15+192]
+ vbroadcasti128 ymm4, OWORD PTR [r15+192]
jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+208]
+ vbroadcasti128 ymm4, OWORD PTR [r15+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+224]
+ vbroadcasti128 ymm4, OWORD PTR [r15+224]
L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -19475,7 +19418,7 @@ L_AES_GCM_decrypt_vaes_after_256:
vextracti128 xmm0, ymm13, 1
vpxor xmm15, xmm13, xmm0
vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
- vbroadcasti128 ymm4, [rsp+512]
+ vbroadcasti128 ymm4, OWORD PTR [rsp+512]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -19487,81 +19430,81 @@ L_AES_GCM_decrypt_vaes_after_256:
vmovdqu xmm7, OWORD PTR [rsp+512]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [rsp+512], xmm7
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+16]
+ vbroadcasti128 ymm4, OWORD PTR [r15+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+32]
+ vbroadcasti128 ymm4, OWORD PTR [r15+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+48]
+ vbroadcasti128 ymm4, OWORD PTR [r15+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+64]
+ vbroadcasti128 ymm4, OWORD PTR [r15+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+80]
+ vbroadcasti128 ymm4, OWORD PTR [r15+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+96]
+ vbroadcasti128 ymm4, OWORD PTR [r15+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+112]
+ vbroadcasti128 ymm4, OWORD PTR [r15+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+128]
+ vbroadcasti128 ymm4, OWORD PTR [r15+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+144]
+ vbroadcasti128 ymm4, OWORD PTR [r15+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 11
- vbroadcasti128 ymm4, [r15+160]
+ vbroadcasti128 ymm4, OWORD PTR [r15+160]
jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+176]
+ vbroadcasti128 ymm4, OWORD PTR [r15+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r10d, 13
- vbroadcasti128 ymm4, [r15+192]
+ vbroadcasti128 ymm4, OWORD PTR [r15+192]
jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+208]
+ vbroadcasti128 ymm4, OWORD PTR [r15+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [r15+224]
+ vbroadcasti128 ymm4, OWORD PTR [r15+224]
L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -20618,7 +20561,7 @@ L_AES_GCM_encrypt_update_vaes_loop_256:
; 256 bytes of input
lea rsi, QWORD PTR [r10+rdi]
vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -20630,81 +20573,81 @@ L_AES_GCM_encrypt_update_vaes_loop_256:
vmovdqu xmm7, OWORD PTR [r15]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [r15], xmm7
- vbroadcasti128 ymm4, [rax]
+ vbroadcasti128 ymm4, OWORD PTR [rax]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+16]
+ vbroadcasti128 ymm4, OWORD PTR [rax+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+32]
+ vbroadcasti128 ymm4, OWORD PTR [rax+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+48]
+ vbroadcasti128 ymm4, OWORD PTR [rax+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+64]
+ vbroadcasti128 ymm4, OWORD PTR [rax+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+80]
+ vbroadcasti128 ymm4, OWORD PTR [rax+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+96]
+ vbroadcasti128 ymm4, OWORD PTR [rax+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+112]
+ vbroadcasti128 ymm4, OWORD PTR [rax+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+128]
+ vbroadcasti128 ymm4, OWORD PTR [rax+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+144]
+ vbroadcasti128 ymm4, OWORD PTR [rax+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 11
- vbroadcasti128 ymm4, [rax+160]
+ vbroadcasti128 ymm4, OWORD PTR [rax+160]
jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+176]
+ vbroadcasti128 ymm4, OWORD PTR [rax+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 13
- vbroadcasti128 ymm4, [rax+192]
+ vbroadcasti128 ymm4, OWORD PTR [rax+192]
jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+208]
+ vbroadcasti128 ymm4, OWORD PTR [rax+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+224]
+ vbroadcasti128 ymm4, OWORD PTR [rax+224]
L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -20725,7 +20668,7 @@ L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last:
vpxor ymm3, ymm3, ymm5
vmovdqu YMMWORD PTR [rdx+96], ymm3
add edi, 128
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -20737,81 +20680,81 @@ L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last:
vmovdqu xmm7, OWORD PTR [r15]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [r15], xmm7
- vbroadcasti128 ymm4, [rax]
+ vbroadcasti128 ymm4, OWORD PTR [rax]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+16]
+ vbroadcasti128 ymm4, OWORD PTR [rax+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+32]
+ vbroadcasti128 ymm4, OWORD PTR [rax+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+48]
+ vbroadcasti128 ymm4, OWORD PTR [rax+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+64]
+ vbroadcasti128 ymm4, OWORD PTR [rax+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+80]
+ vbroadcasti128 ymm4, OWORD PTR [rax+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+96]
+ vbroadcasti128 ymm4, OWORD PTR [rax+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+112]
+ vbroadcasti128 ymm4, OWORD PTR [rax+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+128]
+ vbroadcasti128 ymm4, OWORD PTR [rax+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+144]
+ vbroadcasti128 ymm4, OWORD PTR [rax+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 11
- vbroadcasti128 ymm4, [rax+160]
+ vbroadcasti128 ymm4, OWORD PTR [rax+160]
jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+176]
+ vbroadcasti128 ymm4, OWORD PTR [rax+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 13
- vbroadcasti128 ymm4, [rax+192]
+ vbroadcasti128 ymm4, OWORD PTR [rax+192]
jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+208]
+ vbroadcasti128 ymm4, OWORD PTR [rax+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+224]
+ vbroadcasti128 ymm4, OWORD PTR [rax+224]
L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -20943,7 +20886,7 @@ L_AES_GCM_encrypt_update_vaes_after_256:
; 128 bytes of input
lea rsi, QWORD PTR [r10+rdi]
vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -20955,81 +20898,81 @@ L_AES_GCM_encrypt_update_vaes_after_256:
vmovdqu xmm7, OWORD PTR [r15]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [r15], xmm7
- vbroadcasti128 ymm4, [rax]
+ vbroadcasti128 ymm4, OWORD PTR [rax]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+16]
+ vbroadcasti128 ymm4, OWORD PTR [rax+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+32]
+ vbroadcasti128 ymm4, OWORD PTR [rax+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+48]
+ vbroadcasti128 ymm4, OWORD PTR [rax+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+64]
+ vbroadcasti128 ymm4, OWORD PTR [rax+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+80]
+ vbroadcasti128 ymm4, OWORD PTR [rax+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+96]
+ vbroadcasti128 ymm4, OWORD PTR [rax+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+112]
+ vbroadcasti128 ymm4, OWORD PTR [rax+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+128]
+ vbroadcasti128 ymm4, OWORD PTR [rax+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+144]
+ vbroadcasti128 ymm4, OWORD PTR [rax+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 11
- vbroadcasti128 ymm4, [rax+160]
+ vbroadcasti128 ymm4, OWORD PTR [rax+160]
jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+176]
+ vbroadcasti128 ymm4, OWORD PTR [rax+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 13
- vbroadcasti128 ymm4, [rax+192]
+ vbroadcasti128 ymm4, OWORD PTR [rax+192]
jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+208]
+ vbroadcasti128 ymm4, OWORD PTR [rax+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+224]
+ vbroadcasti128 ymm4, OWORD PTR [rax+224]
L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -21778,7 +21721,7 @@ L_AES_GCM_decrypt_update_vaes_loop_256:
vextracti128 xmm0, ymm13, 1
vpxor xmm15, xmm13, xmm0
vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -21790,81 +21733,81 @@ L_AES_GCM_decrypt_update_vaes_loop_256:
vmovdqu xmm7, OWORD PTR [r15]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [r15], xmm7
- vbroadcasti128 ymm4, [rax]
+ vbroadcasti128 ymm4, OWORD PTR [rax]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+16]
+ vbroadcasti128 ymm4, OWORD PTR [rax+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+32]
+ vbroadcasti128 ymm4, OWORD PTR [rax+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+48]
+ vbroadcasti128 ymm4, OWORD PTR [rax+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+64]
+ vbroadcasti128 ymm4, OWORD PTR [rax+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+80]
+ vbroadcasti128 ymm4, OWORD PTR [rax+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+96]
+ vbroadcasti128 ymm4, OWORD PTR [rax+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+112]
+ vbroadcasti128 ymm4, OWORD PTR [rax+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+128]
+ vbroadcasti128 ymm4, OWORD PTR [rax+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+144]
+ vbroadcasti128 ymm4, OWORD PTR [rax+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 11
- vbroadcasti128 ymm4, [rax+160]
+ vbroadcasti128 ymm4, OWORD PTR [rax+160]
jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+176]
+ vbroadcasti128 ymm4, OWORD PTR [rax+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 13
- vbroadcasti128 ymm4, [rax+192]
+ vbroadcasti128 ymm4, OWORD PTR [rax+192]
jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+208]
+ vbroadcasti128 ymm4, OWORD PTR [rax+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+224]
+ vbroadcasti128 ymm4, OWORD PTR [rax+224]
L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -21885,7 +21828,7 @@ L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last:
vpxor ymm3, ymm3, ymm5
vmovdqu YMMWORD PTR [rdx+96], ymm3
add edi, 128
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -21897,81 +21840,81 @@ L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last:
vmovdqu xmm7, OWORD PTR [r15]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [r15], xmm7
- vbroadcasti128 ymm4, [rax]
+ vbroadcasti128 ymm4, OWORD PTR [rax]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+16]
+ vbroadcasti128 ymm4, OWORD PTR [rax+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+32]
+ vbroadcasti128 ymm4, OWORD PTR [rax+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+48]
+ vbroadcasti128 ymm4, OWORD PTR [rax+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+64]
+ vbroadcasti128 ymm4, OWORD PTR [rax+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+80]
+ vbroadcasti128 ymm4, OWORD PTR [rax+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+96]
+ vbroadcasti128 ymm4, OWORD PTR [rax+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+112]
+ vbroadcasti128 ymm4, OWORD PTR [rax+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+128]
+ vbroadcasti128 ymm4, OWORD PTR [rax+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+144]
+ vbroadcasti128 ymm4, OWORD PTR [rax+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 11
- vbroadcasti128 ymm4, [rax+160]
+ vbroadcasti128 ymm4, OWORD PTR [rax+160]
jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+176]
+ vbroadcasti128 ymm4, OWORD PTR [rax+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 13
- vbroadcasti128 ymm4, [rax+192]
+ vbroadcasti128 ymm4, OWORD PTR [rax+192]
jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+208]
+ vbroadcasti128 ymm4, OWORD PTR [rax+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+224]
+ vbroadcasti128 ymm4, OWORD PTR [rax+224]
L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -22063,7 +22006,7 @@ L_AES_GCM_decrypt_update_vaes_after_256:
vextracti128 xmm0, ymm13, 1
vpxor xmm15, xmm13, xmm0
vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
- vbroadcasti128 ymm4, [r15]
+ vbroadcasti128 ymm4, OWORD PTR [r15]
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
vpshufb ymm0, ymm0, ymm6
vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
@@ -22075,81 +22018,81 @@ L_AES_GCM_decrypt_update_vaes_after_256:
vmovdqu xmm7, OWORD PTR [r15]
vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
vmovdqu OWORD PTR [r15], xmm7
- vbroadcasti128 ymm4, [rax]
+ vbroadcasti128 ymm4, OWORD PTR [rax]
vpxor ymm0, ymm0, ymm4
vpxor ymm1, ymm1, ymm4
vpxor ymm2, ymm2, ymm4
vpxor ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+16]
+ vbroadcasti128 ymm4, OWORD PTR [rax+16]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+32]
+ vbroadcasti128 ymm4, OWORD PTR [rax+32]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+48]
+ vbroadcasti128 ymm4, OWORD PTR [rax+48]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+64]
+ vbroadcasti128 ymm4, OWORD PTR [rax+64]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+80]
+ vbroadcasti128 ymm4, OWORD PTR [rax+80]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+96]
+ vbroadcasti128 ymm4, OWORD PTR [rax+96]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+112]
+ vbroadcasti128 ymm4, OWORD PTR [rax+112]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+128]
+ vbroadcasti128 ymm4, OWORD PTR [rax+128]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+144]
+ vbroadcasti128 ymm4, OWORD PTR [rax+144]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 11
- vbroadcasti128 ymm4, [rax+160]
+ vbroadcasti128 ymm4, OWORD PTR [rax+160]
jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+176]
+ vbroadcasti128 ymm4, OWORD PTR [rax+176]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
cmp r8d, 13
- vbroadcasti128 ymm4, [rax+192]
+ vbroadcasti128 ymm4, OWORD PTR [rax+192]
jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+208]
+ vbroadcasti128 ymm4, OWORD PTR [rax+208]
vaesenc ymm0, ymm0, ymm4
vaesenc ymm1, ymm1, ymm4
vaesenc ymm2, ymm2, ymm4
vaesenc ymm3, ymm3, ymm4
- vbroadcasti128 ymm4, [rax+224]
+ vbroadcasti128 ymm4, OWORD PTR [rax+224]
L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last:
vaesenclast ymm0, ymm0, ymm4
vaesenclast ymm1, ymm1, ymm4
@@ -22372,56 +22315,49 @@ ENDIF
IFDEF HAVE_INTEL_AVX512
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_inc_z0 QWORD \
- 0000000000000000h, 0000000000000000h,
- 0000000000000000h, 0000000000000001h,
- 0000000000000000h, 0000000000000002h,
- 0000000000000000h, 0000000000000003h
+L_avx512_aes_gcm_inc_z0 QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000001h
+ QWORD 0000000000000000h, 0000000000000002h
+ QWORD 0000000000000000h, 0000000000000003h
ptr_L_avx512_aes_gcm_inc_z0 QWORD L_avx512_aes_gcm_inc_z0
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_inc_z1 QWORD \
- 0000000000000000h, 0000000000000004h,
- 0000000000000000h, 0000000000000005h,
- 0000000000000000h, 0000000000000006h,
- 0000000000000000h, 0000000000000007h
+L_avx512_aes_gcm_inc_z1 QWORD 0000000000000000h, 0000000000000004h
+ QWORD 0000000000000000h, 0000000000000005h
+ QWORD 0000000000000000h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000007h
ptr_L_avx512_aes_gcm_inc_z1 QWORD L_avx512_aes_gcm_inc_z1
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_inc_z2 QWORD \
- 0000000000000000h, 0000000000000008h,
- 0000000000000000h, 0000000000000009h,
- 0000000000000000h, 000000000000000ah,
- 0000000000000000h, 000000000000000bh
+L_avx512_aes_gcm_inc_z2 QWORD 0000000000000000h, 0000000000000008h
+ QWORD 0000000000000000h, 0000000000000009h
+ QWORD 0000000000000000h, 000000000000000ah
+ QWORD 0000000000000000h, 000000000000000bh
ptr_L_avx512_aes_gcm_inc_z2 QWORD L_avx512_aes_gcm_inc_z2
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_inc_z3 QWORD \
- 0000000000000000h, 000000000000000ch,
- 0000000000000000h, 000000000000000dh,
- 0000000000000000h, 000000000000000eh,
- 0000000000000000h, 000000000000000fh
+L_avx512_aes_gcm_inc_z3 QWORD 0000000000000000h, 000000000000000ch
+ QWORD 0000000000000000h, 000000000000000dh
+ QWORD 0000000000000000h, 000000000000000eh
+ QWORD 0000000000000000h, 000000000000000fh
ptr_L_avx512_aes_gcm_inc_z3 QWORD L_avx512_aes_gcm_inc_z3
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_sixteen QWORD \
- 0000000000000000h, 0000000000000010h
+L_avx512_aes_gcm_sixteen QWORD 0000000000000000h, 0000000000000010h
ptr_L_avx512_aes_gcm_sixteen QWORD L_avx512_aes_gcm_sixteen
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_GCM_generate_m0_avx512_rev8 QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_GCM_generate_m0_avx512_rev8 QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_GCM_generate_m0_avx512_rev8 QWORD L_GCM_generate_m0_avx512_rev8
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_GCM_generate_m0_avx512_mod2_128 QWORD \
- 0000000000000000h, 0e100000000000000h
+L_GCM_generate_m0_avx512_mod2_128 QWORD 0000000000000000h, 0e100000000000000h
ptr_L_GCM_generate_m0_avx512_mod2_128 QWORD L_GCM_generate_m0_avx512_mod2_128
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -22635,68 +22571,57 @@ GCM_generate_m0_avx512 ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_one QWORD \
- 0000000000000000h, 0000000000000001h
+L_avx512_aes_gcm_one QWORD 0000000000000000h, 0000000000000001h
ptr_L_avx512_aes_gcm_one QWORD L_avx512_aes_gcm_one
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_two QWORD \
- 0000000000000000h, 0000000000000002h
+L_avx512_aes_gcm_two QWORD 0000000000000000h, 0000000000000002h
ptr_L_avx512_aes_gcm_two QWORD L_avx512_aes_gcm_two
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_three QWORD \
- 0000000000000000h, 0000000000000003h
+L_avx512_aes_gcm_three QWORD 0000000000000000h, 0000000000000003h
ptr_L_avx512_aes_gcm_three QWORD L_avx512_aes_gcm_three
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_four QWORD \
- 0000000000000000h, 0000000000000004h
+L_avx512_aes_gcm_four QWORD 0000000000000000h, 0000000000000004h
ptr_L_avx512_aes_gcm_four QWORD L_avx512_aes_gcm_four
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_five QWORD \
- 0000000000000000h, 0000000000000005h
+L_avx512_aes_gcm_five QWORD 0000000000000000h, 0000000000000005h
ptr_L_avx512_aes_gcm_five QWORD L_avx512_aes_gcm_five
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_six QWORD \
- 0000000000000000h, 0000000000000006h
+L_avx512_aes_gcm_six QWORD 0000000000000000h, 0000000000000006h
ptr_L_avx512_aes_gcm_six QWORD L_avx512_aes_gcm_six
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_seven QWORD \
- 0000000000000000h, 0000000000000007h
+L_avx512_aes_gcm_seven QWORD 0000000000000000h, 0000000000000007h
ptr_L_avx512_aes_gcm_seven QWORD L_avx512_aes_gcm_seven
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_eight QWORD \
- 0000000000000000h, 0000000000000008h
+L_avx512_aes_gcm_eight QWORD 0000000000000000h, 0000000000000008h
ptr_L_avx512_aes_gcm_eight QWORD L_avx512_aes_gcm_eight
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_bswap_epi64 QWORD \
- 0001020304050607h, 08090a0b0c0d0e0fh
+L_avx512_aes_gcm_bswap_epi64 QWORD 0001020304050607h, 08090a0b0c0d0e0fh
ptr_L_avx512_aes_gcm_bswap_epi64 QWORD L_avx512_aes_gcm_bswap_epi64
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_bswap_mask QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_avx512_aes_gcm_bswap_mask QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_avx512_aes_gcm_bswap_mask QWORD L_avx512_aes_gcm_bswap_mask
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_gcm_mod2_128 QWORD \
- 0000000000000001h, 0c200000000000000h
+L_avx512_aes_gcm_mod2_128 QWORD 0000000000000001h, 0c200000000000000h
ptr_L_avx512_aes_gcm_mod2_128 QWORD L_avx512_aes_gcm_mod2_128
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -23692,16 +23617,16 @@ L_AES_GCM_encrypt_avx512_no_ext:
vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64
vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask
vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
- vbroadcasti32x4 zmm9, [r15]
- vbroadcasti32x4 zmm10, [r15+16]
- vbroadcasti32x4 zmm11, [r15+32]
- vbroadcasti32x4 zmm12, [r15+48]
- vbroadcasti32x4 zmm13, [r15+64]
- vbroadcasti32x4 zmm14, [r15+80]
- vbroadcasti32x4 zmm15, [r15+96]
- vbroadcasti32x4 zmm1, [r15+112]
- vbroadcasti32x4 zmm2, [r15+128]
- vbroadcasti32x4 zmm3, [r15+144]
+ vbroadcasti32x4 zmm9, OWORD PTR [r15]
+ vbroadcasti32x4 zmm10, OWORD PTR [r15+16]
+ vbroadcasti32x4 zmm11, OWORD PTR [r15+32]
+ vbroadcasti32x4 zmm12, OWORD PTR [r15+48]
+ vbroadcasti32x4 zmm13, OWORD PTR [r15+64]
+ vbroadcasti32x4 zmm14, OWORD PTR [r15+80]
+ vbroadcasti32x4 zmm15, OWORD PTR [r15+96]
+ vbroadcasti32x4 zmm1, OWORD PTR [r15+112]
+ vbroadcasti32x4 zmm2, OWORD PTR [r15+128]
+ vbroadcasti32x4 zmm3, OWORD PTR [r15+144]
cmp r9d, 512
jl L_AES_GCM_encrypt_avx512_no_windows
mov r13d, r9d
@@ -23733,7 +23658,7 @@ L_AES_GCM_encrypt_avx512_no_ext:
; 512 bytes of input
lea rcx, QWORD PTR [rsi+rbx]
mov QWORD PTR [rsp+1056], rcx
- vbroadcasti32x4 zmm20, [rsp+1024]
+ vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -23786,30 +23711,30 @@ L_AES_GCM_encrypt_avx512_no_ext:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r10d, 11
- vbroadcasti32x4 zmm20, [r15+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+160]
jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r10d, 13
- vbroadcasti32x4 zmm20, [r15+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+192]
jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+224]
L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -23830,7 +23755,7 @@ L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last:
vpxorq zmm19, zmm19, zmm21
vmovdqu64 [rdx+192], zmm19
add ebx, 256
- vbroadcasti32x4 zmm20, [rsp+1024]
+ vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -23883,30 +23808,30 @@ L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r10d, 11
- vbroadcasti32x4 zmm20, [r15+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+160]
jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r10d, 13
- vbroadcasti32x4 zmm20, [r15+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+192]
jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+224]
L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -23935,7 +23860,7 @@ L_AES_GCM_encrypt_avx512_win_loop:
mov r12, QWORD PTR [rsp+1056]
vpxorq zmm21, zmm21, zmm21
vinserti32x4 zmm21, zmm21, xmm6, 0
- vbroadcasti32x4 zmm20, [rsp+1024]
+ vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -24025,30 +23950,30 @@ L_AES_GCM_encrypt_avx512_win_loop:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r10d, 11
- vbroadcasti32x4 zmm20, [r15+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+160]
jl L_AES_GCM_encrypt_avx512_a_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r10d, 13
- vbroadcasti32x4 zmm20, [r15+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+192]
jl L_AES_GCM_encrypt_avx512_a_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+224]
L_AES_GCM_encrypt_avx512_a_il_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -24069,7 +23994,7 @@ L_AES_GCM_encrypt_avx512_a_il_last:
vpxorq zmm19, zmm19, zmm21
vmovdqu64 [rdx+192], zmm19
add ebx, 256
- vbroadcasti32x4 zmm20, [rsp+1024]
+ vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -24158,30 +24083,30 @@ L_AES_GCM_encrypt_avx512_a_il_last:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r10d, 11
- vbroadcasti32x4 zmm20, [r15+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+160]
jl L_AES_GCM_encrypt_avx512_b_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r10d, 13
- vbroadcasti32x4 zmm20, [r15+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+192]
jl L_AES_GCM_encrypt_avx512_b_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+224]
L_AES_GCM_encrypt_avx512_b_il_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -24328,7 +24253,7 @@ L_AES_GCM_encrypt_avx512_no_windows:
cmp ebx, r13d
jge L_AES_GCM_encrypt_avx512_after_256
; 256 bytes of input
- vbroadcasti32x4 zmm20, [rsp+1024]
+ vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -24381,30 +24306,30 @@ L_AES_GCM_encrypt_avx512_no_windows:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r10d, 11
- vbroadcasti32x4 zmm20, [r15+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+160]
jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r10d, 13
- vbroadcasti32x4 zmm20, [r15+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+192]
jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+224]
L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -24429,7 +24354,7 @@ L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last:
cmp ebx, r13d
jge L_AES_GCM_encrypt_avx512_last_ghash
L_AES_GCM_encrypt_avx512_ghash_128:
- vbroadcasti32x4 zmm20, [rsp+1024]
+ vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -24482,30 +24407,30 @@ L_AES_GCM_encrypt_avx512_ghash_128:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r10d, 11
- vbroadcasti32x4 zmm20, [r15+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+160]
jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r10d, 13
- vbroadcasti32x4 zmm20, [r15+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+192]
jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+224]
L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -25882,16 +25807,16 @@ L_AES_GCM_decrypt_avx512_no_ext:
vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64
vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask
vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
- vbroadcasti32x4 zmm9, [r15]
- vbroadcasti32x4 zmm10, [r15+16]
- vbroadcasti32x4 zmm11, [r15+32]
- vbroadcasti32x4 zmm12, [r15+48]
- vbroadcasti32x4 zmm13, [r15+64]
- vbroadcasti32x4 zmm14, [r15+80]
- vbroadcasti32x4 zmm15, [r15+96]
- vbroadcasti32x4 zmm1, [r15+112]
- vbroadcasti32x4 zmm2, [r15+128]
- vbroadcasti32x4 zmm3, [r15+144]
+ vbroadcasti32x4 zmm9, OWORD PTR [r15]
+ vbroadcasti32x4 zmm10, OWORD PTR [r15+16]
+ vbroadcasti32x4 zmm11, OWORD PTR [r15+32]
+ vbroadcasti32x4 zmm12, OWORD PTR [r15+48]
+ vbroadcasti32x4 zmm13, OWORD PTR [r15+64]
+ vbroadcasti32x4 zmm14, OWORD PTR [r15+80]
+ vbroadcasti32x4 zmm15, OWORD PTR [r15+96]
+ vbroadcasti32x4 zmm1, OWORD PTR [r15+112]
+ vbroadcasti32x4 zmm2, OWORD PTR [r15+128]
+ vbroadcasti32x4 zmm3, OWORD PTR [r15+144]
cmp r9d, 512
jl L_AES_GCM_decrypt_avx512_no_windows
mov r13d, r9d
@@ -26024,7 +25949,7 @@ L_AES_GCM_decrypt_avx512_win_loop:
lea rax, QWORD PTR [rdi+rbx]
vpxorq zmm21, zmm21, zmm21
vinserti32x4 zmm21, zmm21, xmm6, 0
- vbroadcasti32x4 zmm20, [rsp+1024]
+ vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -26114,30 +26039,30 @@ L_AES_GCM_decrypt_avx512_win_loop:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r10d, 11
- vbroadcasti32x4 zmm20, [r15+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+160]
jl L_AES_GCM_decrypt_avx512_a_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r10d, 13
- vbroadcasti32x4 zmm20, [r15+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+192]
jl L_AES_GCM_decrypt_avx512_a_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+224]
L_AES_GCM_decrypt_avx512_a_il_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -26158,7 +26083,7 @@ L_AES_GCM_decrypt_avx512_a_il_last:
vpxorq zmm19, zmm19, zmm21
vmovdqu64 [rdx+192], zmm19
add r12d, 256
- vbroadcasti32x4 zmm20, [rsp+1024]
+ vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -26247,30 +26172,30 @@ L_AES_GCM_decrypt_avx512_a_il_last:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r10d, 11
- vbroadcasti32x4 zmm20, [r15+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+160]
jl L_AES_GCM_decrypt_avx512_b_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r10d, 13
- vbroadcasti32x4 zmm20, [r15+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+192]
jl L_AES_GCM_decrypt_avx512_b_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+224]
L_AES_GCM_decrypt_avx512_b_il_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -26307,7 +26232,7 @@ L_AES_GCM_decrypt_avx512_b_il_last:
cmp ebx, r13d
jl L_AES_GCM_decrypt_avx512_win_loop
L_AES_GCM_decrypt_avx512_last_aes:
- vbroadcasti32x4 zmm20, [rsp+1024]
+ vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -26360,30 +26285,30 @@ L_AES_GCM_decrypt_avx512_last_aes:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r10d, 11
- vbroadcasti32x4 zmm20, [r15+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+160]
jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r10d, 13
- vbroadcasti32x4 zmm20, [r15+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+192]
jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+224]
L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -26404,7 +26329,7 @@ L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last:
vpxorq zmm19, zmm19, zmm21
vmovdqu64 [rdx+192], zmm19
add r12d, 256
- vbroadcasti32x4 zmm20, [rsp+1024]
+ vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -26457,30 +26382,30 @@ L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r10d, 11
- vbroadcasti32x4 zmm20, [r15+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+160]
jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r10d, 13
- vbroadcasti32x4 zmm20, [r15+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+192]
jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+224]
L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -26566,7 +26491,7 @@ L_AES_GCM_decrypt_avx512_no_windows:
vextracti32x4 xmm5, zmm29, 3
vpxorq xmm6, xmm29, xmm0
vpternlogq xmm6, xmm5, xmm4, 150
- vbroadcasti32x4 zmm20, [rsp+1024]
+ vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -26619,30 +26544,30 @@ L_AES_GCM_decrypt_avx512_no_windows:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r10d, 11
- vbroadcasti32x4 zmm20, [r15+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+160]
jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r10d, 13
- vbroadcasti32x4 zmm20, [r15+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+192]
jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [r15+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15+224]
L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -27943,16 +27868,16 @@ L_AES_GCM_encrypt_update_avx512_no_ext:
vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64
vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask
vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
- vbroadcasti32x4 zmm9, [rax]
- vbroadcasti32x4 zmm10, [rax+16]
- vbroadcasti32x4 zmm11, [rax+32]
- vbroadcasti32x4 zmm12, [rax+48]
- vbroadcasti32x4 zmm13, [rax+64]
- vbroadcasti32x4 zmm14, [rax+80]
- vbroadcasti32x4 zmm15, [rax+96]
- vbroadcasti32x4 zmm1, [rax+112]
- vbroadcasti32x4 zmm2, [rax+128]
- vbroadcasti32x4 zmm3, [rax+144]
+ vbroadcasti32x4 zmm9, OWORD PTR [rax]
+ vbroadcasti32x4 zmm10, OWORD PTR [rax+16]
+ vbroadcasti32x4 zmm11, OWORD PTR [rax+32]
+ vbroadcasti32x4 zmm12, OWORD PTR [rax+48]
+ vbroadcasti32x4 zmm13, OWORD PTR [rax+64]
+ vbroadcasti32x4 zmm14, OWORD PTR [rax+80]
+ vbroadcasti32x4 zmm15, OWORD PTR [rax+96]
+ vbroadcasti32x4 zmm1, OWORD PTR [rax+112]
+ vbroadcasti32x4 zmm2, OWORD PTR [rax+128]
+ vbroadcasti32x4 zmm3, OWORD PTR [rax+144]
cmp r9d, 512
jl L_AES_GCM_encrypt_update_avx512_no_windows
mov ebp, r9d
@@ -27983,7 +27908,7 @@ L_AES_GCM_encrypt_update_avx512_no_ext:
vmovdqu64 [rsp+960], zmm26
; 512 bytes of input
lea rsi, QWORD PTR [r10+rdi]
- vbroadcasti32x4 zmm20, [r15]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -28036,30 +27961,30 @@ L_AES_GCM_encrypt_update_avx512_no_ext:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r8d, 11
- vbroadcasti32x4 zmm20, [rax+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+160]
jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r8d, 13
- vbroadcasti32x4 zmm20, [rax+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+192]
jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+224]
L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -28080,7 +28005,7 @@ L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last:
vpxorq zmm19, zmm19, zmm21
vmovdqu64 [rdx+192], zmm19
add edi, 256
- vbroadcasti32x4 zmm20, [r15]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -28133,30 +28058,30 @@ L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r8d, 11
- vbroadcasti32x4 zmm20, [rax+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+160]
jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r8d, 13
- vbroadcasti32x4 zmm20, [rax+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+192]
jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+224]
L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -28183,7 +28108,7 @@ L_AES_GCM_encrypt_update_avx512_win_loop:
lea rbx, QWORD PTR [r10+rdi]
vpxorq zmm21, zmm21, zmm21
vinserti32x4 zmm21, zmm21, xmm6, 0
- vbroadcasti32x4 zmm20, [r15]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -28273,30 +28198,30 @@ L_AES_GCM_encrypt_update_avx512_win_loop:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r8d, 11
- vbroadcasti32x4 zmm20, [rax+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+160]
jl L_AES_GCM_encrypt_update_avx512_a_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r8d, 13
- vbroadcasti32x4 zmm20, [rax+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+192]
jl L_AES_GCM_encrypt_update_avx512_a_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+224]
L_AES_GCM_encrypt_update_avx512_a_il_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -28317,7 +28242,7 @@ L_AES_GCM_encrypt_update_avx512_a_il_last:
vpxorq zmm19, zmm19, zmm21
vmovdqu64 [rdx+192], zmm19
add edi, 256
- vbroadcasti32x4 zmm20, [r15]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -28406,30 +28331,30 @@ L_AES_GCM_encrypt_update_avx512_a_il_last:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r8d, 11
- vbroadcasti32x4 zmm20, [rax+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+160]
jl L_AES_GCM_encrypt_update_avx512_b_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r8d, 13
- vbroadcasti32x4 zmm20, [rax+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+192]
jl L_AES_GCM_encrypt_update_avx512_b_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+224]
L_AES_GCM_encrypt_update_avx512_b_il_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -28574,7 +28499,7 @@ L_AES_GCM_encrypt_update_avx512_no_windows:
cmp edi, r13d
jge L_AES_GCM_encrypt_update_avx512_after_256
; 256 bytes of input
- vbroadcasti32x4 zmm20, [r15]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -28627,30 +28552,30 @@ L_AES_GCM_encrypt_update_avx512_no_windows:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r8d, 11
- vbroadcasti32x4 zmm20, [rax+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+160]
jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r8d, 13
- vbroadcasti32x4 zmm20, [rax+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+192]
jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+224]
L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -28675,7 +28600,7 @@ L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last:
cmp edi, r13d
jge L_AES_GCM_encrypt_update_avx512_last_ghash
L_AES_GCM_encrypt_update_avx512_ghash_128:
- vbroadcasti32x4 zmm20, [r15]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -28728,30 +28653,30 @@ L_AES_GCM_encrypt_update_avx512_ghash_128:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r8d, 11
- vbroadcasti32x4 zmm20, [rax+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+160]
jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r8d, 13
- vbroadcasti32x4 zmm20, [rax+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+192]
jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+224]
L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -29686,16 +29611,16 @@ L_AES_GCM_decrypt_update_avx512_no_ext:
vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64
vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask
vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
- vbroadcasti32x4 zmm9, [rax]
- vbroadcasti32x4 zmm10, [rax+16]
- vbroadcasti32x4 zmm11, [rax+32]
- vbroadcasti32x4 zmm12, [rax+48]
- vbroadcasti32x4 zmm13, [rax+64]
- vbroadcasti32x4 zmm14, [rax+80]
- vbroadcasti32x4 zmm15, [rax+96]
- vbroadcasti32x4 zmm1, [rax+112]
- vbroadcasti32x4 zmm2, [rax+128]
- vbroadcasti32x4 zmm3, [rax+144]
+ vbroadcasti32x4 zmm9, OWORD PTR [rax]
+ vbroadcasti32x4 zmm10, OWORD PTR [rax+16]
+ vbroadcasti32x4 zmm11, OWORD PTR [rax+32]
+ vbroadcasti32x4 zmm12, OWORD PTR [rax+48]
+ vbroadcasti32x4 zmm13, OWORD PTR [rax+64]
+ vbroadcasti32x4 zmm14, OWORD PTR [rax+80]
+ vbroadcasti32x4 zmm15, OWORD PTR [rax+96]
+ vbroadcasti32x4 zmm1, OWORD PTR [rax+112]
+ vbroadcasti32x4 zmm2, OWORD PTR [rax+128]
+ vbroadcasti32x4 zmm3, OWORD PTR [rax+144]
cmp r9d, 512
jl L_AES_GCM_decrypt_update_avx512_no_windows
mov r13d, r9d
@@ -29828,7 +29753,7 @@ L_AES_GCM_decrypt_update_avx512_win_loop:
lea rbx, QWORD PTR [r11+rdi]
vpxorq zmm21, zmm21, zmm21
vinserti32x4 zmm21, zmm21, xmm6, 0
- vbroadcasti32x4 zmm20, [r15]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -29918,30 +29843,30 @@ L_AES_GCM_decrypt_update_avx512_win_loop:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r8d, 11
- vbroadcasti32x4 zmm20, [rax+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+160]
jl L_AES_GCM_decrypt_update_avx512_a_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r8d, 13
- vbroadcasti32x4 zmm20, [rax+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+192]
jl L_AES_GCM_decrypt_update_avx512_a_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+224]
L_AES_GCM_decrypt_update_avx512_a_il_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -29962,7 +29887,7 @@ L_AES_GCM_decrypt_update_avx512_a_il_last:
vpxorq zmm19, zmm19, zmm21
vmovdqu64 [rdx+192], zmm19
add esi, 256
- vbroadcasti32x4 zmm20, [r15]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -30051,30 +29976,30 @@ L_AES_GCM_decrypt_update_avx512_a_il_last:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r8d, 11
- vbroadcasti32x4 zmm20, [rax+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+160]
jl L_AES_GCM_decrypt_update_avx512_b_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r8d, 13
- vbroadcasti32x4 zmm20, [rax+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+192]
jl L_AES_GCM_decrypt_update_avx512_b_il_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+224]
L_AES_GCM_decrypt_update_avx512_b_il_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -30111,7 +30036,7 @@ L_AES_GCM_decrypt_update_avx512_b_il_last:
cmp edi, r13d
jl L_AES_GCM_decrypt_update_avx512_win_loop
L_AES_GCM_decrypt_update_avx512_last_aes:
- vbroadcasti32x4 zmm20, [r15]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -30164,30 +30089,30 @@ L_AES_GCM_decrypt_update_avx512_last_aes:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r8d, 11
- vbroadcasti32x4 zmm20, [rax+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+160]
jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r8d, 13
- vbroadcasti32x4 zmm20, [rax+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+192]
jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+224]
L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -30208,7 +30133,7 @@ L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last:
vpxorq zmm19, zmm19, zmm21
vmovdqu64 [rdx+192], zmm19
add esi, 256
- vbroadcasti32x4 zmm20, [r15]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -30261,30 +30186,30 @@ L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r8d, 11
- vbroadcasti32x4 zmm20, [rax+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+160]
jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r8d, 13
- vbroadcasti32x4 zmm20, [rax+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+192]
jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+224]
L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
@@ -30370,7 +30295,7 @@ L_AES_GCM_decrypt_update_avx512_no_windows:
vextracti32x4 xmm5, zmm29, 3
vpxorq xmm6, xmm29, xmm0
vpternlogq xmm6, xmm5, xmm4, 150
- vbroadcasti32x4 zmm20, [r15]
+ vbroadcasti32x4 zmm20, OWORD PTR [r15]
vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
vpshufb zmm16, zmm16, zmm22
vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
@@ -30423,30 +30348,30 @@ L_AES_GCM_decrypt_update_avx512_no_windows:
vaesenc zmm18, zmm18, zmm3
vaesenc zmm19, zmm19, zmm3
cmp r8d, 11
- vbroadcasti32x4 zmm20, [rax+160]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+160]
jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+176]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
cmp r8d, 13
- vbroadcasti32x4 zmm20, [rax+192]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+192]
jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+208]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+208]
vaesenc zmm16, zmm16, zmm20
vaesenc zmm17, zmm17, zmm20
vaesenc zmm18, zmm18, zmm20
vaesenc zmm19, zmm19, zmm20
- vbroadcasti32x4 zmm20, [rax+224]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+224]
L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last:
vaesenclast zmm16, zmm16, zmm20
vaesenclast zmm17, zmm17, zmm20
diff --git a/wolfcrypt/src/aes_gcm_x86_asm.asm b/wolfcrypt/src/aes_gcm_x86_asm.asm
new file mode 100644
index 00000000000..e5fe2d87eda
--- /dev/null
+++ b/wolfcrypt/src/aes_gcm_x86_asm.asm
@@ -0,0 +1,12921 @@
+; /* aes_gcm_x86_asm
+; *
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
+
+IF @Version LT 1200 ; @Version is MASM's predefined assembler-version symbol; this branch covers assemblers reporting less than 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT ; respect a value already supplied by the build
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT ; respect a value already supplied by the build
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1 ; AVX1 is switched on by default, whatever the assembler version
+HAVE_INTEL_AVX1 = 1 ; presumably gates the AVX1 routines later in the file - confirm
+ENDIF
+IFNDEF NO_AVX2_SUPPORT ; AVX2 is switched on unless it was ruled out above or by the build
+HAVE_INTEL_AVX2 = 1 ; presumably gates the AVX2 routines later in the file - confirm
+ENDIF
+
+IFNDEF _WIN32 ; give _WIN32 a value when the build did not define one
+_WIN32 = 1 ; NOTE(review): no use of _WIN32 is visible in this part of the file
+ENDIF
+
+.686P ; accept the Pentium Pro instruction set, privileged instructions included
+.XMM ; accept SSE (xmm register) instructions
+.MODEL FLAT, C ; 32-bit flat memory model with C naming and calling conventions
+
+_DATA SEGMENT
+ALIGN 16 ; every 16-byte constant in these segments is preceded by ALIGN 16
+L_aes_gcm_one DWORD 00000000h, 00000000h, 00000001h, 00000000h ; dword lane 2 = 1; added with paddd to step the counter by one
+ptr_L_aes_gcm_one QWORD L_aes_gcm_one ; QWORD holding the address of L_aes_gcm_one; the visible code references the constant directly instead
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_two DWORD 00000000h, 00000000h, 00000002h, 00000000h ; dword lane 2 = 2; paddd operand for the counter of the third block in a 4-block batch
+ptr_L_aes_gcm_two QWORD L_aes_gcm_two ; address of L_aes_gcm_two
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_three DWORD 00000000h, 00000000h, 00000003h, 00000000h ; dword lane 2 = 3; paddd operand for the counter of the fourth block in a 4-block batch
+ptr_L_aes_gcm_three QWORD L_aes_gcm_three ; address of L_aes_gcm_three
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_four DWORD 00000000h, 00000000h, 00000004h, 00000000h ; dword lane 2 = 4; paddd operand that advances the saved counter past a 64-byte batch
+ptr_L_aes_gcm_four QWORD L_aes_gcm_four ; address of L_aes_gcm_four
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_bswap_epi64 DWORD 04050607h, 00010203h, 0c0d0e0fh, 08090a0bh ; pshufb mask, byte indices 7..0 then 15..8: reverses the bytes inside each 64-bit half
+ptr_L_aes_gcm_bswap_epi64 QWORD L_aes_gcm_bswap_epi64 ; address of L_aes_gcm_bswap_epi64
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_bswap_mask DWORD 0c0d0e0fh, 08090a0bh, 04050607h, 00010203h ; pshufb mask, byte indices 15..0: reverses all 16 bytes of a block
+ptr_L_aes_gcm_bswap_mask QWORD L_aes_gcm_bswap_mask ; address of L_aes_gcm_bswap_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_mod2_128 DWORD 00000001h, 00000000h, 00000000h, 0c2000000h ; pand operand used when H is shifted left one bit; presumably the GHASH reduction constant - confirm
+ptr_L_aes_gcm_mod2_128 QWORD L_aes_gcm_mod2_128 ; address of L_aes_gcm_mod2_128
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx1_one DWORD 00000000h, 00000000h, 00000001h, 00000000h ; same value as L_aes_gcm_one; presumably for the AVX1 routines (not visible here)
+ptr_L_aes_gcm_avx1_one QWORD L_aes_gcm_avx1_one ; address of L_aes_gcm_avx1_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx1_two DWORD 00000000h, 00000000h, 00000002h, 00000000h ; same value as L_aes_gcm_two
+ptr_L_aes_gcm_avx1_two QWORD L_aes_gcm_avx1_two ; address of L_aes_gcm_avx1_two
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx1_three DWORD 00000000h, 00000000h, 00000003h, 00000000h ; same value as L_aes_gcm_three
+ptr_L_aes_gcm_avx1_three QWORD L_aes_gcm_avx1_three ; address of L_aes_gcm_avx1_three
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx1_four DWORD 00000000h, 00000000h, 00000004h, 00000000h ; same value as L_aes_gcm_four
+ptr_L_aes_gcm_avx1_four QWORD L_aes_gcm_avx1_four ; address of L_aes_gcm_avx1_four
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx1_bswap_epi64 DWORD 04050607h, 00010203h, 0c0d0e0fh, 08090a0bh ; same mask as L_aes_gcm_bswap_epi64 (byte reversal inside each 64-bit half)
+ptr_L_aes_gcm_avx1_bswap_epi64 QWORD L_aes_gcm_avx1_bswap_epi64 ; address of L_aes_gcm_avx1_bswap_epi64
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx1_bswap_mask DWORD 0c0d0e0fh, 08090a0bh, 04050607h, 00010203h ; same mask as L_aes_gcm_bswap_mask (full 16-byte reversal)
+ptr_L_aes_gcm_avx1_bswap_mask QWORD L_aes_gcm_avx1_bswap_mask ; address of L_aes_gcm_avx1_bswap_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx1_mod2_128 DWORD 00000001h, 00000000h, 00000000h, 0c2000000h ; same value as L_aes_gcm_mod2_128
+ptr_L_aes_gcm_avx1_mod2_128 QWORD L_aes_gcm_avx1_mod2_128 ; address of L_aes_gcm_avx1_mod2_128
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx2_one DWORD 00000000h, 00000000h, 00000001h, 00000000h ; same value as L_aes_gcm_one; presumably for the AVX2 routines (not visible here)
+ptr_L_aes_gcm_avx2_one QWORD L_aes_gcm_avx2_one ; address of L_aes_gcm_avx2_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx2_two DWORD 00000000h, 00000000h, 00000002h, 00000000h ; same value as L_aes_gcm_two
+ptr_L_aes_gcm_avx2_two QWORD L_aes_gcm_avx2_two ; address of L_aes_gcm_avx2_two
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx2_three DWORD 00000000h, 00000000h, 00000003h, 00000000h ; same value as L_aes_gcm_three
+ptr_L_aes_gcm_avx2_three QWORD L_aes_gcm_avx2_three ; address of L_aes_gcm_avx2_three
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx2_four DWORD 00000000h, 00000000h, 00000004h, 00000000h ; same value as L_aes_gcm_four
+ptr_L_aes_gcm_avx2_four QWORD L_aes_gcm_avx2_four ; address of L_aes_gcm_avx2_four
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_bswap_one DWORD 00000000h, 00000000h, 00000000h, 01000000h ; only byte 15 is set (01h); presumably a one for a counter kept in byte-swapped form - confirm
+ptr_L_avx2_aes_gcm_bswap_one QWORD L_avx2_aes_gcm_bswap_one ; address of L_avx2_aes_gcm_bswap_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx2_bswap_epi64 DWORD 04050607h, 00010203h, 0c0d0e0fh, 08090a0bh ; same mask as L_aes_gcm_bswap_epi64 (byte reversal inside each 64-bit half)
+ptr_L_aes_gcm_avx2_bswap_epi64 QWORD L_aes_gcm_avx2_bswap_epi64 ; address of L_aes_gcm_avx2_bswap_epi64
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx2_bswap_mask DWORD 0c0d0e0fh, 08090a0bh, 04050607h, 00010203h ; same mask as L_aes_gcm_bswap_mask (full 16-byte reversal)
+ptr_L_aes_gcm_avx2_bswap_mask QWORD L_aes_gcm_avx2_bswap_mask ; address of L_aes_gcm_avx2_bswap_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_avx2_mod2_128 DWORD 00000001h, 00000000h, 00000000h, 0c2000000h ; same value as L_aes_gcm_mod2_128
+ptr_L_aes_gcm_avx2_mod2_128 QWORD L_aes_gcm_avx2_mod2_128 ; address of L_aes_gcm_avx2_mod2_128
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_aesni PROC
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 112
+ mov esi, DWORD PTR [esp+144]
+ mov ebp, DWORD PTR [esp+168]
+ mov edx, DWORD PTR [esp+160]
+ pxor xmm0, xmm0
+ pxor xmm2, xmm2
+ cmp edx, 12
+ jne L_AES_GCM_encrypt_aesni_iv_not_12
+ ; # Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ mov ecx, 16777216
+ pinsrd xmm0, DWORD PTR [esi], 0
+ pinsrd xmm0, DWORD PTR [esi+4], 1
+ pinsrd xmm0, DWORD PTR [esi+8], 2
+ pinsrd xmm0, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ movdqa xmm5, xmm0
+ movdqa xmm1, OWORD PTR [ebp]
+ pxor xmm5, xmm1
+ movdqa xmm3, OWORD PTR [ebp+16]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+32]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+48]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+64]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+80]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+96]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+112]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+128]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+144]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ cmp DWORD PTR [esp+172], 11
+ movdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_aesni_calc_iv_12_last
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+176]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ cmp DWORD PTR [esp+172], 13
+ movdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_aesni_calc_iv_12_last
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+208]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_aesni_calc_iv_12_last:
+ aesenclast xmm1, xmm3
+ aesenclast xmm5, xmm3
+ pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu OWORD PTR [esp+80], xmm5
+ jmp L_AES_GCM_encrypt_aesni_iv_done
+L_AES_GCM_encrypt_aesni_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ movdqa xmm1, OWORD PTR [ebp]
+ aesenc xmm1, [ebp+16]
+ aesenc xmm1, [ebp+32]
+ aesenc xmm1, [ebp+48]
+ aesenc xmm1, [ebp+64]
+ aesenc xmm1, [ebp+80]
+ aesenc xmm1, [ebp+96]
+ aesenc xmm1, [ebp+112]
+ aesenc xmm1, [ebp+128]
+ aesenc xmm1, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ movdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_aesni_calc_iv_1_aesenc_avx_last
+ aesenc xmm1, xmm5
+ aesenc xmm1, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ movdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_aesni_calc_iv_1_aesenc_avx_last
+ aesenc xmm1, xmm5
+ aesenc xmm1, [ebp+208]
+ movdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_aesni_calc_iv_1_aesenc_avx_last:
+ aesenclast xmm1, xmm5
+ pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ cmp edx, 0
+ mov ecx, 0
+ je L_AES_GCM_encrypt_aesni_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_encrypt_aesni_calc_iv_lt16
+ and edx, 4294967280
+L_AES_GCM_encrypt_aesni_calc_iv_16_loop:
+ movdqu xmm4, OWORD PTR [esi+ecx]
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm4
+ pshufd xmm5, xmm0, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm0, 17
+ pclmulqdq xmm4, xmm0, 0
+ pxor xmm5, xmm0
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm3, xmm6
+ pxor xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm0
+ psrld xmm4, 31
+ psrld xmm5, 31
+ pslld xmm3, 1
+ pslld xmm0, 1
+ movdqa xmm6, xmm4
+ pslldq xmm4, 4
+ psrldq xmm6, 12
+ pslldq xmm5, 4
+ por xmm0, xmm6
+ por xmm3, xmm4
+ por xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm3
+ pslld xmm4, 31
+ pslld xmm5, 30
+ pslld xmm6, 25
+ pxor xmm4, xmm5
+ pxor xmm4, xmm6
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ pslldq xmm4, 12
+ pxor xmm3, xmm4
+ movdqa xmm6, xmm3
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ psrld xmm6, 1
+ psrld xmm7, 2
+ psrld xmm4, 7
+ pxor xmm6, xmm7
+ pxor xmm6, xmm4
+ pxor xmm6, xmm5
+ pxor xmm6, xmm3
+ pxor xmm0, xmm6
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_aesni_calc_iv_16_loop
+ mov edx, DWORD PTR [esp+160]
+ cmp ecx, edx
+ je L_AES_GCM_encrypt_aesni_calc_iv_done
+L_AES_GCM_encrypt_aesni_calc_iv_lt16:
+ sub esp, 16
+ pxor xmm4, xmm4
+ xor ebx, ebx
+ movdqu OWORD PTR [esp], xmm4
+L_AES_GCM_encrypt_aesni_calc_iv_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_aesni_calc_iv_loop
+ movdqu xmm4, OWORD PTR [esp]
+ add esp, 16
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm4
+ pshufd xmm5, xmm0, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm0, 17
+ pclmulqdq xmm4, xmm0, 0
+ pxor xmm5, xmm0
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm3, xmm6
+ pxor xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm0
+ psrld xmm4, 31
+ psrld xmm5, 31
+ pslld xmm3, 1
+ pslld xmm0, 1
+ movdqa xmm6, xmm4
+ pslldq xmm4, 4
+ psrldq xmm6, 12
+ pslldq xmm5, 4
+ por xmm0, xmm6
+ por xmm3, xmm4
+ por xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm3
+ pslld xmm4, 31
+ pslld xmm5, 30
+ pslld xmm6, 25
+ pxor xmm4, xmm5
+ pxor xmm4, xmm6
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ pslldq xmm4, 12
+ pxor xmm3, xmm4
+ movdqa xmm6, xmm3
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ psrld xmm6, 1
+ psrld xmm7, 2
+ psrld xmm4, 7
+ pxor xmm6, xmm7
+ pxor xmm6, xmm4
+ pxor xmm6, xmm5
+ pxor xmm6, xmm3
+ pxor xmm0, xmm6
+L_AES_GCM_encrypt_aesni_calc_iv_done:
+ ; T = Encrypt counter
+ pxor xmm4, xmm4
+ shl edx, 3
+ pinsrd xmm4, edx, 0
+ pxor xmm0, xmm4
+ pshufd xmm5, xmm0, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm0, 17
+ pclmulqdq xmm4, xmm0, 0
+ pxor xmm5, xmm0
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm3, xmm6
+ pxor xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm0
+ psrld xmm4, 31
+ psrld xmm5, 31
+ pslld xmm3, 1
+ pslld xmm0, 1
+ movdqa xmm6, xmm4
+ pslldq xmm4, 4
+ psrldq xmm6, 12
+ pslldq xmm5, 4
+ por xmm0, xmm6
+ por xmm3, xmm4
+ por xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm3
+ pslld xmm4, 31
+ pslld xmm5, 30
+ pslld xmm6, 25
+ pxor xmm4, xmm5
+ pxor xmm4, xmm6
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ pslldq xmm4, 12
+ pxor xmm3, xmm4
+ movdqa xmm6, xmm3
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ psrld xmm6, 1
+ psrld xmm7, 2
+ psrld xmm4, 7
+ pxor xmm6, xmm7
+ pxor xmm6, xmm4
+ pxor xmm6, xmm5
+ pxor xmm6, xmm3
+ pxor xmm0, xmm6
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ ; Encrypt counter
+ movdqa xmm4, OWORD PTR [ebp]
+ pxor xmm4, xmm0
+ aesenc xmm4, [ebp+16]
+ aesenc xmm4, [ebp+32]
+ aesenc xmm4, [ebp+48]
+ aesenc xmm4, [ebp+64]
+ aesenc xmm4, [ebp+80]
+ aesenc xmm4, [ebp+96]
+ aesenc xmm4, [ebp+112]
+ aesenc xmm4, [ebp+128]
+ aesenc xmm4, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ movdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_aesni_calc_iv_2_aesenc_avx_last
+ aesenc xmm4, xmm5
+ aesenc xmm4, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ movdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_aesni_calc_iv_2_aesenc_avx_last
+ aesenc xmm4, xmm5
+ aesenc xmm4, [ebp+208]
+ movdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_aesni_calc_iv_2_aesenc_avx_last:
+ aesenclast xmm4, xmm5
+ movdqu OWORD PTR [esp+80], xmm4
+L_AES_GCM_encrypt_aesni_iv_done:
+ mov esi, DWORD PTR [esp+140]
+ ; Additional authentication data
+ mov edx, DWORD PTR [esp+156]
+ cmp edx, 0
+ je L_AES_GCM_encrypt_aesni_calc_aad_done
+ xor ecx, ecx
+ cmp edx, 16
+ jl L_AES_GCM_encrypt_aesni_calc_aad_lt16
+ and edx, 4294967280
+L_AES_GCM_encrypt_aesni_calc_aad_16_loop:
+ movdqu xmm4, OWORD PTR [esi+ecx]
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm2, xmm4
+ pshufd xmm5, xmm2, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm2, 17
+ pclmulqdq xmm4, xmm2, 0
+ pxor xmm5, xmm2
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm4
+ movdqa xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm3, xmm6
+ pxor xmm2, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm2
+ psrld xmm4, 31
+ psrld xmm5, 31
+ pslld xmm3, 1
+ pslld xmm2, 1
+ movdqa xmm6, xmm4
+ pslldq xmm4, 4
+ psrldq xmm6, 12
+ pslldq xmm5, 4
+ por xmm2, xmm6
+ por xmm3, xmm4
+ por xmm2, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm3
+ pslld xmm4, 31
+ pslld xmm5, 30
+ pslld xmm6, 25
+ pxor xmm4, xmm5
+ pxor xmm4, xmm6
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ pslldq xmm4, 12
+ pxor xmm3, xmm4
+ movdqa xmm6, xmm3
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ psrld xmm6, 1
+ psrld xmm7, 2
+ psrld xmm4, 7
+ pxor xmm6, xmm7
+ pxor xmm6, xmm4
+ pxor xmm6, xmm5
+ pxor xmm6, xmm3
+ pxor xmm2, xmm6
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_aesni_calc_aad_16_loop
+ mov edx, DWORD PTR [esp+156]
+ cmp ecx, edx
+ je L_AES_GCM_encrypt_aesni_calc_aad_done
+L_AES_GCM_encrypt_aesni_calc_aad_lt16:
+ sub esp, 16
+ pxor xmm4, xmm4
+ xor ebx, ebx
+ movdqu OWORD PTR [esp], xmm4
+L_AES_GCM_encrypt_aesni_calc_aad_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_aesni_calc_aad_loop
+ movdqu xmm4, OWORD PTR [esp]
+ add esp, 16
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm2, xmm4
+ pshufd xmm5, xmm2, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm2, 17
+ pclmulqdq xmm4, xmm2, 0
+ pxor xmm5, xmm2
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm4
+ movdqa xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm3, xmm6
+ pxor xmm2, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm2
+ psrld xmm4, 31
+ psrld xmm5, 31
+ pslld xmm3, 1
+ pslld xmm2, 1
+ movdqa xmm6, xmm4
+ pslldq xmm4, 4
+ psrldq xmm6, 12
+ pslldq xmm5, 4
+ por xmm2, xmm6
+ por xmm3, xmm4
+ por xmm2, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm3
+ pslld xmm4, 31
+ pslld xmm5, 30
+ pslld xmm6, 25
+ pxor xmm4, xmm5
+ pxor xmm4, xmm6
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ pslldq xmm4, 12
+ pxor xmm3, xmm4
+ movdqa xmm6, xmm3
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ psrld xmm6, 1
+ psrld xmm7, 2
+ psrld xmm4, 7
+ pxor xmm6, xmm7
+ pxor xmm6, xmm4
+ pxor xmm6, xmm5
+ pxor xmm6, xmm3
+ pxor xmm2, xmm6
+L_AES_GCM_encrypt_aesni_calc_aad_done:
+ movdqu OWORD PTR [esp+96], xmm2
+ mov esi, DWORD PTR [esp+132]
+ mov edi, DWORD PTR [esp+136]
+ ; Calculate counter and H
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64
+ movdqa xmm5, xmm1
+ paddd xmm0, OWORD PTR L_aes_gcm_one
+ movdqa xmm4, xmm1
+ movdqu OWORD PTR [esp+64], xmm0
+ psrlq xmm5, 63
+ psllq xmm4, 1
+ pslldq xmm5, 8
+ por xmm4, xmm5
+ pshufd xmm1, xmm1, 255
+ psrad xmm1, 31
+ pand xmm1, OWORD PTR L_aes_gcm_mod2_128
+ pxor xmm1, xmm4
+ xor ebx, ebx
+ mov eax, DWORD PTR [esp+152]
+ cmp eax, 64
+ jl L_AES_GCM_encrypt_aesni_done_64
+ and eax, 4294967232
+ movdqa xmm6, xmm2
+ ; H ^ 1
+ movdqu OWORD PTR [esp], xmm1
+ ; H ^ 2
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm0, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm4, xmm6
+ pxor xmm0, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pslld xmm5, 31
+ pslld xmm6, 30
+ pslld xmm7, 25
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqa xmm7, xmm5
+ psrldq xmm7, 4
+ pslldq xmm5, 12
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ psrld xmm5, 1
+ psrld xmm6, 2
+ pxor xmm5, xmm6
+ pxor xmm5, xmm4
+ psrld xmm4, 7
+ pxor xmm5, xmm7
+ pxor xmm5, xmm4
+ pxor xmm0, xmm5
+ movdqu OWORD PTR [esp+16], xmm0
+ ; H ^ 3
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm0, 78
+ movdqa xmm7, xmm0
+ movdqa xmm4, xmm0
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm0
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm4, xmm6
+ pxor xmm3, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pslld xmm5, 31
+ pslld xmm6, 30
+ pslld xmm7, 25
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqa xmm7, xmm5
+ psrldq xmm7, 4
+ pslldq xmm5, 12
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ psrld xmm5, 1
+ psrld xmm6, 2
+ pxor xmm5, xmm6
+ pxor xmm5, xmm4
+ psrld xmm4, 7
+ pxor xmm5, xmm7
+ pxor xmm5, xmm4
+ pxor xmm3, xmm5
+ movdqu OWORD PTR [esp+32], xmm3
+ ; H ^ 4
+ pshufd xmm5, xmm0, 78
+ pshufd xmm6, xmm0, 78
+ movdqa xmm7, xmm0
+ movdqa xmm4, xmm0
+ pclmulqdq xmm7, xmm0, 17
+ pclmulqdq xmm4, xmm0, 0
+ pxor xmm5, xmm0
+ pxor xmm6, xmm0
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm4, xmm6
+ pxor xmm3, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pslld xmm5, 31
+ pslld xmm6, 30
+ pslld xmm7, 25
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqa xmm7, xmm5
+ psrldq xmm7, 4
+ pslldq xmm5, 12
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ psrld xmm5, 1
+ psrld xmm6, 2
+ pxor xmm5, xmm6
+ pxor xmm5, xmm4
+ psrld xmm4, 7
+ pxor xmm5, xmm7
+ pxor xmm5, xmm4
+ pxor xmm3, xmm5
+ movdqu OWORD PTR [esp+48], xmm3
+ ; First 64 bytes of input
+ ; Encrypt 64 bytes of counter
+ movdqu xmm4, OWORD PTR [esp+64]
+ movdqu xmm3, xmm4
+ paddd xmm3, OWORD PTR L_aes_gcm_four
+ movdqu OWORD PTR [esp+64], xmm3
+ movdqa xmm3, OWORD PTR L_aes_gcm_bswap_epi64
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pshufb xmm4, xmm3
+ paddd xmm5, OWORD PTR L_aes_gcm_one
+ pshufb xmm5, xmm3
+ paddd xmm6, OWORD PTR L_aes_gcm_two
+ pshufb xmm6, xmm3
+ paddd xmm7, OWORD PTR L_aes_gcm_three
+ pshufb xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp]
+ pxor xmm4, xmm3
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+16]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+32]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+48]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+64]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+80]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+96]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+112]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+128]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+144]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ cmp DWORD PTR [esp+172], 11
+ movdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_aesni_enc_done
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+176]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ cmp DWORD PTR [esp+172], 13
+ movdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_aesni_enc_done
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+208]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_aesni_enc_done:
+ aesenclast xmm4, xmm3
+ aesenclast xmm5, xmm3
+ movdqu xmm0, OWORD PTR [esi]
+ movdqu xmm1, OWORD PTR [esi+16]
+ pxor xmm4, xmm0
+ pxor xmm5, xmm1
+ movdqu OWORD PTR [edi], xmm4
+ movdqu OWORD PTR [edi+16], xmm5
+ aesenclast xmm6, xmm3
+ aesenclast xmm7, xmm3
+ movdqu xmm0, OWORD PTR [esi+32]
+ movdqu xmm1, OWORD PTR [esi+48]
+ pxor xmm6, xmm0
+ pxor xmm7, xmm1
+ movdqu OWORD PTR [edi+32], xmm6
+ movdqu OWORD PTR [edi+48], xmm7
+ cmp eax, 64
+ mov ebx, 64
+ mov ecx, esi
+ mov edx, edi
+ jle L_AES_GCM_encrypt_aesni_end_64
+ ; Process further 64-byte chunks of input
+L_AES_GCM_encrypt_aesni_ghash_64:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; Encrypt 64 bytes of counter
+ movdqu xmm4, OWORD PTR [esp+64]
+ movdqu xmm3, xmm4
+ paddd xmm3, OWORD PTR L_aes_gcm_four
+ movdqu OWORD PTR [esp+64], xmm3
+ movdqa xmm3, OWORD PTR L_aes_gcm_bswap_epi64
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pshufb xmm4, xmm3
+ paddd xmm5, OWORD PTR L_aes_gcm_one
+ pshufb xmm5, xmm3
+ paddd xmm6, OWORD PTR L_aes_gcm_two
+ pshufb xmm6, xmm3
+ paddd xmm7, OWORD PTR L_aes_gcm_three
+ pshufb xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp]
+ pxor xmm4, xmm3
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+16]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+32]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+48]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+64]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+80]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+96]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+112]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+128]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+144]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ cmp DWORD PTR [esp+172], 11
+ movdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_aesni_aesenc_64_ghash_avx_done
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+176]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ cmp DWORD PTR [esp+172], 13
+ movdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_aesni_aesenc_64_ghash_avx_done
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+208]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_aesni_aesenc_64_ghash_avx_done:
+ aesenclast xmm4, xmm3
+ aesenclast xmm5, xmm3
+ movdqu xmm0, OWORD PTR [ecx]
+ movdqu xmm1, OWORD PTR [ecx+16]
+ pxor xmm4, xmm0
+ pxor xmm5, xmm1
+ movdqu OWORD PTR [edx], xmm4
+ movdqu OWORD PTR [edx+16], xmm5
+ aesenclast xmm6, xmm3
+ aesenclast xmm7, xmm3
+ movdqu xmm0, OWORD PTR [ecx+32]
+ movdqu xmm1, OWORD PTR [ecx+48]
+ pxor xmm6, xmm0
+ pxor xmm7, xmm1
+ movdqu OWORD PTR [edx+32], xmm6
+ movdqu OWORD PTR [edx+48], xmm7
+ ; GHASH the previous 64 bytes of output ([edx-64] .. [edx-1])
+ movdqu xmm6, OWORD PTR [esp+96]
+ movdqu xmm3, OWORD PTR [esp+48]
+ movdqu xmm4, OWORD PTR [edx+-64]
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm6
+ pshufd xmm5, xmm3, 78
+ pshufd xmm1, xmm4, 78
+ pxor xmm5, xmm3
+ pxor xmm1, xmm4
+ movdqa xmm7, xmm4
+ pclmulqdq xmm7, xmm3, 17
+ movdqa xmm6, xmm4
+ pclmulqdq xmm6, xmm3, 0
+ pclmulqdq xmm5, xmm1, 0
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqu xmm3, OWORD PTR [esp+32]
+ movdqu xmm4, OWORD PTR [edx+-48]
+ pshufd xmm0, xmm3, 78
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm3
+ pshufd xmm1, xmm4, 78
+ pxor xmm1, xmm4
+ movdqa xmm2, xmm4
+ pclmulqdq xmm2, xmm3, 17
+ pclmulqdq xmm3, xmm4, 0
+ pclmulqdq xmm0, xmm1, 0
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm5, xmm2
+ pxor xmm7, xmm2
+ pxor xmm5, xmm0
+ movdqu xmm3, OWORD PTR [esp+16]
+ movdqu xmm4, OWORD PTR [edx+-32]
+ pshufd xmm0, xmm3, 78
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm3
+ pshufd xmm1, xmm4, 78
+ pxor xmm1, xmm4
+ movdqa xmm2, xmm4
+ pclmulqdq xmm2, xmm3, 17
+ pclmulqdq xmm3, xmm4, 0
+ pclmulqdq xmm0, xmm1, 0
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm5, xmm2
+ pxor xmm7, xmm2
+ pxor xmm5, xmm0
+ movdqu xmm3, OWORD PTR [esp]
+ movdqu xmm4, OWORD PTR [edx+-16]
+ pshufd xmm0, xmm3, 78
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm3
+ pshufd xmm1, xmm4, 78
+ pxor xmm1, xmm4
+ movdqa xmm2, xmm4
+ pclmulqdq xmm2, xmm3, 17
+ pclmulqdq xmm3, xmm4, 0
+ pclmulqdq xmm0, xmm1, 0
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm5, xmm2
+ pxor xmm7, xmm2
+ pxor xmm5, xmm0
+ movdqa xmm1, xmm5
+ psrldq xmm5, 8
+ pslldq xmm1, 8
+ pxor xmm6, xmm1
+ pxor xmm7, xmm5
+ movdqa xmm3, xmm6
+ movdqa xmm0, xmm6
+ movdqa xmm1, xmm6
+ pslld xmm3, 31
+ pslld xmm0, 30
+ pslld xmm1, 25
+ pxor xmm3, xmm0
+ pxor xmm3, xmm1
+ movdqa xmm0, xmm3
+ pslldq xmm3, 12
+ psrldq xmm0, 4
+ pxor xmm6, xmm3
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm6
+ movdqa xmm4, xmm6
+ psrld xmm1, 1
+ psrld xmm5, 2
+ psrld xmm4, 7
+ pxor xmm1, xmm5
+ pxor xmm1, xmm4
+ pxor xmm1, xmm0
+ pxor xmm6, xmm1
+ pxor xmm6, xmm7
+ movdqu OWORD PTR [esp+96], xmm6
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_aesni_ghash_64
+L_AES_GCM_encrypt_aesni_end_64:
+ movdqu xmm2, OWORD PTR [esp+96]
+ ; Block 1
+ movdqa xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm1, OWORD PTR [edx]
+ pshufb xmm1, xmm4
+ movdqu xmm3, OWORD PTR [esp+48]
+ pxor xmm1, xmm2
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm3, 78
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm3
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm0, xmm4
+ movdqa xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm0, xmm6
+ pxor xmm2, xmm5
+ ; Block 2
+ movdqa xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm1, OWORD PTR [edx+16]
+ pshufb xmm1, xmm4
+ movdqu xmm3, OWORD PTR [esp+32]
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm3, 78
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm3
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ pxor xmm0, xmm4
+ pxor xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm0, xmm6
+ pxor xmm2, xmm5
+ ; Block 3
+ movdqa xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm1, OWORD PTR [edx+32]
+ pshufb xmm1, xmm4
+ movdqu xmm3, OWORD PTR [esp+16]
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm3, 78
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm3
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ pxor xmm0, xmm4
+ pxor xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm0, xmm6
+ pxor xmm2, xmm5
+ ; Block 4
+ movdqa xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm1, OWORD PTR [edx+48]
+ pshufb xmm1, xmm4
+ movdqu xmm3, OWORD PTR [esp]
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm3, 78
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm3
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ pxor xmm0, xmm4
+ pxor xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm0, xmm6
+ pxor xmm2, xmm5
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm6, xmm0
+ pslld xmm4, 31
+ pslld xmm5, 30
+ pslld xmm6, 25
+ pxor xmm4, xmm5
+ pxor xmm4, xmm6
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ pslldq xmm4, 12
+ pxor xmm0, xmm4
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm4, xmm0
+ psrld xmm6, 1
+ psrld xmm7, 2
+ psrld xmm4, 7
+ pxor xmm6, xmm7
+ pxor xmm6, xmm4
+ pxor xmm6, xmm5
+ pxor xmm6, xmm0
+ pxor xmm2, xmm6
+ movdqu xmm1, OWORD PTR [esp]
+L_AES_GCM_encrypt_aesni_done_64:
+ mov edx, DWORD PTR [esp+152]
+ cmp ebx, edx
+ jge L_AES_GCM_encrypt_aesni_done_enc
+ mov eax, DWORD PTR [esp+152]
+ and eax, 4294967280
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_aesni_last_block_done
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ movdqu xmm4, OWORD PTR [esp+64]
+ movdqa xmm5, xmm4
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64
+ paddd xmm5, OWORD PTR L_aes_gcm_one
+ pxor xmm4, [ebp]
+ movdqu OWORD PTR [esp+64], xmm5
+ aesenc xmm4, [ebp+16]
+ aesenc xmm4, [ebp+32]
+ aesenc xmm4, [ebp+48]
+ aesenc xmm4, [ebp+64]
+ aesenc xmm4, [ebp+80]
+ aesenc xmm4, [ebp+96]
+ aesenc xmm4, [ebp+112]
+ aesenc xmm4, [ebp+128]
+ aesenc xmm4, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ movdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_aesni_aesenc_block_aesenc_avx_last
+ aesenc xmm4, xmm5
+ aesenc xmm4, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ movdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_aesni_aesenc_block_aesenc_avx_last
+ aesenc xmm4, xmm5
+ aesenc xmm4, [ebp+208]
+ movdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_aesni_aesenc_block_aesenc_avx_last:
+ aesenclast xmm4, xmm5
+ movdqu xmm5, OWORD PTR [ecx]
+ pxor xmm4, xmm5
+ movdqu OWORD PTR [edx], xmm4
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm2, xmm4
+ add ebx, 16
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_aesni_last_block_ghash
+L_AES_GCM_encrypt_aesni_last_block_start:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ movdqu xmm4, OWORD PTR [esp+64]
+ movdqa xmm5, xmm4
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64
+ paddd xmm5, OWORD PTR L_aes_gcm_one
+ pxor xmm4, [ebp]
+ movdqu OWORD PTR [esp+64], xmm5
+ movdqu xmm0, xmm2
+ pclmulqdq xmm0, xmm1, 16
+ aesenc xmm4, [ebp+16]
+ aesenc xmm4, [ebp+32]
+ movdqu xmm3, xmm2
+ pclmulqdq xmm3, xmm1, 1
+ aesenc xmm4, [ebp+48]
+ aesenc xmm4, [ebp+64]
+ aesenc xmm4, [ebp+80]
+ movdqu xmm5, xmm2
+ pclmulqdq xmm5, xmm1, 17
+ aesenc xmm4, [ebp+96]
+ pxor xmm0, xmm3
+ movdqa xmm6, xmm0
+ psrldq xmm0, 8
+ pslldq xmm6, 8
+ aesenc xmm4, [ebp+112]
+ movdqu xmm3, xmm2
+ pclmulqdq xmm3, xmm1, 0
+ pxor xmm6, xmm3
+ pxor xmm5, xmm0
+ movdqa xmm7, OWORD PTR L_aes_gcm_mod2_128
+ movdqa xmm3, xmm6
+ pclmulqdq xmm3, xmm7, 16
+ aesenc xmm4, [ebp+128]
+ pshufd xmm0, xmm6, 78
+ pxor xmm0, xmm3
+ movdqa xmm3, xmm0
+ pclmulqdq xmm3, xmm7, 16
+ aesenc xmm4, [ebp+144]
+ pshufd xmm2, xmm0, 78
+ pxor xmm2, xmm3
+ pxor xmm2, xmm5
+ cmp DWORD PTR [esp+172], 11
+ movdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_aesni_aesenc_gfmul_last
+ aesenc xmm4, xmm5
+ aesenc xmm4, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ movdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_aesni_aesenc_gfmul_last
+ aesenc xmm4, xmm5
+ aesenc xmm4, [ebp+208]
+ movdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_aesni_aesenc_gfmul_last:
+ aesenclast xmm4, xmm5
+ movdqu xmm5, OWORD PTR [ecx]
+ pxor xmm4, xmm5
+ movdqu OWORD PTR [edx], xmm4
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm2, xmm4
+ add ebx, 16
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_aesni_last_block_start
+L_AES_GCM_encrypt_aesni_last_block_ghash:
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm2, 78
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm2
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm4, xmm6
+ pxor xmm2, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pslld xmm5, 31
+ pslld xmm6, 30
+ pslld xmm7, 25
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqa xmm7, xmm5
+ psrldq xmm7, 4
+ pslldq xmm5, 12
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ psrld xmm5, 1
+ psrld xmm6, 2
+ pxor xmm5, xmm6
+ pxor xmm5, xmm4
+ psrld xmm4, 7
+ pxor xmm5, xmm7
+ pxor xmm5, xmm4
+ pxor xmm2, xmm5
+L_AES_GCM_encrypt_aesni_last_block_done:
+ mov ecx, DWORD PTR [esp+152]
+ mov edx, ecx
+ and ecx, 15
+ jz L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_done
+ movdqu xmm0, OWORD PTR [esp+64]
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64
+ pxor xmm0, [ebp]
+ aesenc xmm0, [ebp+16]
+ aesenc xmm0, [ebp+32]
+ aesenc xmm0, [ebp+48]
+ aesenc xmm0, [ebp+64]
+ aesenc xmm0, [ebp+80]
+ aesenc xmm0, [ebp+96]
+ aesenc xmm0, [ebp+112]
+ aesenc xmm0, [ebp+128]
+ aesenc xmm0, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ movdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_aesenc_avx_last
+ aesenc xmm0, xmm5
+ aesenc xmm0, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ movdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_aesenc_avx_last
+ aesenc xmm0, xmm5
+ aesenc xmm0, [ebp+208]
+ movdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_aesenc_avx_last:
+ aesenclast xmm0, xmm5
+ sub esp, 16
+ xor ecx, ecx
+ movdqu OWORD PTR [esp], xmm0
+L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_loop:
+ movzx eax, BYTE PTR [esi+ebx]
+ xor al, BYTE PTR [esp+ecx]
+ mov BYTE PTR [edi+ebx], al
+ mov BYTE PTR [esp+ecx], al
+ inc ebx
+ inc ecx
+ cmp ebx, edx
+ jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_loop
+ xor eax, eax
+ cmp ecx, 16
+ je L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_finish_enc
+L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_byte_loop:
+ mov BYTE PTR [esp+ecx], al
+ inc ecx
+ cmp ecx, 16
+ jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_byte_loop
+L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_finish_enc:
+ movdqu xmm0, OWORD PTR [esp]
+ add esp, 16
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm2, xmm0
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm2, 78
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm2
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm4, xmm6
+ pxor xmm2, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pslld xmm5, 31
+ pslld xmm6, 30
+ pslld xmm7, 25
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqa xmm7, xmm5
+ psrldq xmm7, 4
+ pslldq xmm5, 12
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ psrld xmm5, 1
+ psrld xmm6, 2
+ pxor xmm5, xmm6
+ pxor xmm5, xmm4
+ psrld xmm4, 7
+ pxor xmm5, xmm7
+ pxor xmm5, xmm4
+ pxor xmm2, xmm5
+L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_done:
+L_AES_GCM_encrypt_aesni_done_enc:
+ mov edi, DWORD PTR [esp+148]
+ mov ebx, DWORD PTR [esp+164]
+ mov edx, DWORD PTR [esp+152]
+ mov ecx, DWORD PTR [esp+156]
+ shl edx, 3
+ shl ecx, 3
+ pinsrd xmm4, edx, 0
+ pinsrd xmm4, ecx, 2
+ mov edx, DWORD PTR [esp+152]
+ mov ecx, DWORD PTR [esp+156]
+ shr edx, 29
+ shr ecx, 29
+ pinsrd xmm4, edx, 1
+ pinsrd xmm4, ecx, 3
+ pxor xmm2, xmm4
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm2, 78
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm2
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm4, xmm6
+ pxor xmm2, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pslld xmm5, 31
+ pslld xmm6, 30
+ pslld xmm7, 25
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqa xmm7, xmm5
+ psrldq xmm7, 4
+ pslldq xmm5, 12
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ psrld xmm5, 1
+ psrld xmm6, 2
+ pxor xmm5, xmm6
+ pxor xmm5, xmm4
+ psrld xmm4, 7
+ pxor xmm5, xmm7
+ pxor xmm5, xmm4
+ pxor xmm2, xmm5
+ pshufb xmm2, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm4, OWORD PTR [esp+80]
+ pxor xmm4, xmm2
+ cmp ebx, 16
+ je L_AES_GCM_encrypt_aesni_store_tag_16
+ xor ecx, ecx
+ movdqu OWORD PTR [esp], xmm4
+L_AES_GCM_encrypt_aesni_store_tag_loop:
+ movzx eax, BYTE PTR [esp+ecx]
+ mov BYTE PTR [edi+ecx], al
+ inc ecx
+ cmp ecx, ebx
+ jne L_AES_GCM_encrypt_aesni_store_tag_loop
+ jmp L_AES_GCM_encrypt_aesni_store_tag_done
+L_AES_GCM_encrypt_aesni_store_tag_16:
+ movdqu OWORD PTR [edi], xmm4
+L_AES_GCM_encrypt_aesni_store_tag_done:
+ add esp, 112
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_encrypt_aesni ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_aesni PROC
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 176
+ mov esi, DWORD PTR [esp+208]
+ mov ebp, DWORD PTR [esp+232]
+ mov edx, DWORD PTR [esp+224]
+ pxor xmm0, xmm0
+ pxor xmm2, xmm2
+ cmp edx, 12
+ jne L_AES_GCM_decrypt_aesni_iv_not_12
+ ; Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ mov ecx, 16777216
+ pinsrd xmm0, DWORD PTR [esi], 0
+ pinsrd xmm0, DWORD PTR [esi+4], 1
+ pinsrd xmm0, DWORD PTR [esi+8], 2
+ pinsrd xmm0, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ movdqa xmm5, xmm0
+ movdqa xmm1, OWORD PTR [ebp]
+ pxor xmm5, xmm1
+ movdqa xmm3, OWORD PTR [ebp+16]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+32]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+48]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+64]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+80]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+96]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+112]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+128]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+144]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ cmp DWORD PTR [esp+236], 11
+ movdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_aesni_calc_iv_12_last
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+176]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ cmp DWORD PTR [esp+236], 13
+ movdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_aesni_calc_iv_12_last
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+208]
+ aesenc xmm1, xmm3
+ aesenc xmm5, xmm3
+ movdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_aesni_calc_iv_12_last:
+ aesenclast xmm1, xmm3
+ aesenclast xmm5, xmm3
+ pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu OWORD PTR [esp+80], xmm5
+ jmp L_AES_GCM_decrypt_aesni_iv_done
+L_AES_GCM_decrypt_aesni_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ movdqa xmm1, OWORD PTR [ebp]
+ aesenc xmm1, [ebp+16]
+ aesenc xmm1, [ebp+32]
+ aesenc xmm1, [ebp+48]
+ aesenc xmm1, [ebp+64]
+ aesenc xmm1, [ebp+80]
+ aesenc xmm1, [ebp+96]
+ aesenc xmm1, [ebp+112]
+ aesenc xmm1, [ebp+128]
+ aesenc xmm1, [ebp+144]
+ cmp DWORD PTR [esp+236], 11
+ movdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_aesni_calc_iv_1_aesenc_avx_last
+ aesenc xmm1, xmm5
+ aesenc xmm1, [ebp+176]
+ cmp DWORD PTR [esp+236], 13
+ movdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_aesni_calc_iv_1_aesenc_avx_last
+ aesenc xmm1, xmm5
+ aesenc xmm1, [ebp+208]
+ movdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_aesni_calc_iv_1_aesenc_avx_last:
+ aesenclast xmm1, xmm5
+ pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ cmp edx, 0
+ mov ecx, 0
+ je L_AES_GCM_decrypt_aesni_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_decrypt_aesni_calc_iv_lt16
+ and edx, 4294967280
+L_AES_GCM_decrypt_aesni_calc_iv_16_loop:
+ movdqu xmm4, OWORD PTR [esi+ecx]
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm4
+ pshufd xmm5, xmm0, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm0, 17
+ pclmulqdq xmm4, xmm0, 0
+ pxor xmm5, xmm0
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm3, xmm6
+ pxor xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm0
+ psrld xmm4, 31
+ psrld xmm5, 31
+ pslld xmm3, 1
+ pslld xmm0, 1
+ movdqa xmm6, xmm4
+ pslldq xmm4, 4
+ psrldq xmm6, 12
+ pslldq xmm5, 4
+ por xmm0, xmm6
+ por xmm3, xmm4
+ por xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm3
+ pslld xmm4, 31
+ pslld xmm5, 30
+ pslld xmm6, 25
+ pxor xmm4, xmm5
+ pxor xmm4, xmm6
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ pslldq xmm4, 12
+ pxor xmm3, xmm4
+ movdqa xmm6, xmm3
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ psrld xmm6, 1
+ psrld xmm7, 2
+ psrld xmm4, 7
+ pxor xmm6, xmm7
+ pxor xmm6, xmm4
+ pxor xmm6, xmm5
+ pxor xmm6, xmm3
+ pxor xmm0, xmm6
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_aesni_calc_iv_16_loop
+ mov edx, DWORD PTR [esp+224]
+ cmp ecx, edx
+ je L_AES_GCM_decrypt_aesni_calc_iv_done
+L_AES_GCM_decrypt_aesni_calc_iv_lt16:
+ sub esp, 16
+ pxor xmm4, xmm4
+ xor ebx, ebx
+ movdqu OWORD PTR [esp], xmm4
+L_AES_GCM_decrypt_aesni_calc_iv_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_aesni_calc_iv_loop
+ movdqu xmm4, OWORD PTR [esp]
+ add esp, 16
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm4
+ pshufd xmm5, xmm0, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm0, 17
+ pclmulqdq xmm4, xmm0, 0
+ pxor xmm5, xmm0
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm3, xmm6
+ pxor xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm0
+ psrld xmm4, 31
+ psrld xmm5, 31
+ pslld xmm3, 1
+ pslld xmm0, 1
+ movdqa xmm6, xmm4
+ pslldq xmm4, 4
+ psrldq xmm6, 12
+ pslldq xmm5, 4
+ por xmm0, xmm6
+ por xmm3, xmm4
+ por xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm3
+ pslld xmm4, 31
+ pslld xmm5, 30
+ pslld xmm6, 25
+ pxor xmm4, xmm5
+ pxor xmm4, xmm6
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ pslldq xmm4, 12
+ pxor xmm3, xmm4
+ movdqa xmm6, xmm3
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ psrld xmm6, 1
+ psrld xmm7, 2
+ psrld xmm4, 7
+ pxor xmm6, xmm7
+ pxor xmm6, xmm4
+ pxor xmm6, xmm5
+ pxor xmm6, xmm3
+ pxor xmm0, xmm6
+L_AES_GCM_decrypt_aesni_calc_iv_done:
+ ; Fold the IV length in bits into the hash to form the counter; T = Encrypt counter
+ pxor xmm4, xmm4
+ shl edx, 3
+ pinsrd xmm4, edx, 0
+ pxor xmm0, xmm4
+ pshufd xmm5, xmm0, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm0, 17
+ pclmulqdq xmm4, xmm0, 0
+ pxor xmm5, xmm0
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm3, xmm6
+ pxor xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm0
+ psrld xmm4, 31
+ psrld xmm5, 31
+ pslld xmm3, 1
+ pslld xmm0, 1
+ movdqa xmm6, xmm4
+ pslldq xmm4, 4
+ psrldq xmm6, 12
+ pslldq xmm5, 4
+ por xmm0, xmm6
+ por xmm3, xmm4
+ por xmm0, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm3
+ pslld xmm4, 31
+ pslld xmm5, 30
+ pslld xmm6, 25
+ pxor xmm4, xmm5
+ pxor xmm4, xmm6
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ pslldq xmm4, 12
+ pxor xmm3, xmm4
+ movdqa xmm6, xmm3
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ psrld xmm6, 1
+ psrld xmm7, 2
+ psrld xmm4, 7
+ pxor xmm6, xmm7
+ pxor xmm6, xmm4
+ pxor xmm6, xmm5
+ pxor xmm6, xmm3
+ pxor xmm0, xmm6
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ ; Encrypt counter
+ movdqa xmm4, OWORD PTR [ebp]
+ pxor xmm4, xmm0
+ aesenc xmm4, [ebp+16]
+ aesenc xmm4, [ebp+32]
+ aesenc xmm4, [ebp+48]
+ aesenc xmm4, [ebp+64]
+ aesenc xmm4, [ebp+80]
+ aesenc xmm4, [ebp+96]
+ aesenc xmm4, [ebp+112]
+ aesenc xmm4, [ebp+128]
+ aesenc xmm4, [ebp+144]
+ cmp DWORD PTR [esp+236], 11
+ movdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_aesni_calc_iv_2_aesenc_avx_last
+ aesenc xmm4, xmm5
+ aesenc xmm4, [ebp+176]
+ cmp DWORD PTR [esp+236], 13
+ movdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_aesni_calc_iv_2_aesenc_avx_last
+ aesenc xmm4, xmm5
+ aesenc xmm4, [ebp+208]
+ movdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_aesni_calc_iv_2_aesenc_avx_last:
+ aesenclast xmm4, xmm5
+ movdqu OWORD PTR [esp+80], xmm4
+L_AES_GCM_decrypt_aesni_iv_done:
+ mov esi, DWORD PTR [esp+204]
+ ; Additional authentication data
+ mov edx, DWORD PTR [esp+220]
+ cmp edx, 0
+ je L_AES_GCM_decrypt_aesni_calc_aad_done
+ xor ecx, ecx
+ cmp edx, 16
+ jl L_AES_GCM_decrypt_aesni_calc_aad_lt16
+ and edx, 4294967280
+L_AES_GCM_decrypt_aesni_calc_aad_16_loop:
+ movdqu xmm4, OWORD PTR [esi+ecx]
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm2, xmm4
+ pshufd xmm5, xmm2, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm2, 17
+ pclmulqdq xmm4, xmm2, 0
+ pxor xmm5, xmm2
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm4
+ movdqa xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm3, xmm6
+ pxor xmm2, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm2
+ psrld xmm4, 31
+ psrld xmm5, 31
+ pslld xmm3, 1
+ pslld xmm2, 1
+ movdqa xmm6, xmm4
+ pslldq xmm4, 4
+ psrldq xmm6, 12
+ pslldq xmm5, 4
+ por xmm2, xmm6
+ por xmm3, xmm4
+ por xmm2, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm3
+ pslld xmm4, 31
+ pslld xmm5, 30
+ pslld xmm6, 25
+ pxor xmm4, xmm5
+ pxor xmm4, xmm6
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ pslldq xmm4, 12
+ pxor xmm3, xmm4
+ movdqa xmm6, xmm3
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ psrld xmm6, 1
+ psrld xmm7, 2
+ psrld xmm4, 7
+ pxor xmm6, xmm7
+ pxor xmm6, xmm4
+ pxor xmm6, xmm5
+ pxor xmm6, xmm3
+ pxor xmm2, xmm6
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_aesni_calc_aad_16_loop
+ mov edx, DWORD PTR [esp+220]
+ cmp ecx, edx
+ je L_AES_GCM_decrypt_aesni_calc_aad_done
+L_AES_GCM_decrypt_aesni_calc_aad_lt16:
+ sub esp, 16
+ pxor xmm4, xmm4
+ xor ebx, ebx
+ movdqu OWORD PTR [esp], xmm4
+L_AES_GCM_decrypt_aesni_calc_aad_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_aesni_calc_aad_loop
+ movdqu xmm4, OWORD PTR [esp]
+ add esp, 16
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm2, xmm4
+ pshufd xmm5, xmm2, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm2, 17
+ pclmulqdq xmm4, xmm2, 0
+ pxor xmm5, xmm2
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm4
+ movdqa xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm3, xmm6
+ pxor xmm2, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm2
+ psrld xmm4, 31
+ psrld xmm5, 31
+ pslld xmm3, 1
+ pslld xmm2, 1
+ movdqa xmm6, xmm4
+ pslldq xmm4, 4
+ psrldq xmm6, 12
+ pslldq xmm5, 4
+ por xmm2, xmm6
+ por xmm3, xmm4
+ por xmm2, xmm5
+ movdqa xmm4, xmm3
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm3
+ pslld xmm4, 31
+ pslld xmm5, 30
+ pslld xmm6, 25
+ pxor xmm4, xmm5
+ pxor xmm4, xmm6
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ pslldq xmm4, 12
+ pxor xmm3, xmm4
+ movdqa xmm6, xmm3
+ movdqa xmm7, xmm3
+ movdqa xmm4, xmm3
+ psrld xmm6, 1
+ psrld xmm7, 2
+ psrld xmm4, 7
+ pxor xmm6, xmm7
+ pxor xmm6, xmm4
+ pxor xmm6, xmm5
+ pxor xmm6, xmm3
+ pxor xmm2, xmm6
+L_AES_GCM_decrypt_aesni_calc_aad_done:
+ movdqu OWORD PTR [esp+96], xmm2
+ mov esi, DWORD PTR [esp+196]
+ mov edi, DWORD PTR [esp+200]
+ ; Calculate counter and H
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64
+ movdqa xmm5, xmm1
+ paddd xmm0, OWORD PTR L_aes_gcm_one
+ movdqa xmm4, xmm1
+ movdqu OWORD PTR [esp+64], xmm0
+ psrlq xmm5, 63
+ psllq xmm4, 1
+ pslldq xmm5, 8
+ por xmm4, xmm5
+ pshufd xmm1, xmm1, 255
+ psrad xmm1, 31
+ pand xmm1, OWORD PTR L_aes_gcm_mod2_128
+ pxor xmm1, xmm4
+ xor ebx, ebx
+ cmp DWORD PTR [esp+216], 64
+ mov eax, DWORD PTR [esp+216]
+ jl L_AES_GCM_decrypt_aesni_done_64
+ and eax, 4294967232
+ movdqa xmm6, xmm2
+ ; H ^ 1
+ movdqu OWORD PTR [esp], xmm1
+ ; H ^ 2
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm1, 78
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm1
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm1
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm0, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm4, xmm6
+ pxor xmm0, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pslld xmm5, 31
+ pslld xmm6, 30
+ pslld xmm7, 25
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqa xmm7, xmm5
+ psrldq xmm7, 4
+ pslldq xmm5, 12
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ psrld xmm5, 1
+ psrld xmm6, 2
+ pxor xmm5, xmm6
+ pxor xmm5, xmm4
+ psrld xmm4, 7
+ pxor xmm5, xmm7
+ pxor xmm5, xmm4
+ pxor xmm0, xmm5
+ movdqu OWORD PTR [esp+16], xmm0
+ ; H ^ 3
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm0, 78
+ movdqa xmm7, xmm0
+ movdqa xmm4, xmm0
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm0
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm4, xmm6
+ pxor xmm3, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pslld xmm5, 31
+ pslld xmm6, 30
+ pslld xmm7, 25
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqa xmm7, xmm5
+ psrldq xmm7, 4
+ pslldq xmm5, 12
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ psrld xmm5, 1
+ psrld xmm6, 2
+ pxor xmm5, xmm6
+ pxor xmm5, xmm4
+ psrld xmm4, 7
+ pxor xmm5, xmm7
+ pxor xmm5, xmm4
+ pxor xmm3, xmm5
+ movdqu OWORD PTR [esp+32], xmm3
+ ; H ^ 4
+ pshufd xmm5, xmm0, 78
+ pshufd xmm6, xmm0, 78
+ movdqa xmm7, xmm0
+ movdqa xmm4, xmm0
+ pclmulqdq xmm7, xmm0, 17
+ pclmulqdq xmm4, xmm0, 0
+ pxor xmm5, xmm0
+ pxor xmm6, xmm0
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm3, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm4, xmm6
+ pxor xmm3, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pslld xmm5, 31
+ pslld xmm6, 30
+ pslld xmm7, 25
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqa xmm7, xmm5
+ psrldq xmm7, 4
+ pslldq xmm5, 12
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ psrld xmm5, 1
+ psrld xmm6, 2
+ pxor xmm5, xmm6
+ pxor xmm5, xmm4
+ psrld xmm4, 7
+ pxor xmm5, xmm7
+ pxor xmm5, xmm4
+ pxor xmm3, xmm5
+ movdqu OWORD PTR [esp+48], xmm3
+ cmp edi, esi
+ jne L_AES_GCM_decrypt_aesni_ghash_64
+L_AES_GCM_decrypt_aesni_ghash_64_inplace:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; Encrypt 64 bytes of counter
+ movdqu xmm4, OWORD PTR [esp+64]
+ movdqu xmm3, xmm4
+ paddd xmm3, OWORD PTR L_aes_gcm_four
+ movdqu OWORD PTR [esp+64], xmm3
+ movdqa xmm3, OWORD PTR L_aes_gcm_bswap_epi64
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pshufb xmm4, xmm3
+ paddd xmm5, OWORD PTR L_aes_gcm_one
+ pshufb xmm5, xmm3
+ paddd xmm6, OWORD PTR L_aes_gcm_two
+ pshufb xmm6, xmm3
+ paddd xmm7, OWORD PTR L_aes_gcm_three
+ pshufb xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp]
+ pxor xmm4, xmm3
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+16]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+32]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+48]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+64]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+80]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+96]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+112]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+128]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+144]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ cmp DWORD PTR [esp+236], 11
+ movdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_aesniinplace_aesenc_64_ghash_avx_done
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+176]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ cmp DWORD PTR [esp+236], 13
+ movdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_aesniinplace_aesenc_64_ghash_avx_done
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+208]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_aesniinplace_aesenc_64_ghash_avx_done:
+ aesenclast xmm4, xmm3
+ aesenclast xmm5, xmm3
+ movdqu xmm0, OWORD PTR [ecx]
+ movdqu xmm1, OWORD PTR [ecx+16]
+ pxor xmm4, xmm0
+ pxor xmm5, xmm1
+ movdqu OWORD PTR [esp+112], xmm0
+ movdqu OWORD PTR [esp+128], xmm1
+ movdqu OWORD PTR [edx], xmm4
+ movdqu OWORD PTR [edx+16], xmm5
+ aesenclast xmm6, xmm3
+ aesenclast xmm7, xmm3
+ movdqu xmm0, OWORD PTR [ecx+32]
+ movdqu xmm1, OWORD PTR [ecx+48]
+ pxor xmm6, xmm0
+ pxor xmm7, xmm1
+ movdqu OWORD PTR [esp+144], xmm0
+ movdqu OWORD PTR [esp+160], xmm1
+ movdqu OWORD PTR [edx+32], xmm6
+ movdqu OWORD PTR [edx+48], xmm7
+ ; ghash encrypted counter
+ movdqu xmm6, OWORD PTR [esp+96]
+ movdqu xmm3, OWORD PTR [esp+48]
+ movdqu xmm4, OWORD PTR [esp+112]
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm6
+ pshufd xmm5, xmm3, 78
+ pshufd xmm1, xmm4, 78
+ pxor xmm5, xmm3
+ pxor xmm1, xmm4
+ movdqa xmm7, xmm4
+ pclmulqdq xmm7, xmm3, 17
+ movdqa xmm6, xmm4
+ pclmulqdq xmm6, xmm3, 0
+ pclmulqdq xmm5, xmm1, 0
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqu xmm3, OWORD PTR [esp+32]
+ movdqu xmm4, OWORD PTR [esp+128]
+ pshufd xmm0, xmm3, 78
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm3
+ pshufd xmm1, xmm4, 78
+ pxor xmm1, xmm4
+ movdqa xmm2, xmm4
+ pclmulqdq xmm2, xmm3, 17
+ pclmulqdq xmm3, xmm4, 0
+ pclmulqdq xmm0, xmm1, 0
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm5, xmm2
+ pxor xmm7, xmm2
+ pxor xmm5, xmm0
+ movdqu xmm3, OWORD PTR [esp+16]
+ movdqu xmm4, OWORD PTR [esp+144]
+ pshufd xmm0, xmm3, 78
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm3
+ pshufd xmm1, xmm4, 78
+ pxor xmm1, xmm4
+ movdqa xmm2, xmm4
+ pclmulqdq xmm2, xmm3, 17
+ pclmulqdq xmm3, xmm4, 0
+ pclmulqdq xmm0, xmm1, 0
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm5, xmm2
+ pxor xmm7, xmm2
+ pxor xmm5, xmm0
+ movdqu xmm3, OWORD PTR [esp]
+ movdqu xmm4, OWORD PTR [esp+160]
+ pshufd xmm0, xmm3, 78
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm3
+ pshufd xmm1, xmm4, 78
+ pxor xmm1, xmm4
+ movdqa xmm2, xmm4
+ pclmulqdq xmm2, xmm3, 17
+ pclmulqdq xmm3, xmm4, 0
+ pclmulqdq xmm0, xmm1, 0
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm5, xmm2
+ pxor xmm7, xmm2
+ pxor xmm5, xmm0
+ movdqa xmm1, xmm5
+ psrldq xmm5, 8
+ pslldq xmm1, 8
+ pxor xmm6, xmm1
+ pxor xmm7, xmm5
+ movdqa xmm3, xmm6
+ movdqa xmm0, xmm6
+ movdqa xmm1, xmm6
+ pslld xmm3, 31
+ pslld xmm0, 30
+ pslld xmm1, 25
+ pxor xmm3, xmm0
+ pxor xmm3, xmm1
+ movdqa xmm0, xmm3
+ pslldq xmm3, 12
+ psrldq xmm0, 4
+ pxor xmm6, xmm3
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm6
+ movdqa xmm4, xmm6
+ psrld xmm1, 1
+ psrld xmm5, 2
+ psrld xmm4, 7
+ pxor xmm1, xmm5
+ pxor xmm1, xmm4
+ pxor xmm1, xmm0
+ pxor xmm6, xmm1
+ pxor xmm6, xmm7
+ movdqu OWORD PTR [esp+96], xmm6
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_aesni_ghash_64_inplace
+ jmp L_AES_GCM_decrypt_aesni_ghash_64_done
+L_AES_GCM_decrypt_aesni_ghash_64:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; Encrypt 64 bytes of counter
+ movdqu xmm4, OWORD PTR [esp+64]
+ movdqu xmm3, xmm4
+ paddd xmm3, OWORD PTR L_aes_gcm_four
+ movdqu OWORD PTR [esp+64], xmm3
+ movdqa xmm3, OWORD PTR L_aes_gcm_bswap_epi64
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pshufb xmm4, xmm3
+ paddd xmm5, OWORD PTR L_aes_gcm_one
+ pshufb xmm5, xmm3
+ paddd xmm6, OWORD PTR L_aes_gcm_two
+ pshufb xmm6, xmm3
+ paddd xmm7, OWORD PTR L_aes_gcm_three
+ pshufb xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp]
+ pxor xmm4, xmm3
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+16]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+32]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+48]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+64]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+80]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+96]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+112]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+128]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+144]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ cmp DWORD PTR [esp+236], 11
+ movdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+176]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ cmp DWORD PTR [esp+236], 13
+ movdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+208]
+ aesenc xmm4, xmm3
+ aesenc xmm5, xmm3
+ aesenc xmm6, xmm3
+ aesenc xmm7, xmm3
+ movdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done:
+ aesenclast xmm4, xmm3
+ aesenclast xmm5, xmm3
+ movdqu xmm0, OWORD PTR [ecx]
+ movdqu xmm1, OWORD PTR [ecx+16]
+ pxor xmm4, xmm0
+ pxor xmm5, xmm1
+ movdqu OWORD PTR [edx], xmm4
+ movdqu OWORD PTR [edx+16], xmm5
+ aesenclast xmm6, xmm3
+ aesenclast xmm7, xmm3
+ movdqu xmm0, OWORD PTR [ecx+32]
+ movdqu xmm1, OWORD PTR [ecx+48]
+ pxor xmm6, xmm0
+ pxor xmm7, xmm1
+ movdqu OWORD PTR [edx+32], xmm6
+ movdqu OWORD PTR [edx+48], xmm7
+ ; ghash encrypted counter
+ movdqu xmm6, OWORD PTR [esp+96]
+ movdqu xmm3, OWORD PTR [esp+48]
+ movdqu xmm4, OWORD PTR [ecx]
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm6
+ pshufd xmm5, xmm3, 78
+ pshufd xmm1, xmm4, 78
+ pxor xmm5, xmm3
+ pxor xmm1, xmm4
+ movdqa xmm7, xmm4
+ pclmulqdq xmm7, xmm3, 17
+ movdqa xmm6, xmm4
+ pclmulqdq xmm6, xmm3, 0
+ pclmulqdq xmm5, xmm1, 0
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqu xmm3, OWORD PTR [esp+32]
+ movdqu xmm4, OWORD PTR [ecx+16]
+ pshufd xmm0, xmm3, 78
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm3
+ pshufd xmm1, xmm4, 78
+ pxor xmm1, xmm4
+ movdqa xmm2, xmm4
+ pclmulqdq xmm2, xmm3, 17
+ pclmulqdq xmm3, xmm4, 0
+ pclmulqdq xmm0, xmm1, 0
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm5, xmm2
+ pxor xmm7, xmm2
+ pxor xmm5, xmm0
+ movdqu xmm3, OWORD PTR [esp+16]
+ movdqu xmm4, OWORD PTR [ecx+32]
+ pshufd xmm0, xmm3, 78
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm3
+ pshufd xmm1, xmm4, 78
+ pxor xmm1, xmm4
+ movdqa xmm2, xmm4
+ pclmulqdq xmm2, xmm3, 17
+ pclmulqdq xmm3, xmm4, 0
+ pclmulqdq xmm0, xmm1, 0
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm5, xmm2
+ pxor xmm7, xmm2
+ pxor xmm5, xmm0
+ movdqu xmm3, OWORD PTR [esp]
+ movdqu xmm4, OWORD PTR [ecx+48]
+ pshufd xmm0, xmm3, 78
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm3
+ pshufd xmm1, xmm4, 78
+ pxor xmm1, xmm4
+ movdqa xmm2, xmm4
+ pclmulqdq xmm2, xmm3, 17
+ pclmulqdq xmm3, xmm4, 0
+ pclmulqdq xmm0, xmm1, 0
+ pxor xmm5, xmm3
+ pxor xmm6, xmm3
+ pxor xmm5, xmm2
+ pxor xmm7, xmm2
+ pxor xmm5, xmm0
+ movdqa xmm1, xmm5
+ psrldq xmm5, 8
+ pslldq xmm1, 8
+ pxor xmm6, xmm1
+ pxor xmm7, xmm5
+ movdqa xmm3, xmm6
+ movdqa xmm0, xmm6
+ movdqa xmm1, xmm6
+ pslld xmm3, 31
+ pslld xmm0, 30
+ pslld xmm1, 25
+ pxor xmm3, xmm0
+ pxor xmm3, xmm1
+ movdqa xmm0, xmm3
+ pslldq xmm3, 12
+ psrldq xmm0, 4
+ pxor xmm6, xmm3
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm6
+ movdqa xmm4, xmm6
+ psrld xmm1, 1
+ psrld xmm5, 2
+ psrld xmm4, 7
+ pxor xmm1, xmm5
+ pxor xmm1, xmm4
+ pxor xmm1, xmm0
+ pxor xmm6, xmm1
+ pxor xmm6, xmm7
+ movdqu OWORD PTR [esp+96], xmm6
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_aesni_ghash_64
+L_AES_GCM_decrypt_aesni_ghash_64_done:
+ movdqa xmm2, xmm6
+ movdqu xmm1, OWORD PTR [esp]
+L_AES_GCM_decrypt_aesni_done_64:
+ mov edx, DWORD PTR [esp+216]
+ cmp ebx, edx
+ jge L_AES_GCM_decrypt_aesni_done_dec
+ mov eax, DWORD PTR [esp+216]
+ and eax, 4294967280
+ cmp ebx, eax
+ jge L_AES_GCM_decrypt_aesni_last_block_done
+L_AES_GCM_decrypt_aesni_last_block_start:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ movdqu xmm5, OWORD PTR [ecx]
+ pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm5, xmm2
+ movdqu OWORD PTR [esp], xmm5
+ movdqu xmm4, OWORD PTR [esp+64]
+ movdqa xmm5, xmm4
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64
+ paddd xmm5, OWORD PTR L_aes_gcm_one
+ pxor xmm4, [ebp]
+ movdqu OWORD PTR [esp+64], xmm5
+ movdqu xmm0, OWORD PTR [esp]
+ pclmulqdq xmm0, xmm1, 16
+ aesenc xmm4, [ebp+16]
+ aesenc xmm4, [ebp+32]
+ movdqu xmm3, OWORD PTR [esp]
+ pclmulqdq xmm3, xmm1, 1
+ aesenc xmm4, [ebp+48]
+ aesenc xmm4, [ebp+64]
+ aesenc xmm4, [ebp+80]
+ movdqu xmm5, OWORD PTR [esp]
+ pclmulqdq xmm5, xmm1, 17
+ aesenc xmm4, [ebp+96]
+ pxor xmm0, xmm3
+ movdqa xmm6, xmm0
+ psrldq xmm0, 8
+ pslldq xmm6, 8
+ aesenc xmm4, [ebp+112]
+ movdqu xmm3, OWORD PTR [esp]
+ pclmulqdq xmm3, xmm1, 0
+ pxor xmm6, xmm3
+ pxor xmm5, xmm0
+ movdqa xmm7, OWORD PTR L_aes_gcm_mod2_128
+ movdqa xmm3, xmm6
+ pclmulqdq xmm3, xmm7, 16
+ aesenc xmm4, [ebp+128]
+ pshufd xmm0, xmm6, 78
+ pxor xmm0, xmm3
+ movdqa xmm3, xmm0
+ pclmulqdq xmm3, xmm7, 16
+ aesenc xmm4, [ebp+144]
+ pshufd xmm2, xmm0, 78
+ pxor xmm2, xmm3
+ pxor xmm2, xmm5
+ cmp DWORD PTR [esp+236], 11
+ movdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_aesni_aesenc_gfmul_last
+ aesenc xmm4, xmm5
+ aesenc xmm4, [ebp+176]
+ cmp DWORD PTR [esp+236], 13
+ movdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_aesni_aesenc_gfmul_last
+ aesenc xmm4, xmm5
+ aesenc xmm4, [ebp+208]
+ movdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_aesni_aesenc_gfmul_last:
+ aesenclast xmm4, xmm5
+ movdqu xmm5, OWORD PTR [ecx]
+ pxor xmm4, xmm5
+ movdqu OWORD PTR [edx], xmm4
+ add ebx, 16
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_aesni_last_block_start
+L_AES_GCM_decrypt_aesni_last_block_done:
+ mov ecx, DWORD PTR [esp+216]
+ mov edx, ecx
+ and ecx, 15
+ jz L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_done
+ movdqu xmm0, OWORD PTR [esp+64]
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64
+ pxor xmm0, [ebp]
+ aesenc xmm0, [ebp+16]
+ aesenc xmm0, [ebp+32]
+ aesenc xmm0, [ebp+48]
+ aesenc xmm0, [ebp+64]
+ aesenc xmm0, [ebp+80]
+ aesenc xmm0, [ebp+96]
+ aesenc xmm0, [ebp+112]
+ aesenc xmm0, [ebp+128]
+ aesenc xmm0, [ebp+144]
+ cmp DWORD PTR [esp+236], 11
+ movdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_aesenc_avx_last
+ aesenc xmm0, xmm5
+ aesenc xmm0, [ebp+176]
+ cmp DWORD PTR [esp+236], 13
+ movdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_aesenc_avx_last
+ aesenc xmm0, xmm5
+ aesenc xmm0, [ebp+208]
+ movdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_aesenc_avx_last:
+ aesenclast xmm0, xmm5
+ sub esp, 32
+ xor ecx, ecx
+ movdqu OWORD PTR [esp], xmm0
+ pxor xmm4, xmm4
+ movdqu OWORD PTR [esp+16], xmm4
+L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_loop:
+ movzx eax, BYTE PTR [esi+ebx]
+ mov BYTE PTR [esp+ecx+16], al
+ xor al, BYTE PTR [esp+ecx]
+ mov BYTE PTR [edi+ebx], al
+ inc ebx
+ inc ecx
+ cmp ebx, edx
+ jl L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_loop
+ movdqu xmm0, OWORD PTR [esp+16]
+ add esp, 32
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm2, xmm0
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm2, 78
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm2
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm4, xmm6
+ pxor xmm2, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pslld xmm5, 31
+ pslld xmm6, 30
+ pslld xmm7, 25
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqa xmm7, xmm5
+ psrldq xmm7, 4
+ pslldq xmm5, 12
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ psrld xmm5, 1
+ psrld xmm6, 2
+ pxor xmm5, xmm6
+ pxor xmm5, xmm4
+ psrld xmm4, 7
+ pxor xmm5, xmm7
+ pxor xmm5, xmm4
+ pxor xmm2, xmm5
+L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_done:
+L_AES_GCM_decrypt_aesni_done_dec:
+ mov esi, DWORD PTR [esp+212]
+ mov ebp, DWORD PTR [esp+228]
+ mov edx, DWORD PTR [esp+216]
+ mov ecx, DWORD PTR [esp+220]
+ shl edx, 3
+ shl ecx, 3
+ pinsrd xmm4, edx, 0
+ pinsrd xmm4, ecx, 2
+ mov edx, DWORD PTR [esp+216]
+ mov ecx, DWORD PTR [esp+220]
+ shr edx, 29
+ shr ecx, 29
+ pinsrd xmm4, edx, 1
+ pinsrd xmm4, ecx, 3
+ pxor xmm2, xmm4
+ pshufd xmm5, xmm1, 78
+ pshufd xmm6, xmm2, 78
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ pclmulqdq xmm7, xmm1, 17
+ pclmulqdq xmm4, xmm1, 0
+ pxor xmm5, xmm1
+ pxor xmm6, xmm2
+ pclmulqdq xmm5, xmm6, 0
+ pxor xmm5, xmm4
+ pxor xmm5, xmm7
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm7
+ pslldq xmm6, 8
+ psrldq xmm5, 8
+ pxor xmm4, xmm6
+ pxor xmm2, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ pslld xmm5, 31
+ pslld xmm6, 30
+ pslld xmm7, 25
+ pxor xmm5, xmm6
+ pxor xmm5, xmm7
+ movdqa xmm7, xmm5
+ psrldq xmm7, 4
+ pslldq xmm5, 12
+ pxor xmm4, xmm5
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ psrld xmm5, 1
+ psrld xmm6, 2
+ pxor xmm5, xmm6
+ pxor xmm5, xmm4
+ psrld xmm4, 7
+ pxor xmm5, xmm7
+ pxor xmm5, xmm4
+ pxor xmm2, xmm5
+ pshufb xmm2, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm4, OWORD PTR [esp+80]
+ pxor xmm4, xmm2
+ mov edi, DWORD PTR [esp+240]
+ cmp ebp, 16
+ je L_AES_GCM_decrypt_aesni_cmp_tag_16
+ sub esp, 16
+ xor ecx, ecx
+ xor ebx, ebx
+ movdqu OWORD PTR [esp], xmm4
+L_AES_GCM_decrypt_aesni_cmp_tag_loop:
+ movzx eax, BYTE PTR [esp+ecx]
+ xor al, BYTE PTR [esi+ecx]
+ or bl, al
+ inc ecx
+ cmp ecx, ebp
+ jne L_AES_GCM_decrypt_aesni_cmp_tag_loop
+ cmp bl, 0
+ sete bl
+ add esp, 16
+ xor ecx, ecx
+ jmp L_AES_GCM_decrypt_aesni_cmp_tag_done
+L_AES_GCM_decrypt_aesni_cmp_tag_16:
+ movdqu xmm5, OWORD PTR [esi]
+ pcmpeqb xmm4, xmm5
+ pmovmskb edx, xmm4
+ ; %%edx == 0xFFFF then return 1 else => return 0
+ xor ebx, ebx
+ cmp edx, 65535
+ sete bl
+L_AES_GCM_decrypt_aesni_cmp_tag_done:
+ mov DWORD PTR [edi], ebx
+ add esp, 176
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_decrypt_aesni ENDP
+_TEXT ENDS
+IFDEF WOLFSSL_AESGCM_STREAM
+_TEXT SEGMENT READONLY PARA
+AES_GCM_init_aesni PROC ; GCM init: from the AES key schedule and the IV, compute H = E(0), the starting counter block and T = E(initial counter)
+ push ebx ; save ebx/esi/edi/ebp; restored in reverse order before ret
+ push esi
+ push edi
+ push ebp
+ sub esp, 16 ; 16 bytes of stack; with 4 pushes + return address the first argument sits at [esp+36]
+ mov ebp, DWORD PTR [esp+36] ; arg 1: base of the AES round keys (16 bytes each, read with aligned loads)
+ mov esi, DWORD PTR [esp+44] ; arg 3: pointer to the IV bytes
+ mov edi, DWORD PTR [esp+60] ; arg 7: output pointer that receives the encrypted initial counter (T)
+ pxor xmm4, xmm4 ; xmm4 = 0: counter block (12-byte path) or GHASH accumulator (other path)
+ mov edx, DWORD PTR [esp+48] ; arg 4: IV length in bytes
+ cmp edx, 12
+ jne L_AES_GCM_init_aesni_iv_not_12
+ ; Fast path: IV is exactly 12 bytes, so the counter block is IV || 00 00 00 01
+ ; Set counter based on IV
+ mov ecx, 16777216 ; 0x01000000: stored little-endian this is bytes 00 00 00 01
+ pinsrd xmm4, DWORD PTR [esi], 0
+ pinsrd xmm4, DWORD PTR [esi+4], 1
+ pinsrd xmm4, DWORD PTR [esi+8], 2
+ pinsrd xmm4, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter (both blocks run through the rounds together)
+ movdqa xmm1, xmm4
+ movdqa xmm5, OWORD PTR [ebp] ; zero block XOR round key 0 is just round key 0
+ pxor xmm1, xmm5 ; counter block XOR round key 0
+ movdqa xmm7, OWORD PTR [ebp+16] ; rounds 1..9: xmm7 holds the round key shared by both blocks
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ movdqa xmm7, OWORD PTR [ebp+32]
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ movdqa xmm7, OWORD PTR [ebp+48]
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ movdqa xmm7, OWORD PTR [ebp+64]
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ movdqa xmm7, OWORD PTR [ebp+80]
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ movdqa xmm7, OWORD PTR [ebp+96]
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ movdqa xmm7, OWORD PTR [ebp+112]
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ movdqa xmm7, OWORD PTR [ebp+128]
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ movdqa xmm7, OWORD PTR [ebp+144]
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ cmp DWORD PTR [esp+40], 11 ; arg 2: number of AES rounds; fewer than 11 means key 160 is the last round key
+ movdqa xmm7, OWORD PTR [ebp+160] ; movdqa does not modify EFLAGS, so the jl below still tests the cmp above
+ jl L_AES_GCM_init_aesni_calc_iv_12_last
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ movdqa xmm7, OWORD PTR [ebp+176]
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ cmp DWORD PTR [esp+40], 13 ; fewer than 13 rounds means key 192 is the last round key
+ movdqa xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_init_aesni_calc_iv_12_last
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ movdqa xmm7, OWORD PTR [ebp+208]
+ aesenc xmm5, xmm7
+ aesenc xmm1, xmm7
+ movdqa xmm7, OWORD PTR [ebp+224] ; otherwise key 224 is the last round key
+L_AES_GCM_init_aesni_calc_iv_12_last:
+ aesenclast xmm5, xmm7 ; xmm5 = H
+ aesenclast xmm1, xmm7 ; xmm1 = T
+ pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask ; byte shuffle of H; mask table is defined outside this PROC
+ movdqu OWORD PTR [edi], xmm1 ; write T through arg 7 (unaligned store)
+ jmp L_AES_GCM_init_aesni_iv_done
+L_AES_GCM_init_aesni_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ movdqa xmm5, OWORD PTR [ebp] ; zero block XOR round key 0
+ aesenc xmm5, [ebp+16]
+ aesenc xmm5, [ebp+32]
+ aesenc xmm5, [ebp+48]
+ aesenc xmm5, [ebp+64]
+ aesenc xmm5, [ebp+80]
+ aesenc xmm5, [ebp+96]
+ aesenc xmm5, [ebp+112]
+ aesenc xmm5, [ebp+128]
+ aesenc xmm5, [ebp+144]
+ cmp DWORD PTR [esp+40], 11 ; same round-count dispatch as the 12-byte path
+ movdqa xmm1, OWORD PTR [ebp+160]
+ jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
+ aesenc xmm5, xmm1
+ aesenc xmm5, [ebp+176]
+ cmp DWORD PTR [esp+40], 13
+ movdqa xmm1, OWORD PTR [ebp+192]
+ jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
+ aesenc xmm5, xmm1
+ aesenc xmm5, [ebp+208]
+ movdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last:
+ aesenclast xmm5, xmm1 ; xmm5 = H
+ pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask
+ ; Calc counter: hash the IV with H, accumulating in xmm4 (zeroed at entry)
+ ; Initialization vector
+ cmp edx, 0
+ mov ecx, 0 ; ecx = byte offset into the IV; mov leaves the flags of the cmp intact
+ je L_AES_GCM_init_aesni_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_init_aesni_calc_iv_lt16
+ and edx, 4294967280 ; 0xFFFFFFF0: round the length down to whole 16-byte blocks
+L_AES_GCM_init_aesni_calc_iv_16_loop:
+ movdqu xmm0, OWORD PTR [esi+ecx] ; next 16 IV bytes
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm0 ; accumulator ^= block
+ pshufd xmm1, xmm4, 78 ; 78 = 0x4E: swap the two 64-bit halves
+ pshufd xmm2, xmm5, 78
+ movdqa xmm3, xmm5
+ movdqa xmm0, xmm5
+ pclmulqdq xmm3, xmm4, 17 ; 0x11: high qword x high qword
+ pclmulqdq xmm0, xmm4, 0 ; low qword x low qword
+ pxor xmm1, xmm4 ; (hi ^ lo) of the accumulator
+ pxor xmm2, xmm5 ; (hi ^ lo) of H
+ pclmulqdq xmm1, xmm2, 0 ; Karatsuba middle product
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3 ; middle term = mid ^ low ^ high
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm0
+ movdqa xmm4, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm7, xmm2
+ pxor xmm4, xmm1 ; xmm4:xmm7 = 256-bit carry-less product (high:low)
+ movdqa xmm0, xmm7
+ movdqa xmm1, xmm4
+ psrld xmm0, 31 ; capture the top bit of every dword before shifting
+ psrld xmm1, 31
+ pslld xmm7, 1
+ pslld xmm4, 1
+ movdqa xmm2, xmm0
+ pslldq xmm0, 4
+ psrldq xmm2, 12 ; carry out of the low half into the high half
+ pslldq xmm1, 4
+ por xmm4, xmm2
+ por xmm7, xmm0
+ por xmm4, xmm1 ; xmm4:xmm7 shifted left by one bit as a 256-bit value
+ movdqa xmm0, xmm7 ; reduction of the low half, first phase (shifts 31/30/25)
+ movdqa xmm1, xmm7
+ movdqa xmm2, xmm7
+ pslld xmm0, 31
+ pslld xmm1, 30
+ pslld xmm2, 25
+ pxor xmm0, xmm1
+ pxor xmm0, xmm2
+ movdqa xmm1, xmm0
+ psrldq xmm1, 4
+ pslldq xmm0, 12
+ pxor xmm7, xmm0
+ movdqa xmm2, xmm7 ; reduction, second phase (shifts 1/2/7)
+ movdqa xmm3, xmm7
+ movdqa xmm0, xmm7
+ psrld xmm2, 1
+ psrld xmm3, 2
+ psrld xmm0, 7
+ pxor xmm2, xmm3
+ pxor xmm2, xmm0
+ pxor xmm2, xmm1
+ pxor xmm2, xmm7
+ pxor xmm4, xmm2 ; xmm4 = reduced 128-bit accumulator
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_init_aesni_calc_iv_16_loop
+ mov edx, DWORD PTR [esp+48] ; reload the full IV length (edx was rounded down above)
+ cmp ecx, edx
+ je L_AES_GCM_init_aesni_calc_iv_done
+L_AES_GCM_init_aesni_calc_iv_lt16:
+ sub esp, 16 ; temporary 16-byte buffer for the partial block; released before any [esp+N] argument access
+ pxor xmm0, xmm0
+ xor ebx, ebx ; ebx = index into the temporary buffer
+ movdqu OWORD PTR [esp], xmm0 ; zero the buffer so the unused tail stays zero
+L_AES_GCM_init_aesni_calc_iv_loop:
+ movzx eax, BYTE PTR [esi+ecx] ; copy the remaining IV bytes one at a time
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_init_aesni_calc_iv_loop
+ movdqu xmm0, OWORD PTR [esp]
+ add esp, 16 ; release the temporary buffer
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm0 ; accumulator ^= zero-padded final block
+ pshufd xmm1, xmm4, 78 ; same multiply-by-H, shift and reduce sequence as the 16-byte loop
+ pshufd xmm2, xmm5, 78
+ movdqa xmm3, xmm5
+ movdqa xmm0, xmm5
+ pclmulqdq xmm3, xmm4, 17
+ pclmulqdq xmm0, xmm4, 0
+ pxor xmm1, xmm4
+ pxor xmm2, xmm5
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm0
+ movdqa xmm4, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm7, xmm2
+ pxor xmm4, xmm1
+ movdqa xmm0, xmm7
+ movdqa xmm1, xmm4
+ psrld xmm0, 31
+ psrld xmm1, 31
+ pslld xmm7, 1
+ pslld xmm4, 1
+ movdqa xmm2, xmm0
+ pslldq xmm0, 4
+ psrldq xmm2, 12
+ pslldq xmm1, 4
+ por xmm4, xmm2
+ por xmm7, xmm0
+ por xmm4, xmm1
+ movdqa xmm0, xmm7
+ movdqa xmm1, xmm7
+ movdqa xmm2, xmm7
+ pslld xmm0, 31
+ pslld xmm1, 30
+ pslld xmm2, 25
+ pxor xmm0, xmm1
+ pxor xmm0, xmm2
+ movdqa xmm1, xmm0
+ psrldq xmm1, 4
+ pslldq xmm0, 12
+ pxor xmm7, xmm0
+ movdqa xmm2, xmm7
+ movdqa xmm3, xmm7
+ movdqa xmm0, xmm7
+ psrld xmm2, 1
+ psrld xmm3, 2
+ psrld xmm0, 7
+ pxor xmm2, xmm3
+ pxor xmm2, xmm0
+ pxor xmm2, xmm1
+ pxor xmm2, xmm7
+ pxor xmm4, xmm2
+L_AES_GCM_init_aesni_calc_iv_done:
+ ; Fold the IV bit length into the hash (last multiply by H), then T = Encrypt counter
+ pxor xmm0, xmm0
+ shl edx, 3 ; IV length in bytes -> bits
+ pinsrd xmm0, edx, 0
+ pxor xmm4, xmm0
+ pshufd xmm1, xmm4, 78 ; multiply by H, shift left one bit and reduce, as above
+ pshufd xmm2, xmm5, 78
+ movdqa xmm3, xmm5
+ movdqa xmm0, xmm5
+ pclmulqdq xmm3, xmm4, 17
+ pclmulqdq xmm0, xmm4, 0
+ pxor xmm1, xmm4
+ pxor xmm2, xmm5
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm0
+ movdqa xmm4, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm7, xmm2
+ pxor xmm4, xmm1
+ movdqa xmm0, xmm7
+ movdqa xmm1, xmm4
+ psrld xmm0, 31
+ psrld xmm1, 31
+ pslld xmm7, 1
+ pslld xmm4, 1
+ movdqa xmm2, xmm0
+ pslldq xmm0, 4
+ psrldq xmm2, 12
+ pslldq xmm1, 4
+ por xmm4, xmm2
+ por xmm7, xmm0
+ por xmm4, xmm1
+ movdqa xmm0, xmm7
+ movdqa xmm1, xmm7
+ movdqa xmm2, xmm7
+ pslld xmm0, 31
+ pslld xmm1, 30
+ pslld xmm2, 25
+ pxor xmm0, xmm1
+ pxor xmm0, xmm2
+ movdqa xmm1, xmm0
+ psrldq xmm1, 4
+ pslldq xmm0, 12
+ pxor xmm7, xmm0
+ movdqa xmm2, xmm7
+ movdqa xmm3, xmm7
+ movdqa xmm0, xmm7
+ psrld xmm2, 1
+ psrld xmm3, 2
+ psrld xmm0, 7
+ pxor xmm2, xmm3
+ pxor xmm2, xmm0
+ pxor xmm2, xmm1
+ pxor xmm2, xmm7
+ pxor xmm4, xmm2
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask ; xmm4 = counter block derived from the hashed IV
+ ; Encrypt counter
+ movdqa xmm0, OWORD PTR [ebp]
+ pxor xmm0, xmm4 ; counter XOR round key 0 (xmm4 itself is kept for the output below)
+ aesenc xmm0, [ebp+16]
+ aesenc xmm0, [ebp+32]
+ aesenc xmm0, [ebp+48]
+ aesenc xmm0, [ebp+64]
+ aesenc xmm0, [ebp+80]
+ aesenc xmm0, [ebp+96]
+ aesenc xmm0, [ebp+112]
+ aesenc xmm0, [ebp+128]
+ aesenc xmm0, [ebp+144]
+ cmp DWORD PTR [esp+40], 11
+ movdqa xmm1, OWORD PTR [ebp+160]
+ jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
+ aesenc xmm0, xmm1
+ aesenc xmm0, [ebp+176]
+ cmp DWORD PTR [esp+40], 13
+ movdqa xmm1, OWORD PTR [ebp+192]
+ jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
+ aesenc xmm0, xmm1
+ aesenc xmm0, [ebp+208]
+ movdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last:
+ aesenclast xmm0, xmm1
+ movdqu OWORD PTR [edi], xmm0 ; write T through arg 7
+L_AES_GCM_init_aesni_iv_done:
+ mov ebp, DWORD PTR [esp+52] ; arg 5: output pointer for H
+ mov edi, DWORD PTR [esp+56] ; arg 6: output pointer for the counter
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 ; shuffle the counter block, then add L_aes_gcm_one (tables defined outside this PROC)
+ paddd xmm4, OWORD PTR L_aes_gcm_one
+ movdqa OWORD PTR [ebp], xmm5 ; aligned stores: both output buffers must be 16-byte aligned
+ movdqa OWORD PTR [edi], xmm4
+ add esp, 16
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret ; plain ret: the stack arguments are left for the caller to remove
+AES_GCM_init_aesni ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_aad_update_aesni PROC ; Fold a run of 16-byte blocks into a 128-bit hash state: state = (state ^ block) * multiplier, per block
+ push esi ; two pushes + return address: first argument is at [esp+12]
+ push edi
+ mov esi, DWORD PTR [esp+12] ; arg 1: pointer to the input bytes
+ mov edx, DWORD PTR [esp+16] ; arg 2: input length in bytes (loop bound)
+ mov edi, DWORD PTR [esp+20] ; arg 3: pointer to the 16-byte hash state (read and written with aligned moves)
+ mov eax, DWORD PTR [esp+24] ; arg 4: pointer to the 16-byte multiplier (presumably the hash key H - confirm against caller)
+ movdqa xmm5, OWORD PTR [edi] ; xmm5 = running state
+ movdqa xmm6, OWORD PTR [eax] ; xmm6 = multiplier, constant for the whole loop
+ xor ecx, ecx ; ecx = byte offset into the input
+L_AES_GCM_aad_update_aesni_16_loop: ; NOTE(review): bottom-tested loop - one block is always read, so length is presumably a non-zero multiple of 16
+ movdqu xmm0, OWORD PTR [esi+ecx] ; next 16 input bytes (unaligned load)
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask ; byte shuffle; mask table is defined outside this PROC
+ pxor xmm5, xmm0 ; state ^= block
+ pshufd xmm1, xmm5, 78 ; 78 = 0x4E: swap the two 64-bit halves
+ pshufd xmm2, xmm6, 78
+ movdqa xmm3, xmm6
+ movdqa xmm0, xmm6
+ pclmulqdq xmm3, xmm5, 17 ; 0x11: high qword x high qword
+ pclmulqdq xmm0, xmm5, 0 ; low qword x low qword
+ pxor xmm1, xmm5 ; (hi ^ lo) of the state
+ pxor xmm2, xmm6 ; (hi ^ lo) of the multiplier
+ pclmulqdq xmm1, xmm2, 0 ; Karatsuba middle product
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3 ; middle term = mid ^ low ^ high
+ movdqa xmm2, xmm1
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm4, xmm2
+ pxor xmm5, xmm1 ; xmm5:xmm4 = 256-bit carry-less product (high:low)
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ psrld xmm0, 31 ; capture the top bit of every dword before shifting
+ psrld xmm1, 31
+ pslld xmm4, 1
+ pslld xmm5, 1
+ movdqa xmm2, xmm0
+ pslldq xmm0, 4
+ psrldq xmm2, 12 ; carry out of the low half into the high half
+ pslldq xmm1, 4
+ por xmm5, xmm2
+ por xmm4, xmm0
+ por xmm5, xmm1 ; xmm5:xmm4 shifted left by one bit as a 256-bit value
+ movdqa xmm0, xmm4 ; reduction of the low half, first phase (shifts 31/30/25)
+ movdqa xmm1, xmm4
+ movdqa xmm2, xmm4
+ pslld xmm0, 31
+ pslld xmm1, 30
+ pslld xmm2, 25
+ pxor xmm0, xmm1
+ pxor xmm0, xmm2
+ movdqa xmm1, xmm0
+ psrldq xmm1, 4
+ pslldq xmm0, 12
+ pxor xmm4, xmm0
+ movdqa xmm2, xmm4 ; reduction, second phase (shifts 1/2/7)
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm4
+ psrld xmm2, 1
+ psrld xmm3, 2
+ psrld xmm0, 7
+ pxor xmm2, xmm3
+ pxor xmm2, xmm0
+ pxor xmm2, xmm1
+ pxor xmm2, xmm4
+ pxor xmm5, xmm2 ; xmm5 = reduced 128-bit state for the next iteration
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_aad_update_aesni_16_loop ; signed compare: continue while offset < length
+ movdqa OWORD PTR [edi], xmm5 ; write the updated state back through arg 3
+ pop edi
+ pop esi
+ ret ; plain ret: the stack arguments are left for the caller to remove
+AES_GCM_aad_update_aesni ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_block_aesni PROC ; Counter-mode encrypt of one 16-byte block: out = in ^ E(counter); the stored counter is advanced by L_aes_gcm_one
+ push esi ; two pushes + return address: first argument is at [esp+12]
+ push edi
+ mov ecx, DWORD PTR [esp+12] ; arg 1: base of the AES round keys (16 bytes each)
+ mov eax, DWORD PTR [esp+16] ; arg 2: number of AES rounds (compared against 11 and 13 below)
+ mov edi, DWORD PTR [esp+20] ; arg 3: output pointer
+ mov esi, DWORD PTR [esp+24] ; arg 4: input pointer
+ mov edx, DWORD PTR [esp+28] ; arg 5: pointer to the 16-byte counter (read and written)
+ movdqu xmm0, OWORD PTR [edx] ; current counter
+ movdqa xmm1, xmm0
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64 ; shuffle the counter into the form that gets encrypted (table defined outside this PROC)
+ paddd xmm1, OWORD PTR L_aes_gcm_one ; next counter value
+ pxor xmm0, [ecx] ; XOR with round key 0
+ movdqu OWORD PTR [edx], xmm1 ; store the advanced counter back
+ aesenc xmm0, [ecx+16] ; rounds 1..9, round keys used directly as memory operands
+ aesenc xmm0, [ecx+32]
+ aesenc xmm0, [ecx+48]
+ aesenc xmm0, [ecx+64]
+ aesenc xmm0, [ecx+80]
+ aesenc xmm0, [ecx+96]
+ aesenc xmm0, [ecx+112]
+ aesenc xmm0, [ecx+128]
+ aesenc xmm0, [ecx+144]
+ cmp eax, 11 ; fewer than 11 rounds: key 160 is the last round key
+ movdqa xmm1, OWORD PTR [ecx+160] ; movdqa does not modify EFLAGS, so the jl below still tests the cmp above
+ jl L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
+ aesenc xmm0, xmm1
+ aesenc xmm0, [ecx+176]
+ cmp eax, 13 ; fewer than 13 rounds: key 192 is the last round key
+ movdqa xmm1, OWORD PTR [ecx+192]
+ jl L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
+ aesenc xmm0, xmm1
+ aesenc xmm0, [ecx+208]
+ movdqa xmm1, OWORD PTR [ecx+224] ; otherwise key 224 is the last round key
+L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last:
+ aesenclast xmm0, xmm1 ; xmm0 = encrypted counter (keystream block)
+ movdqu xmm1, OWORD PTR [esi] ; 16 input bytes
+ pxor xmm0, xmm1 ; output = input ^ keystream
+ movdqu OWORD PTR [edi], xmm0
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask ; NOTE(review): result stays in xmm0 and is not stored here - presumably left over from the generator; confirm nothing relies on it
+ pop edi
+ pop esi
+ ret ; plain ret: the stack arguments are left for the caller to remove
+AES_GCM_encrypt_block_aesni ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_ghash_block_aesni PROC ; Fold one 16-byte block into a 128-bit hash state: state = (state ^ block) * multiplier
+ mov edx, DWORD PTR [esp+4] ; arg 1: pointer to the 16-byte data block (no pushes, so args start at [esp+4])
+ mov eax, DWORD PTR [esp+8] ; arg 2: pointer to the 16-byte hash state (read and written with aligned moves)
+ mov ecx, DWORD PTR [esp+12] ; arg 3: pointer to the 16-byte multiplier (presumably the hash key H - confirm against caller)
+ movdqa xmm4, OWORD PTR [eax] ; xmm4 = current state
+ movdqa xmm5, OWORD PTR [ecx] ; xmm5 = multiplier
+ movdqu xmm0, OWORD PTR [edx] ; data block (unaligned load)
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask ; byte shuffle; mask table is defined outside this PROC
+ pxor xmm4, xmm0 ; state ^= block
+ pshufd xmm1, xmm4, 78 ; 78 = 0x4E: swap the two 64-bit halves
+ pshufd xmm2, xmm5, 78
+ movdqa xmm3, xmm5
+ movdqa xmm0, xmm5
+ pclmulqdq xmm3, xmm4, 17 ; 0x11: high qword x high qword
+ pclmulqdq xmm0, xmm4, 0 ; low qword x low qword
+ pxor xmm1, xmm4 ; (hi ^ lo) of the state
+ pxor xmm2, xmm5 ; (hi ^ lo) of the multiplier
+ pclmulqdq xmm1, xmm2, 0 ; Karatsuba middle product
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3 ; middle term = mid ^ low ^ high
+ movdqa xmm2, xmm1
+ movdqa xmm6, xmm0
+ movdqa xmm4, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm6, xmm2
+ pxor xmm4, xmm1 ; xmm4:xmm6 = 256-bit carry-less product (high:low)
+ movdqa xmm0, xmm6
+ movdqa xmm1, xmm4
+ psrld xmm0, 31 ; capture the top bit of every dword before shifting
+ psrld xmm1, 31
+ pslld xmm6, 1
+ pslld xmm4, 1
+ movdqa xmm2, xmm0
+ pslldq xmm0, 4
+ psrldq xmm2, 12 ; carry out of the low half into the high half
+ pslldq xmm1, 4
+ por xmm4, xmm2
+ por xmm6, xmm0
+ por xmm4, xmm1 ; xmm4:xmm6 shifted left by one bit as a 256-bit value
+ movdqa xmm0, xmm6 ; reduction of the low half, first phase (shifts 31/30/25)
+ movdqa xmm1, xmm6
+ movdqa xmm2, xmm6
+ pslld xmm0, 31
+ pslld xmm1, 30
+ pslld xmm2, 25
+ pxor xmm0, xmm1
+ pxor xmm0, xmm2
+ movdqa xmm1, xmm0
+ psrldq xmm1, 4
+ pslldq xmm0, 12
+ pxor xmm6, xmm0
+ movdqa xmm2, xmm6 ; reduction, second phase (shifts 1/2/7)
+ movdqa xmm3, xmm6
+ movdqa xmm0, xmm6
+ psrld xmm2, 1
+ psrld xmm3, 2
+ psrld xmm0, 7
+ pxor xmm2, xmm3
+ pxor xmm2, xmm0
+ pxor xmm2, xmm1
+ pxor xmm2, xmm6
+ pxor xmm4, xmm2 ; xmm4 = reduced 128-bit state
+ movdqa OWORD PTR [eax], xmm4 ; write the updated state back through arg 2
+ ret ; plain ret: the stack arguments are left for the caller to remove
+AES_GCM_ghash_block_aesni ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_update_aesni PROC
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 96
+ mov esi, DWORD PTR [esp+144]
+ movdqa xmm4, OWORD PTR [esi]
+ movdqu OWORD PTR [esp+64], xmm4
+ mov esi, DWORD PTR [esp+136]
+ mov ebp, DWORD PTR [esp+140]
+ movdqa xmm6, OWORD PTR [esi]
+ movdqa xmm5, OWORD PTR [ebp]
+ movdqu OWORD PTR [esp+80], xmm6
+ mov ebp, DWORD PTR [esp+116]
+ mov edi, DWORD PTR [esp+124]
+ mov esi, DWORD PTR [esp+128]
+ movdqa xmm1, xmm5
+ movdqa xmm0, xmm5
+ psrlq xmm1, 63
+ psllq xmm0, 1
+ pslldq xmm1, 8
+ por xmm0, xmm1
+ pshufd xmm5, xmm5, 255
+ psrad xmm5, 31
+ pand xmm5, OWORD PTR L_aes_gcm_mod2_128
+ pxor xmm5, xmm0
+ xor ebx, ebx
+ cmp DWORD PTR [esp+132], 64
+ mov eax, DWORD PTR [esp+132]
+ jl L_AES_GCM_encrypt_update_aesni_done_64
+ and eax, 4294967232
+ movdqa xmm2, xmm6
+ ; H ^ 1
+ movdqu OWORD PTR [esp], xmm5
+ ; H ^ 2
+ pshufd xmm1, xmm5, 78
+ pshufd xmm2, xmm5, 78
+ movdqa xmm3, xmm5
+ movdqa xmm0, xmm5
+ pclmulqdq xmm3, xmm5, 17
+ pclmulqdq xmm0, xmm5, 0
+ pxor xmm1, xmm5
+ pxor xmm2, xmm5
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm4, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm0, xmm2
+ pxor xmm4, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslld xmm1, 31
+ pslld xmm2, 30
+ pslld xmm3, 25
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ movdqa xmm3, xmm1
+ psrldq xmm3, 4
+ pslldq xmm1, 12
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psrld xmm1, 1
+ psrld xmm2, 2
+ pxor xmm1, xmm2
+ pxor xmm1, xmm0
+ psrld xmm0, 7
+ pxor xmm1, xmm3
+ pxor xmm1, xmm0
+ pxor xmm4, xmm1
+ movdqu OWORD PTR [esp+16], xmm4
+ ; H ^ 3
+ pshufd xmm1, xmm5, 78
+ pshufd xmm2, xmm4, 78
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm4
+ pclmulqdq xmm3, xmm5, 17
+ pclmulqdq xmm0, xmm5, 0
+ pxor xmm1, xmm5
+ pxor xmm2, xmm4
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm0, xmm2
+ pxor xmm7, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslld xmm1, 31
+ pslld xmm2, 30
+ pslld xmm3, 25
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ movdqa xmm3, xmm1
+ psrldq xmm3, 4
+ pslldq xmm1, 12
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psrld xmm1, 1
+ psrld xmm2, 2
+ pxor xmm1, xmm2
+ pxor xmm1, xmm0
+ psrld xmm0, 7
+ pxor xmm1, xmm3
+ pxor xmm1, xmm0
+ pxor xmm7, xmm1
+ movdqu OWORD PTR [esp+32], xmm7
+ ; H ^ 4
+ pshufd xmm1, xmm4, 78
+ pshufd xmm2, xmm4, 78
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm4
+ pclmulqdq xmm3, xmm4, 17
+ pclmulqdq xmm0, xmm4, 0
+ pxor xmm1, xmm4
+ pxor xmm2, xmm4
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm0, xmm2
+ pxor xmm7, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslld xmm1, 31
+ pslld xmm2, 30
+ pslld xmm3, 25
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ movdqa xmm3, xmm1
+ psrldq xmm3, 4
+ pslldq xmm1, 12
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psrld xmm1, 1
+ psrld xmm2, 2
+ pxor xmm1, xmm2
+ pxor xmm1, xmm0
+ psrld xmm0, 7
+ pxor xmm1, xmm3
+ pxor xmm1, xmm0
+ pxor xmm7, xmm1
+ movdqu OWORD PTR [esp+48], xmm7
+ ; First 64 bytes of input
+ ; Encrypt 64 bytes of counter
+ movdqu xmm0, OWORD PTR [esp+64]
+ movdqu xmm7, xmm0
+ paddd xmm7, OWORD PTR L_aes_gcm_four
+ movdqu OWORD PTR [esp+64], xmm7
+ movdqa xmm7, OWORD PTR L_aes_gcm_bswap_epi64
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pshufb xmm0, xmm7
+ paddd xmm1, OWORD PTR L_aes_gcm_one
+ pshufb xmm1, xmm7
+ paddd xmm2, OWORD PTR L_aes_gcm_two
+ pshufb xmm2, xmm7
+ paddd xmm3, OWORD PTR L_aes_gcm_three
+ pshufb xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp]
+ pxor xmm0, xmm7
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+16]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+32]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+48]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+64]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+80]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+96]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+112]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+128]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+144]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ cmp DWORD PTR [esp+120], 11
+ movdqa xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_update_aesni_enc_done
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+176]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ cmp DWORD PTR [esp+120], 13
+ movdqa xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_update_aesni_enc_done
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+208]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_aesni_enc_done:
+ aesenclast xmm0, xmm7
+ aesenclast xmm1, xmm7
+ movdqu xmm4, OWORD PTR [esi]
+ movdqu xmm5, OWORD PTR [esi+16]
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ movdqu OWORD PTR [edi], xmm0
+ movdqu OWORD PTR [edi+16], xmm1
+ aesenclast xmm2, xmm7
+ aesenclast xmm3, xmm7
+ movdqu xmm4, OWORD PTR [esi+32]
+ movdqu xmm5, OWORD PTR [esi+48]
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+ movdqu OWORD PTR [edi+32], xmm2
+ movdqu OWORD PTR [edi+48], xmm3
+ cmp eax, 64
+ mov ebx, 64
+ mov ecx, esi
+ mov edx, edi
+ jle L_AES_GCM_encrypt_update_aesni_end_64
+ ; More 64 bytes of input
+L_AES_GCM_encrypt_update_aesni_ghash_64:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; Encrypt 64 bytes of counter
+ movdqu xmm0, OWORD PTR [esp+64]
+ movdqu xmm7, xmm0
+ paddd xmm7, OWORD PTR L_aes_gcm_four
+ movdqu OWORD PTR [esp+64], xmm7
+ movdqa xmm7, OWORD PTR L_aes_gcm_bswap_epi64
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pshufb xmm0, xmm7
+ paddd xmm1, OWORD PTR L_aes_gcm_one
+ pshufb xmm1, xmm7
+ paddd xmm2, OWORD PTR L_aes_gcm_two
+ pshufb xmm2, xmm7
+ paddd xmm3, OWORD PTR L_aes_gcm_three
+ pshufb xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp]
+ pxor xmm0, xmm7
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+16]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+32]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+48]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+64]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+80]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+96]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+112]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+128]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+144]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ cmp DWORD PTR [esp+120], 11
+ movdqa xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+176]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ cmp DWORD PTR [esp+120], 13
+ movdqa xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+208]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done:
+ aesenclast xmm0, xmm7
+ aesenclast xmm1, xmm7
+ movdqu xmm4, OWORD PTR [ecx]
+ movdqu xmm5, OWORD PTR [ecx+16]
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ movdqu OWORD PTR [edx], xmm0
+ movdqu OWORD PTR [edx+16], xmm1
+ aesenclast xmm2, xmm7
+ aesenclast xmm3, xmm7
+ movdqu xmm4, OWORD PTR [ecx+32]
+ movdqu xmm5, OWORD PTR [ecx+48]
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+ movdqu OWORD PTR [edx+32], xmm2
+ movdqu OWORD PTR [edx+48], xmm3
+ ; ghash encrypted counter
+ movdqu xmm2, OWORD PTR [esp+80]
+ movdqu xmm7, OWORD PTR [esp+48]
+ movdqu xmm0, OWORD PTR [edx+-64]
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm2
+ pshufd xmm1, xmm7, 78
+ pshufd xmm5, xmm0, 78
+ pxor xmm1, xmm7
+ pxor xmm5, xmm0
+ movdqa xmm3, xmm0
+ pclmulqdq xmm3, xmm7, 17
+ movdqa xmm2, xmm0
+ pclmulqdq xmm2, xmm7, 0
+ pclmulqdq xmm1, xmm5, 0
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ movdqu xmm7, OWORD PTR [esp+32]
+ movdqu xmm0, OWORD PTR [edx+-48]
+ pshufd xmm4, xmm7, 78
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm7
+ pshufd xmm5, xmm0, 78
+ pxor xmm5, xmm0
+ movdqa xmm6, xmm0
+ pclmulqdq xmm6, xmm7, 17
+ pclmulqdq xmm7, xmm0, 0
+ pclmulqdq xmm4, xmm5, 0
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm1, xmm6
+ pxor xmm3, xmm6
+ pxor xmm1, xmm4
+ movdqu xmm7, OWORD PTR [esp+16]
+ movdqu xmm0, OWORD PTR [edx+-32]
+ pshufd xmm4, xmm7, 78
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm7
+ pshufd xmm5, xmm0, 78
+ pxor xmm5, xmm0
+ movdqa xmm6, xmm0
+ pclmulqdq xmm6, xmm7, 17
+ pclmulqdq xmm7, xmm0, 0
+ pclmulqdq xmm4, xmm5, 0
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm1, xmm6
+ pxor xmm3, xmm6
+ pxor xmm1, xmm4
+ movdqu xmm7, OWORD PTR [esp]
+ movdqu xmm0, OWORD PTR [edx+-16]
+ pshufd xmm4, xmm7, 78
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm7
+ pshufd xmm5, xmm0, 78
+ pxor xmm5, xmm0
+ movdqa xmm6, xmm0
+ pclmulqdq xmm6, xmm7, 17
+ pclmulqdq xmm7, xmm0, 0
+ pclmulqdq xmm4, xmm5, 0
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm1, xmm6
+ pxor xmm3, xmm6
+ pxor xmm1, xmm4
+ movdqa xmm5, xmm1
+ psrldq xmm1, 8
+ pslldq xmm5, 8
+ pxor xmm2, xmm5
+ pxor xmm3, xmm1
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm2
+ pslld xmm7, 31
+ pslld xmm4, 30
+ pslld xmm5, 25
+ pxor xmm7, xmm4
+ pxor xmm7, xmm5
+ movdqa xmm4, xmm7
+ pslldq xmm7, 12
+ psrldq xmm4, 4
+ pxor xmm2, xmm7
+ movdqa xmm5, xmm2
+ movdqa xmm1, xmm2
+ movdqa xmm0, xmm2
+ psrld xmm5, 1
+ psrld xmm1, 2
+ psrld xmm0, 7
+ pxor xmm5, xmm1
+ pxor xmm5, xmm0
+ pxor xmm5, xmm4
+ pxor xmm2, xmm5
+ pxor xmm2, xmm3
+ movdqu OWORD PTR [esp+80], xmm2
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_update_aesni_ghash_64
+L_AES_GCM_encrypt_update_aesni_end_64:
+ movdqu xmm6, OWORD PTR [esp+80]
+ ; Block 1
+ movdqa xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm5, OWORD PTR [edx]
+ pshufb xmm5, xmm0
+ movdqu xmm7, OWORD PTR [esp+48]
+ pxor xmm5, xmm6
+ pshufd xmm1, xmm5, 78
+ pshufd xmm2, xmm7, 78
+ movdqa xmm3, xmm7
+ movdqa xmm0, xmm7
+ pclmulqdq xmm3, xmm5, 17
+ pclmulqdq xmm0, xmm5, 0
+ pxor xmm1, xmm5
+ pxor xmm2, xmm7
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm4, xmm0
+ movdqa xmm6, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm4, xmm2
+ pxor xmm6, xmm1
+ ; Block 2
+ movdqa xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm5, OWORD PTR [edx+16]
+ pshufb xmm5, xmm0
+ movdqu xmm7, OWORD PTR [esp+32]
+ pshufd xmm1, xmm5, 78
+ pshufd xmm2, xmm7, 78
+ movdqa xmm3, xmm7
+ movdqa xmm0, xmm7
+ pclmulqdq xmm3, xmm5, 17
+ pclmulqdq xmm0, xmm5, 0
+ pxor xmm1, xmm5
+ pxor xmm2, xmm7
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ pxor xmm4, xmm0
+ pxor xmm6, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm4, xmm2
+ pxor xmm6, xmm1
+ ; Block 3
+ movdqa xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm5, OWORD PTR [edx+32]
+ pshufb xmm5, xmm0
+ movdqu xmm7, OWORD PTR [esp+16]
+ pshufd xmm1, xmm5, 78
+ pshufd xmm2, xmm7, 78
+ movdqa xmm3, xmm7
+ movdqa xmm0, xmm7
+ pclmulqdq xmm3, xmm5, 17
+ pclmulqdq xmm0, xmm5, 0
+ pxor xmm1, xmm5
+ pxor xmm2, xmm7
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ pxor xmm4, xmm0
+ pxor xmm6, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm4, xmm2
+ pxor xmm6, xmm1
+ ; Block 4
+ movdqa xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm5, OWORD PTR [edx+48]
+ pshufb xmm5, xmm0
+ movdqu xmm7, OWORD PTR [esp]
+ pshufd xmm1, xmm5, 78
+ pshufd xmm2, xmm7, 78
+ movdqa xmm3, xmm7
+ movdqa xmm0, xmm7
+ pclmulqdq xmm3, xmm5, 17
+ pclmulqdq xmm0, xmm5, 0
+ pxor xmm1, xmm5
+ pxor xmm2, xmm7
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ pxor xmm4, xmm0
+ pxor xmm6, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm4, xmm2
+ pxor xmm6, xmm1
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm4
+ movdqa xmm2, xmm4
+ pslld xmm0, 31
+ pslld xmm1, 30
+ pslld xmm2, 25
+ pxor xmm0, xmm1
+ pxor xmm0, xmm2
+ movdqa xmm1, xmm0
+ psrldq xmm1, 4
+ pslldq xmm0, 12
+ pxor xmm4, xmm0
+ movdqa xmm2, xmm4
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm4
+ psrld xmm2, 1
+ psrld xmm3, 2
+ psrld xmm0, 7
+ pxor xmm2, xmm3
+ pxor xmm2, xmm0
+ pxor xmm2, xmm1
+ pxor xmm2, xmm4
+ pxor xmm6, xmm2
+ movdqu xmm5, OWORD PTR [esp]
+L_AES_GCM_encrypt_update_aesni_done_64:
+ mov edx, DWORD PTR [esp+132]
+ cmp ebx, edx
+ jge L_AES_GCM_encrypt_update_aesni_done_enc
+ mov eax, DWORD PTR [esp+132]
+ and eax, 4294967280
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_update_aesni_last_block_done
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ movdqu xmm0, OWORD PTR [esp+64]
+ movdqa xmm1, xmm0
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64
+ paddd xmm1, OWORD PTR L_aes_gcm_one
+ pxor xmm0, [ebp]
+ movdqu OWORD PTR [esp+64], xmm1
+ aesenc xmm0, [ebp+16]
+ aesenc xmm0, [ebp+32]
+ aesenc xmm0, [ebp+48]
+ aesenc xmm0, [ebp+64]
+ aesenc xmm0, [ebp+80]
+ aesenc xmm0, [ebp+96]
+ aesenc xmm0, [ebp+112]
+ aesenc xmm0, [ebp+128]
+ aesenc xmm0, [ebp+144]
+ cmp DWORD PTR [esp+120], 11
+ movdqa xmm1, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
+ aesenc xmm0, xmm1
+ aesenc xmm0, [ebp+176]
+ cmp DWORD PTR [esp+120], 13
+ movdqa xmm1, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
+ aesenc xmm0, xmm1
+ aesenc xmm0, [ebp+208]
+ movdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last:
+ aesenclast xmm0, xmm1
+ movdqu xmm1, OWORD PTR [ecx]
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [edx], xmm0
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm6, xmm0
+ add ebx, 16
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_update_aesni_last_block_ghash
+L_AES_GCM_encrypt_update_aesni_last_block_start:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ movdqu xmm0, OWORD PTR [esp+64]
+ movdqa xmm1, xmm0
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64
+ paddd xmm1, OWORD PTR L_aes_gcm_one
+ pxor xmm0, [ebp]
+ movdqu OWORD PTR [esp+64], xmm1
+ movdqu xmm4, xmm6
+ pclmulqdq xmm4, xmm5, 16
+ aesenc xmm0, [ebp+16]
+ aesenc xmm0, [ebp+32]
+ movdqu xmm7, xmm6
+ pclmulqdq xmm7, xmm5, 1
+ aesenc xmm0, [ebp+48]
+ aesenc xmm0, [ebp+64]
+ aesenc xmm0, [ebp+80]
+ movdqu xmm1, xmm6
+ pclmulqdq xmm1, xmm5, 17
+ aesenc xmm0, [ebp+96]
+ pxor xmm4, xmm7
+ movdqa xmm2, xmm4
+ psrldq xmm4, 8
+ pslldq xmm2, 8
+ aesenc xmm0, [ebp+112]
+ movdqu xmm7, xmm6
+ pclmulqdq xmm7, xmm5, 0
+ pxor xmm2, xmm7
+ pxor xmm1, xmm4
+ movdqa xmm3, OWORD PTR L_aes_gcm_mod2_128
+ movdqa xmm7, xmm2
+ pclmulqdq xmm7, xmm3, 16
+ aesenc xmm0, [ebp+128]
+ pshufd xmm4, xmm2, 78
+ pxor xmm4, xmm7
+ movdqa xmm7, xmm4
+ pclmulqdq xmm7, xmm3, 16
+ aesenc xmm0, [ebp+144]
+ pshufd xmm6, xmm4, 78
+ pxor xmm6, xmm7
+ pxor xmm6, xmm1
+ cmp DWORD PTR [esp+120], 11
+ movdqa xmm1, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
+ aesenc xmm0, xmm1
+ aesenc xmm0, [ebp+176]
+ cmp DWORD PTR [esp+120], 13
+ movdqa xmm1, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
+ aesenc xmm0, xmm1
+ aesenc xmm0, [ebp+208]
+ movdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last:
+ aesenclast xmm0, xmm1
+ movdqu xmm1, OWORD PTR [ecx]
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [edx], xmm0
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm6, xmm0
+ add ebx, 16
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_update_aesni_last_block_start
+L_AES_GCM_encrypt_update_aesni_last_block_ghash:
+ pshufd xmm1, xmm5, 78
+ pshufd xmm2, xmm6, 78
+ movdqa xmm3, xmm6
+ movdqa xmm0, xmm6
+ pclmulqdq xmm3, xmm5, 17
+ pclmulqdq xmm0, xmm5, 0
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm6, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm0, xmm2
+ pxor xmm6, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslld xmm1, 31
+ pslld xmm2, 30
+ pslld xmm3, 25
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ movdqa xmm3, xmm1
+ psrldq xmm3, 4
+ pslldq xmm1, 12
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psrld xmm1, 1
+ psrld xmm2, 2
+ pxor xmm1, xmm2
+ pxor xmm1, xmm0
+ psrld xmm0, 7
+ pxor xmm1, xmm3
+ pxor xmm1, xmm0
+ pxor xmm6, xmm1
+L_AES_GCM_encrypt_update_aesni_last_block_done:
+L_AES_GCM_encrypt_update_aesni_done_enc:
+ mov esi, DWORD PTR [esp+136]
+ mov edi, DWORD PTR [esp+144]
+ movdqu xmm4, OWORD PTR [esp+64]
+ movdqa OWORD PTR [esi], xmm6
+ movdqu OWORD PTR [edi], xmm4
+ add esp, 96
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_encrypt_update_aesni ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_final_aesni PROC
+ ; Finish a GCM computation: fold a block built from two byte counts into
+ ; the running hash, multiply by the hash key once more, XOR with the block
+ ; at arg 7 and write the resulting tag bytes to arg 2.
+ ;
+ ; Stack arguments (offsets are valid after the prologue below:
+ ; 3 pushes + 16 bytes of scratch + return address = 32 bytes):
+ ;   [esp+32] arg 1: pointer to 16-byte running hash state (read with
+ ;                   movdqa, so it must be 16-byte aligned)
+ ;   [esp+36] arg 2: destination pointer for the tag
+ ;   [esp+40] arg 3: number of tag bytes to write
+ ;   [esp+44] arg 4: 32-bit byte count placed in the low 64 bits of the
+ ;                   length block
+ ;   [esp+48] arg 5: 32-bit byte count placed in the high 64 bits of the
+ ;                   length block
+ ;   [esp+52] arg 6: pointer to 16-byte hash key H (movdqa, aligned)
+ ;   [esp+56] arg 7: pointer to 16-byte block XORed into the final tag
+ ;                   (movdqa, aligned)
+ ; NOTE(review): args 4/5 are presumably the ciphertext and AAD lengths and
+ ; arg 7 the encrypted initial counter - confirm against the C prototype.
+ ;
+ ; Preserves esi, edi and ebp; uses eax, ecx, edx and xmm0-xmm6 as scratch.
+ push esi
+ push edi
+ push ebp
+ sub esp, 16
+ mov ebp, DWORD PTR [esp+32]
+ mov esi, DWORD PTR [esp+52]
+ mov edi, DWORD PTR [esp+56]
+ ; xmm4 = running hash, xmm5 = H, xmm6 = block from arg 7.
+ movdqa xmm4, OWORD PTR [ebp]
+ movdqa xmm5, OWORD PTR [esi]
+ movdqa xmm6, OWORD PTR [edi]
+ ; Shift H left by one bit as a 128-bit value: shift each qword left by 1
+ ; and carry bit 63 of the low qword into bit 0 of the high qword.
+ movdqa xmm1, xmm5
+ movdqa xmm0, xmm5
+ psrlq xmm1, 63
+ psllq xmm0, 1
+ pslldq xmm1, 8
+ por xmm0, xmm1
+ ; Broadcast the top dword of H and arithmetic-shift it to get an all-ones
+ ; mask when the bit shifted out was set; that mask selects the
+ ; L_aes_gcm_mod2_128 constant, which is then XORed into the shifted H.
+ pshufd xmm5, xmm5, 255
+ psrad xmm5, 31
+ pand xmm5, OWORD PTR L_aes_gcm_mod2_128
+ pxor xmm5, xmm0
+ ; Build the length block in xmm0: each 32-bit byte count is converted to
+ ; a 64-bit bit count (low dword = count << 3, high dword = count >> 29).
+ ; All four dwords of xmm0 are overwritten.
+ mov edx, DWORD PTR [esp+44]
+ mov ecx, DWORD PTR [esp+48]
+ shl edx, 3
+ shl ecx, 3
+ pinsrd xmm0, edx, 0
+ pinsrd xmm0, ecx, 2
+ mov edx, DWORD PTR [esp+44]
+ mov ecx, DWORD PTR [esp+48]
+ shr edx, 29
+ shr ecx, 29
+ pinsrd xmm0, edx, 1
+ pinsrd xmm0, ecx, 3
+ ; Fold the length block into the running hash.
+ pxor xmm4, xmm0
+ ; Carry-less multiply xmm4 by xmm5 with three PCLMULQDQs:
+ ; xmm3 = high*high, xmm0 = low*low, xmm1 = (hi^lo of xmm5)*(hi^lo of xmm4).
+ ; pshufd ..., 78 swaps the two qwords so the XOR yields hi^lo.
+ pshufd xmm1, xmm5, 78
+ pshufd xmm2, xmm4, 78
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm4
+ pclmulqdq xmm3, xmm5, 17
+ pclmulqdq xmm0, xmm5, 0
+ pxor xmm1, xmm5
+ pxor xmm2, xmm4
+ pclmulqdq xmm1, xmm2, 0
+ ; Middle term = xmm1 ^ low ^ high; split it across the 256-bit product,
+ ; leaving the low half in xmm0 and the high half in xmm4.
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm4, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm0, xmm2
+ pxor xmm4, xmm1
+ ; Reduce the 256-bit product to 128 bits using shift-and-XOR steps
+ ; (left shifts by 31/30/25, then right shifts by 1/2/7); the result is
+ ; accumulated into xmm4.
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslld xmm1, 31
+ pslld xmm2, 30
+ pslld xmm3, 25
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ movdqa xmm3, xmm1
+ psrldq xmm3, 4
+ pslldq xmm1, 12
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psrld xmm1, 1
+ psrld xmm2, 2
+ pxor xmm1, xmm2
+ pxor xmm1, xmm0
+ psrld xmm0, 7
+ pxor xmm1, xmm3
+ pxor xmm1, xmm0
+ pxor xmm4, xmm1
+ ; Byte-shuffle the hash with L_aes_gcm_bswap_mask and XOR it with the
+ ; block from arg 7 to form the tag in xmm0.
+ pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm0, xmm6
+ pxor xmm0, xmm4
+ ; edi = tag destination (arg 2). A 16-byte tag is written with a single
+ ; unaligned store; any other length is copied byte by byte from the
+ ; 16-byte scratch area at [esp].
+ mov edi, DWORD PTR [esp+36]
+ cmp DWORD PTR [esp+40], 16
+ je L_AES_GCM_encrypt_final_aesni_store_tag_16
+ xor ecx, ecx
+ movdqu OWORD PTR [esp], xmm0
+ ; NOTE(review): the count is compared only after a byte has been copied
+ ; and the exit test is "not equal", so this loop relies on the caller
+ ; passing a count in the range 1..15 here - confirm callers guarantee it.
+L_AES_GCM_encrypt_final_aesni_store_tag_loop:
+ movzx eax, BYTE PTR [esp+ecx]
+ mov BYTE PTR [edi+ecx], al
+ inc ecx
+ cmp ecx, DWORD PTR [esp+40]
+ jne L_AES_GCM_encrypt_final_aesni_store_tag_loop
+ jmp L_AES_GCM_encrypt_final_aesni_store_tag_done
+L_AES_GCM_encrypt_final_aesni_store_tag_16:
+ movdqu OWORD PTR [edi], xmm0
+L_AES_GCM_encrypt_final_aesni_store_tag_done:
+ ; Release the scratch area and restore callee-saved registers.
+ add esp, 16
+ pop ebp
+ pop edi
+ pop esi
+ ret
+AES_GCM_encrypt_final_aesni ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_update_aesni PROC
+ ; Process whole 16-byte blocks of a GCM decryption: each input block is
+ ; folded into the running hash and XORed with an AES-encrypted counter
+ ; block to produce the output. Groups of 64 bytes are handled four blocks
+ ; at a time; remaining whole blocks are handled one at a time.
+ ; Any nbytes % 16 remainder is not touched by this routine.
+ ;
+ ; Stack arguments (offsets are valid after the prologue below:
+ ; 4 pushes + 160 bytes of locals + return address = 180 bytes):
+ ;   [esp+180] arg 1: pointer to the AES round keys (16 bytes per round,
+ ;                    read with movdqa, so 16-byte aligned)
+ ;   [esp+184] arg 2: round count; < 11 ends after the key at +160,
+ ;                    < 13 after the key at +192, otherwise +224
+ ;   [esp+188] arg 3: output pointer
+ ;   [esp+192] arg 4: input pointer (may equal the output pointer)
+ ;   [esp+196] arg 5: number of input bytes
+ ;   [esp+200] arg 6: pointer to 16-byte running hash state, updated on
+ ;                    return (movdqa, aligned)
+ ;   [esp+204] arg 7: pointer to 16-byte hash key H (movdqa, aligned)
+ ;   [esp+208] arg 8: pointer to 16-byte counter block, updated on return
+ ;                    (loaded with movdqa, so aligned)
+ ;
+ ; Local stack layout:
+ ;   [esp+0]   H ^ 1   (reused as scratch by the single-block loop)
+ ;   [esp+16]  H ^ 2
+ ;   [esp+32]  H ^ 3
+ ;   [esp+48]  H ^ 4
+ ;   [esp+64]  current counter block
+ ;   [esp+80]  running hash state
+ ;   [esp+96]..[esp+159] copy of four input blocks (in-place path only)
+ ;
+ ; Preserves ebx, esi, edi and ebp; uses eax, ecx, edx and xmm0-xmm7.
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 160
+ ; Copy the counter block to its local slot.
+ mov esi, DWORD PTR [esp+208]
+ movdqa xmm4, OWORD PTR [esi]
+ movdqu OWORD PTR [esp+64], xmm4
+ ; xmm6 = running hash (also saved at [esp+80]), xmm5 = H.
+ mov esi, DWORD PTR [esp+200]
+ mov ebp, DWORD PTR [esp+204]
+ movdqa xmm6, OWORD PTR [esi]
+ movdqa xmm5, OWORD PTR [ebp]
+ movdqu OWORD PTR [esp+80], xmm6
+ ; ebp = round keys, edi = output, esi = input for the rest of the routine.
+ mov ebp, DWORD PTR [esp+180]
+ mov edi, DWORD PTR [esp+188]
+ mov esi, DWORD PTR [esp+192]
+ ; Shift H left by one bit as a 128-bit value (carry bit 63 of the low
+ ; qword into the high qword) and, when the bit shifted out was set, XOR
+ ; in the L_aes_gcm_mod2_128 constant.
+ movdqa xmm1, xmm5
+ movdqa xmm0, xmm5
+ psrlq xmm1, 63
+ psllq xmm0, 1
+ pslldq xmm1, 8
+ por xmm0, xmm1
+ pshufd xmm5, xmm5, 255
+ psrad xmm5, 31
+ pand xmm5, OWORD PTR L_aes_gcm_mod2_128
+ pxor xmm5, xmm0
+ ; ebx = number of bytes processed so far.
+ xor ebx, ebx
+ ; Fewer than 64 bytes: skip the four-block path entirely.
+ ; NOTE(review): jl is a signed comparison, so a byte count with the top
+ ; bit set also takes this branch - confirm callers never pass such sizes.
+ cmp DWORD PTR [esp+196], 64
+ mov eax, DWORD PTR [esp+196]
+ jl L_AES_GCM_decrypt_update_aesni_done_64
+ ; eax = byte count rounded down to a multiple of 64 (mask 0FFFFFFC0h).
+ and eax, 4294967232
+ ; NOTE(review): xmm2 is overwritten by the H ^ 2 computation below before
+ ; it is read; the loops reload the hash state from [esp+80].
+ movdqa xmm2, xmm6
+ ; H ^ 1
+ movdqu OWORD PTR [esp], xmm5
+ ; H ^ 2
+ ; xmm5 * xmm5: three carry-less multiplies (high*high, low*low and
+ ; (hi^lo)*(hi^lo)) followed by a shift-and-XOR reduction; result in xmm4.
+ pshufd xmm1, xmm5, 78
+ pshufd xmm2, xmm5, 78
+ movdqa xmm3, xmm5
+ movdqa xmm0, xmm5
+ pclmulqdq xmm3, xmm5, 17
+ pclmulqdq xmm0, xmm5, 0
+ pxor xmm1, xmm5
+ pxor xmm2, xmm5
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm4, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm0, xmm2
+ pxor xmm4, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslld xmm1, 31
+ pslld xmm2, 30
+ pslld xmm3, 25
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ movdqa xmm3, xmm1
+ psrldq xmm3, 4
+ pslldq xmm1, 12
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psrld xmm1, 1
+ psrld xmm2, 2
+ pxor xmm1, xmm2
+ pxor xmm1, xmm0
+ psrld xmm0, 7
+ pxor xmm1, xmm3
+ pxor xmm1, xmm0
+ pxor xmm4, xmm1
+ movdqu OWORD PTR [esp+16], xmm4
+ ; H ^ 3
+ ; xmm4 (H ^ 2) * xmm5 (H ^ 1), same multiply/reduce sequence; result in
+ ; xmm7.
+ pshufd xmm1, xmm5, 78
+ pshufd xmm2, xmm4, 78
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm4
+ pclmulqdq xmm3, xmm5, 17
+ pclmulqdq xmm0, xmm5, 0
+ pxor xmm1, xmm5
+ pxor xmm2, xmm4
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm0, xmm2
+ pxor xmm7, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslld xmm1, 31
+ pslld xmm2, 30
+ pslld xmm3, 25
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ movdqa xmm3, xmm1
+ psrldq xmm3, 4
+ pslldq xmm1, 12
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psrld xmm1, 1
+ psrld xmm2, 2
+ pxor xmm1, xmm2
+ pxor xmm1, xmm0
+ psrld xmm0, 7
+ pxor xmm1, xmm3
+ pxor xmm1, xmm0
+ pxor xmm7, xmm1
+ movdqu OWORD PTR [esp+32], xmm7
+ ; H ^ 4
+ ; xmm4 (H ^ 2) squared, same multiply/reduce sequence; result in xmm7.
+ pshufd xmm1, xmm4, 78
+ pshufd xmm2, xmm4, 78
+ movdqa xmm3, xmm4
+ movdqa xmm0, xmm4
+ pclmulqdq xmm3, xmm4, 17
+ pclmulqdq xmm0, xmm4, 0
+ pxor xmm1, xmm4
+ pxor xmm2, xmm4
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm0, xmm2
+ pxor xmm7, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslld xmm1, 31
+ pslld xmm2, 30
+ pslld xmm3, 25
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ movdqa xmm3, xmm1
+ psrldq xmm3, 4
+ pslldq xmm1, 12
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psrld xmm1, 1
+ psrld xmm2, 2
+ pxor xmm1, xmm2
+ pxor xmm1, xmm0
+ psrld xmm0, 7
+ pxor xmm1, xmm3
+ pxor xmm1, xmm0
+ pxor xmm7, xmm1
+ movdqu OWORD PTR [esp+48], xmm7
+ ; When output and input pointers are equal, the input blocks must be
+ ; copied to the stack before they are overwritten; otherwise use the
+ ; loop that hashes directly from the input buffer.
+ cmp edi, esi
+ jne L_AES_GCM_decrypt_update_aesni_ghash_64
+ ; ---- Four-block loop, in-place variant (output == input) ----
+L_AES_GCM_decrypt_update_aesni_ghash_64_inplace:
+ ; ecx = current input position, edx = current output position.
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; Encrypt 64 bytes of counter
+ ; xmm0..xmm3 = counter+0..counter+3, each byte-shuffled with
+ ; L_aes_gcm_bswap_epi64 before encryption; counter+4 is stored back.
+ movdqu xmm0, OWORD PTR [esp+64]
+ movdqu xmm7, xmm0
+ paddd xmm7, OWORD PTR L_aes_gcm_four
+ movdqu OWORD PTR [esp+64], xmm7
+ movdqa xmm7, OWORD PTR L_aes_gcm_bswap_epi64
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pshufb xmm0, xmm7
+ paddd xmm1, OWORD PTR L_aes_gcm_one
+ pshufb xmm1, xmm7
+ paddd xmm2, OWORD PTR L_aes_gcm_two
+ pshufb xmm2, xmm7
+ paddd xmm3, OWORD PTR L_aes_gcm_three
+ pshufb xmm3, xmm7
+ ; Initial round-key XOR followed by nine AESENC rounds on all four blocks.
+ movdqa xmm7, OWORD PTR [ebp]
+ pxor xmm0, xmm7
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+16]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+32]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+48]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+64]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+80]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+96]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+112]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+128]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+144]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ ; Round count < 11: the key at +160 is the last round key. The movdqa
+ ; does not modify flags, so it can sit between cmp and jl.
+ cmp DWORD PTR [esp+184], 11
+ movdqa xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+176]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ ; Round count < 13: the key at +192 is the last round key.
+ cmp DWORD PTR [esp+184], 13
+ movdqa xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+208]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done:
+ ; Final AES round, then XOR the key stream with the input. Each input
+ ; block is saved at [esp+96..159] before the output store overwrites it.
+ aesenclast xmm0, xmm7
+ aesenclast xmm1, xmm7
+ movdqu xmm4, OWORD PTR [ecx]
+ movdqu xmm5, OWORD PTR [ecx+16]
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ movdqu OWORD PTR [esp+96], xmm4
+ movdqu OWORD PTR [esp+112], xmm5
+ movdqu OWORD PTR [edx], xmm0
+ movdqu OWORD PTR [edx+16], xmm1
+ aesenclast xmm2, xmm7
+ aesenclast xmm3, xmm7
+ movdqu xmm4, OWORD PTR [ecx+32]
+ movdqu xmm5, OWORD PTR [ecx+48]
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+ movdqu OWORD PTR [esp+128], xmm4
+ movdqu OWORD PTR [esp+144], xmm5
+ movdqu OWORD PTR [edx+32], xmm2
+ movdqu OWORD PTR [edx+48], xmm3
+ ; GHASH the four saved input blocks (read from the stack copies):
+ ; (hash ^ blk0) * H^4, blk1 * H^3, blk2 * H^2 and blk3 * H^1 are
+ ; accumulated unreduced in xmm2 (low), xmm1 (middle) and xmm3 (high).
+ movdqu xmm2, OWORD PTR [esp+80]
+ movdqu xmm7, OWORD PTR [esp+48]
+ movdqu xmm0, OWORD PTR [esp+96]
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm2
+ pshufd xmm1, xmm7, 78
+ pshufd xmm5, xmm0, 78
+ pxor xmm1, xmm7
+ pxor xmm5, xmm0
+ movdqa xmm3, xmm0
+ pclmulqdq xmm3, xmm7, 17
+ movdqa xmm2, xmm0
+ pclmulqdq xmm2, xmm7, 0
+ pclmulqdq xmm1, xmm5, 0
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ ; Second block times H ^ 3.
+ movdqu xmm7, OWORD PTR [esp+32]
+ movdqu xmm0, OWORD PTR [esp+112]
+ pshufd xmm4, xmm7, 78
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm7
+ pshufd xmm5, xmm0, 78
+ pxor xmm5, xmm0
+ movdqa xmm6, xmm0
+ pclmulqdq xmm6, xmm7, 17
+ pclmulqdq xmm7, xmm0, 0
+ pclmulqdq xmm4, xmm5, 0
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm1, xmm6
+ pxor xmm3, xmm6
+ pxor xmm1, xmm4
+ ; Third block times H ^ 2.
+ movdqu xmm7, OWORD PTR [esp+16]
+ movdqu xmm0, OWORD PTR [esp+128]
+ pshufd xmm4, xmm7, 78
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm7
+ pshufd xmm5, xmm0, 78
+ pxor xmm5, xmm0
+ movdqa xmm6, xmm0
+ pclmulqdq xmm6, xmm7, 17
+ pclmulqdq xmm7, xmm0, 0
+ pclmulqdq xmm4, xmm5, 0
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm1, xmm6
+ pxor xmm3, xmm6
+ pxor xmm1, xmm4
+ ; Fourth block times H ^ 1.
+ movdqu xmm7, OWORD PTR [esp]
+ movdqu xmm0, OWORD PTR [esp+144]
+ pshufd xmm4, xmm7, 78
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm7
+ pshufd xmm5, xmm0, 78
+ pxor xmm5, xmm0
+ movdqa xmm6, xmm0
+ pclmulqdq xmm6, xmm7, 17
+ pclmulqdq xmm7, xmm0, 0
+ pclmulqdq xmm4, xmm5, 0
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm1, xmm6
+ pxor xmm3, xmm6
+ pxor xmm1, xmm4
+ ; Split the middle term across the low (xmm2) and high (xmm3) halves.
+ movdqa xmm5, xmm1
+ psrldq xmm1, 8
+ pslldq xmm5, 8
+ pxor xmm2, xmm5
+ pxor xmm3, xmm1
+ ; Shift-and-XOR reduction of the 256-bit sum to 128 bits; the new hash
+ ; state ends up in xmm2 and is written back to [esp+80].
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm2
+ pslld xmm7, 31
+ pslld xmm4, 30
+ pslld xmm5, 25
+ pxor xmm7, xmm4
+ pxor xmm7, xmm5
+ movdqa xmm4, xmm7
+ pslldq xmm7, 12
+ psrldq xmm4, 4
+ pxor xmm2, xmm7
+ movdqa xmm5, xmm2
+ movdqa xmm1, xmm2
+ movdqa xmm0, xmm2
+ psrld xmm5, 1
+ psrld xmm1, 2
+ psrld xmm0, 7
+ pxor xmm5, xmm1
+ pxor xmm5, xmm0
+ pxor xmm5, xmm4
+ pxor xmm2, xmm5
+ pxor xmm2, xmm3
+ movdqu OWORD PTR [esp+80], xmm2
+ ; Advance by 64 bytes and loop while more 64-byte groups remain.
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_update_aesni_ghash_64_inplace
+ jmp L_AES_GCM_decrypt_update_aesni_ghash_64_done
+ ; ---- Four-block loop, separate input and output buffers ----
+L_AES_GCM_decrypt_update_aesni_ghash_64:
+ ; ecx = current input position, edx = current output position.
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; Encrypt 64 bytes of counter
+ ; xmm0..xmm3 = counter+0..counter+3, each byte-shuffled with
+ ; L_aes_gcm_bswap_epi64 before encryption; counter+4 is stored back.
+ movdqu xmm0, OWORD PTR [esp+64]
+ movdqu xmm7, xmm0
+ paddd xmm7, OWORD PTR L_aes_gcm_four
+ movdqu OWORD PTR [esp+64], xmm7
+ movdqa xmm7, OWORD PTR L_aes_gcm_bswap_epi64
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pshufb xmm0, xmm7
+ paddd xmm1, OWORD PTR L_aes_gcm_one
+ pshufb xmm1, xmm7
+ paddd xmm2, OWORD PTR L_aes_gcm_two
+ pshufb xmm2, xmm7
+ paddd xmm3, OWORD PTR L_aes_gcm_three
+ pshufb xmm3, xmm7
+ ; Initial round-key XOR followed by nine AESENC rounds on all four blocks.
+ movdqa xmm7, OWORD PTR [ebp]
+ pxor xmm0, xmm7
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+16]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+32]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+48]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+64]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+80]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+96]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+112]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+128]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+144]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ ; Round count < 11: the key at +160 is the last round key.
+ cmp DWORD PTR [esp+184], 11
+ movdqa xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+176]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ ; Round count < 13: the key at +192 is the last round key.
+ cmp DWORD PTR [esp+184], 13
+ movdqa xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+208]
+ aesenc xmm0, xmm7
+ aesenc xmm1, xmm7
+ aesenc xmm2, xmm7
+ aesenc xmm3, xmm7
+ movdqa xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done:
+ ; Final AES round, then XOR the key stream with the input and store the
+ ; result to the output buffer.
+ aesenclast xmm0, xmm7
+ aesenclast xmm1, xmm7
+ movdqu xmm4, OWORD PTR [ecx]
+ movdqu xmm5, OWORD PTR [ecx+16]
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ movdqu OWORD PTR [edx], xmm0
+ movdqu OWORD PTR [edx+16], xmm1
+ aesenclast xmm2, xmm7
+ aesenclast xmm3, xmm7
+ movdqu xmm4, OWORD PTR [ecx+32]
+ movdqu xmm5, OWORD PTR [ecx+48]
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+ movdqu OWORD PTR [edx+32], xmm2
+ movdqu OWORD PTR [edx+48], xmm3
+ ; GHASH the four input blocks, re-read from the input buffer at [ecx]:
+ ; (hash ^ blk0) * H^4, blk1 * H^3, blk2 * H^2 and blk3 * H^1 are
+ ; accumulated unreduced in xmm2 (low), xmm1 (middle) and xmm3 (high).
+ ; NOTE(review): this re-read happens after the output stores above, so
+ ; it appears to assume the buffers do not partially overlap - confirm.
+ movdqu xmm2, OWORD PTR [esp+80]
+ movdqu xmm7, OWORD PTR [esp+48]
+ movdqu xmm0, OWORD PTR [ecx]
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm0, xmm2
+ pshufd xmm1, xmm7, 78
+ pshufd xmm5, xmm0, 78
+ pxor xmm1, xmm7
+ pxor xmm5, xmm0
+ movdqa xmm3, xmm0
+ pclmulqdq xmm3, xmm7, 17
+ movdqa xmm2, xmm0
+ pclmulqdq xmm2, xmm7, 0
+ pclmulqdq xmm1, xmm5, 0
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ ; Second block times H ^ 3.
+ movdqu xmm7, OWORD PTR [esp+32]
+ movdqu xmm0, OWORD PTR [ecx+16]
+ pshufd xmm4, xmm7, 78
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm7
+ pshufd xmm5, xmm0, 78
+ pxor xmm5, xmm0
+ movdqa xmm6, xmm0
+ pclmulqdq xmm6, xmm7, 17
+ pclmulqdq xmm7, xmm0, 0
+ pclmulqdq xmm4, xmm5, 0
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm1, xmm6
+ pxor xmm3, xmm6
+ pxor xmm1, xmm4
+ ; Third block times H ^ 2.
+ movdqu xmm7, OWORD PTR [esp+16]
+ movdqu xmm0, OWORD PTR [ecx+32]
+ pshufd xmm4, xmm7, 78
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm7
+ pshufd xmm5, xmm0, 78
+ pxor xmm5, xmm0
+ movdqa xmm6, xmm0
+ pclmulqdq xmm6, xmm7, 17
+ pclmulqdq xmm7, xmm0, 0
+ pclmulqdq xmm4, xmm5, 0
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm1, xmm6
+ pxor xmm3, xmm6
+ pxor xmm1, xmm4
+ ; Fourth block times H ^ 1.
+ movdqu xmm7, OWORD PTR [esp]
+ movdqu xmm0, OWORD PTR [ecx+48]
+ pshufd xmm4, xmm7, 78
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm4, xmm7
+ pshufd xmm5, xmm0, 78
+ pxor xmm5, xmm0
+ movdqa xmm6, xmm0
+ pclmulqdq xmm6, xmm7, 17
+ pclmulqdq xmm7, xmm0, 0
+ pclmulqdq xmm4, xmm5, 0
+ pxor xmm1, xmm7
+ pxor xmm2, xmm7
+ pxor xmm1, xmm6
+ pxor xmm3, xmm6
+ pxor xmm1, xmm4
+ ; Split the middle term across the low (xmm2) and high (xmm3) halves.
+ movdqa xmm5, xmm1
+ psrldq xmm1, 8
+ pslldq xmm5, 8
+ pxor xmm2, xmm5
+ pxor xmm3, xmm1
+ ; Shift-and-XOR reduction of the 256-bit sum to 128 bits; the new hash
+ ; state ends up in xmm2 and is written back to [esp+80].
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm2
+ pslld xmm7, 31
+ pslld xmm4, 30
+ pslld xmm5, 25
+ pxor xmm7, xmm4
+ pxor xmm7, xmm5
+ movdqa xmm4, xmm7
+ pslldq xmm7, 12
+ psrldq xmm4, 4
+ pxor xmm2, xmm7
+ movdqa xmm5, xmm2
+ movdqa xmm1, xmm2
+ movdqa xmm0, xmm2
+ psrld xmm5, 1
+ psrld xmm1, 2
+ psrld xmm0, 7
+ pxor xmm5, xmm1
+ pxor xmm5, xmm0
+ pxor xmm5, xmm4
+ pxor xmm2, xmm5
+ pxor xmm2, xmm3
+ movdqu OWORD PTR [esp+80], xmm2
+ ; Advance by 64 bytes and loop while more 64-byte groups remain.
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_update_aesni_ghash_64
+L_AES_GCM_decrypt_update_aesni_ghash_64_done:
+ ; Both four-block loops leave the hash state in xmm2 and use xmm5 as
+ ; scratch: move the state to xmm6 and reload H ^ 1 into xmm5 for the
+ ; single-block loop.
+ movdqa xmm6, xmm2
+ movdqu xmm5, OWORD PTR [esp]
+L_AES_GCM_decrypt_update_aesni_done_64:
+ ; Nothing left at all: go straight to the write-back.
+ mov edx, DWORD PTR [esp+196]
+ cmp ebx, edx
+ jge L_AES_GCM_decrypt_update_aesni_done_dec
+ ; eax = byte count rounded down to a multiple of 16 (mask 0FFFFFFF0h);
+ ; skip the loop when no whole block remains.
+ mov eax, DWORD PTR [esp+196]
+ and eax, 4294967280
+ cmp ebx, eax
+ jge L_AES_GCM_decrypt_update_aesni_last_block_done
+ ; ---- Single-block loop: AES rounds interleaved with one GHASH multiply
+ ; of (hash ^ input block) by H (xmm5) ----
+L_AES_GCM_decrypt_update_aesni_last_block_start:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; [esp] = byte-shuffled input block XOR current hash. This overwrites
+ ; the H ^ 1 stack slot, which is no longer needed (H is held in xmm5).
+ movdqu xmm1, OWORD PTR [ecx]
+ pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask
+ pxor xmm1, xmm6
+ movdqu OWORD PTR [esp], xmm1
+ ; xmm0 = byte-shuffled counter XOR round key 0; counter+1 is stored back.
+ movdqu xmm0, OWORD PTR [esp+64]
+ movdqa xmm1, xmm0
+ pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64
+ paddd xmm1, OWORD PTR L_aes_gcm_one
+ pxor xmm0, [ebp]
+ movdqu OWORD PTR [esp+64], xmm1
+ ; Four partial products of [esp] and xmm5: immediates 16 and 1 give the
+ ; two cross terms (xmm4, xmm7), 17 the high*high term (xmm1) and 0 the
+ ; low*low term.
+ movdqu xmm4, OWORD PTR [esp]
+ pclmulqdq xmm4, xmm5, 16
+ aesenc xmm0, [ebp+16]
+ aesenc xmm0, [ebp+32]
+ movdqu xmm7, OWORD PTR [esp]
+ pclmulqdq xmm7, xmm5, 1
+ aesenc xmm0, [ebp+48]
+ aesenc xmm0, [ebp+64]
+ aesenc xmm0, [ebp+80]
+ movdqu xmm1, OWORD PTR [esp]
+ pclmulqdq xmm1, xmm5, 17
+ aesenc xmm0, [ebp+96]
+ ; Combine the cross terms and split them across the low (xmm2) and
+ ; high (xmm1) halves of the product.
+ pxor xmm4, xmm7
+ movdqa xmm2, xmm4
+ psrldq xmm4, 8
+ pslldq xmm2, 8
+ aesenc xmm0, [ebp+112]
+ movdqu xmm7, OWORD PTR [esp]
+ pclmulqdq xmm7, xmm5, 0
+ pxor xmm2, xmm7
+ pxor xmm1, xmm4
+ ; Reduce with two carry-less multiplies by the L_aes_gcm_mod2_128
+ ; constant; the new hash state is accumulated into xmm6.
+ movdqa xmm3, OWORD PTR L_aes_gcm_mod2_128
+ movdqa xmm7, xmm2
+ pclmulqdq xmm7, xmm3, 16
+ aesenc xmm0, [ebp+128]
+ pshufd xmm4, xmm2, 78
+ pxor xmm4, xmm7
+ movdqa xmm7, xmm4
+ pclmulqdq xmm7, xmm3, 16
+ aesenc xmm0, [ebp+144]
+ pshufd xmm6, xmm4, 78
+ pxor xmm6, xmm7
+ pxor xmm6, xmm1
+ ; Select the last round key from the round count, as in the loops above.
+ cmp DWORD PTR [esp+184], 11
+ movdqa xmm1, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
+ aesenc xmm0, xmm1
+ aesenc xmm0, [ebp+176]
+ cmp DWORD PTR [esp+184], 13
+ movdqa xmm1, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
+ aesenc xmm0, xmm1
+ aesenc xmm0, [ebp+208]
+ movdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last:
+ ; Final AES round, XOR with the input block and store the output block.
+ aesenclast xmm0, xmm1
+ movdqu xmm1, OWORD PTR [ecx]
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [edx], xmm0
+ add ebx, 16
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_update_aesni_last_block_start
+L_AES_GCM_decrypt_update_aesni_last_block_done:
+L_AES_GCM_decrypt_update_aesni_done_dec:
+ ; Write the updated hash state (xmm6) back through arg 6 and the updated
+ ; counter block back through arg 8.
+ mov esi, DWORD PTR [esp+200]
+ mov edi, DWORD PTR [esp+208]
+ movdqu xmm4, OWORD PTR [esp+64]
+ movdqa OWORD PTR [esi], xmm6
+ movdqu OWORD PTR [edi], xmm4
+ ; Release locals and restore callee-saved registers.
+ add esp, 160
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_decrypt_update_aesni ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_decrypt_final_aesni
+;
+; Finishes a GCM decryption: XORs the bit-length block into the running
+; GHASH value, multiplies the result by the hash key, XORs in a 16-byte
+; mask block to form the tag, and compares that tag with the tag supplied
+; by the caller.
+;
+; Arguments are read from the stack. Offsets are relative to esp after the
+; prologue (four pushes plus 16 bytes of locals):
+;   [esp+36]  pointer to the 16-byte running GHASH value (read with movdqa,
+;             so it must be 16-byte aligned)
+;   [esp+40]  pointer to the tag to verify against
+;   [esp+44]  size of that tag in bytes
+;   [esp+48]  32-bit byte count placed in the low 64 bits of the length block
+;   [esp+52]  32-bit byte count placed in the high 64 bits of the length block
+;             NOTE(review): presumably the ciphertext and AAD lengths
+;             respectively - confirm against the C prototype.
+;   [esp+56]  pointer to the 16-byte hash key H (movdqa, 16-byte aligned)
+;   [esp+60]  pointer to the 16-byte block XORed with the GHASH result
+;             (movdqa, 16-byte aligned)
+;             NOTE(review): presumably the encrypted initial counter - confirm.
+;   [esp+64]  pointer to a DWORD that receives 1 when the tags match, else 0
+;
+; ebx, esi, edi and ebp are saved and restored. eax, ecx, edx, xmm0-xmm3 and
+; xmm5-xmm7 are overwritten.
+AES_GCM_decrypt_final_aesni PROC
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 16
+ mov ebp, DWORD PTR [esp+36]
+ mov esi, DWORD PTR [esp+56]
+ mov edi, DWORD PTR [esp+60]
+ ; xmm6 = running GHASH value, xmm5 = H, xmm7 = mask block
+ movdqa xmm6, OWORD PTR [ebp]
+ movdqa xmm5, OWORD PTR [esi]
+ movdqa xmm7, OWORD PTR [edi]
+ ; Shift H left by one bit across the full 128 bits: the bit shifted out of
+ ; the low qword is carried into bit 0 of the high qword.
+ movdqa xmm1, xmm5
+ movdqa xmm0, xmm5
+ psrlq xmm1, 63
+ psllq xmm0, 1
+ pslldq xmm1, 8
+ por xmm0, xmm1
+ ; Broadcast the top dword of H and sign-extend it into an all-ones or
+ ; all-zeros mask, so L_aes_gcm_mod2_128 is XORed in only when the top bit
+ ; of H was set.
+ pshufd xmm5, xmm5, 255
+ psrad xmm5, 31
+ pand xmm5, OWORD PTR L_aes_gcm_mod2_128
+ pxor xmm5, xmm0
+ ; Build the length block in xmm0: each 32-bit byte count becomes a 64-bit
+ ; bit count (low dword = count << 3, high dword = count >> 29).
+ mov edx, DWORD PTR [esp+48]
+ mov ecx, DWORD PTR [esp+52]
+ shl edx, 3
+ shl ecx, 3
+ pinsrd xmm0, edx, 0
+ pinsrd xmm0, ecx, 2
+ mov edx, DWORD PTR [esp+48]
+ mov ecx, DWORD PTR [esp+52]
+ shr edx, 29
+ shr ecx, 29
+ pinsrd xmm0, edx, 1
+ pinsrd xmm0, ecx, 3
+ pxor xmm6, xmm0
+ ; Carry-less multiply xmm6 by xmm5 using three PCLMULQDQs (Karatsuba):
+ ; xmm3 = high*high, xmm0 = low*low, xmm1 = (low^high)*(low^high).
+ pshufd xmm1, xmm5, 78
+ pshufd xmm2, xmm6, 78
+ movdqa xmm3, xmm6
+ movdqa xmm0, xmm6
+ pclmulqdq xmm3, xmm5, 17
+ pclmulqdq xmm0, xmm5, 0
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pclmulqdq xmm1, xmm2, 0
+ pxor xmm1, xmm0
+ pxor xmm1, xmm3
+ ; Fold the middle term into the two halves: xmm0 = low 128 bits of the
+ ; product, xmm6 = high 128 bits.
+ movdqa xmm2, xmm1
+ movdqa xmm6, xmm3
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ pxor xmm0, xmm2
+ pxor xmm6, xmm1
+ ; Shift-and-XOR reduction of the 256-bit product; the reduced value ends
+ ; up in xmm6.
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ pslld xmm1, 31
+ pslld xmm2, 30
+ pslld xmm3, 25
+ pxor xmm1, xmm2
+ pxor xmm1, xmm3
+ movdqa xmm3, xmm1
+ psrldq xmm3, 4
+ pslldq xmm1, 12
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psrld xmm1, 1
+ psrld xmm2, 2
+ pxor xmm1, xmm2
+ pxor xmm1, xmm0
+ psrld xmm0, 7
+ pxor xmm1, xmm3
+ pxor xmm1, xmm0
+ pxor xmm6, xmm1
+ ; Reorder the bytes with L_aes_gcm_bswap_mask, then XOR with the mask
+ ; block: xmm0 = computed tag.
+ pshufb xmm6, OWORD PTR L_aes_gcm_bswap_mask
+ movdqu xmm0, xmm7
+ pxor xmm0, xmm6
+ ; esi = tag to verify, edi = result pointer
+ mov esi, DWORD PTR [esp+40]
+ mov edi, DWORD PTR [esp+64]
+ cmp DWORD PTR [esp+44], 16
+ je L_AES_GCM_decrypt_final_aesni_cmp_tag_16
+ ; Tag size is not 16: spill the computed tag and compare byte by byte.
+ ; Differences are ORed into bl with no early exit on a mismatch.
+ sub esp, 16
+ xor ecx, ecx
+ xor ebx, ebx
+ movdqu OWORD PTR [esp], xmm0
+L_AES_GCM_decrypt_final_aesni_cmp_tag_loop:
+ movzx eax, BYTE PTR [esp+ecx]
+ xor al, BYTE PTR [esi+ecx]
+ or bl, al
+ inc ecx
+ ; esp was lowered by 16 above, so the tag size that sat at [esp+44] is now
+ ; at [esp+60]; [esp+44] would be the caller's saved ebx.
+ ; NOTE(review): the bound is tested after the increment, so a tag size of
+ ; 0 is never matched on the first pass - presumably rejected by the caller.
+ cmp ecx, DWORD PTR [esp+60]
+ jne L_AES_GCM_decrypt_final_aesni_cmp_tag_loop
+ ; ebx = 1 when no byte differed, else 0
+ cmp bl, 0
+ sete bl
+ add esp, 16
+ xor ecx, ecx
+ jmp L_AES_GCM_decrypt_final_aesni_cmp_tag_done
+L_AES_GCM_decrypt_final_aesni_cmp_tag_16:
+ ; Full 16-byte tag: compare all bytes at once.
+ movdqu xmm1, OWORD PTR [esi]
+ pcmpeqb xmm0, xmm1
+ pmovmskb edx, xmm0
+ ; edx == 0xFFFF means all 16 bytes matched: result 1, otherwise result 0
+ xor ebx, ebx
+ cmp edx, 65535
+ sete bl
+L_AES_GCM_decrypt_final_aesni_cmp_tag_done:
+ mov DWORD PTR [edi], ebx
+ add esp, 16
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_decrypt_final_aesni ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX1
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_avx1 PROC
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 112
+ mov esi, DWORD PTR [esp+144]
+ mov ebp, DWORD PTR [esp+168]
+ mov edx, DWORD PTR [esp+160]
+ vpxor xmm0, xmm0, xmm0
+ vpxor xmm2, xmm2, xmm2
+ cmp edx, 12
+ jne L_AES_GCM_encrypt_avx1_iv_not_12
+ ; # Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ mov ecx, 16777216
+ vpinsrd xmm0, xmm0, DWORD PTR [esi], 0
+ vpinsrd xmm0, xmm0, DWORD PTR [esi+4], 1
+ vpinsrd xmm0, xmm0, DWORD PTR [esi+8], 2
+ vpinsrd xmm0, xmm0, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqa xmm1, OWORD PTR [ebp]
+ vpxor xmm5, xmm0, xmm1
+ vmovdqa xmm3, OWORD PTR [ebp+16]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+32]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+48]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+64]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+80]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+96]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+112]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+128]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+144]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ cmp DWORD PTR [esp+172], 11
+ vmovdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx1_calc_iv_12_last
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+176]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ cmp DWORD PTR [esp+172], 13
+ vmovdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx1_calc_iv_12_last
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+208]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx1_calc_iv_12_last:
+ vaesenclast xmm1, xmm1, xmm3
+ vaesenclast xmm5, xmm5, xmm3
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vmovdqu OWORD PTR [esp+80], xmm5
+ jmp L_AES_GCM_encrypt_avx1_iv_done
+L_AES_GCM_encrypt_avx1_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqa xmm1, OWORD PTR [ebp]
+ vaesenc xmm1, xmm1, [ebp+16]
+ vaesenc xmm1, xmm1, [ebp+32]
+ vaesenc xmm1, xmm1, [ebp+48]
+ vaesenc xmm1, xmm1, [ebp+64]
+ vaesenc xmm1, xmm1, [ebp+80]
+ vaesenc xmm1, xmm1, [ebp+96]
+ vaesenc xmm1, xmm1, [ebp+112]
+ vaesenc xmm1, xmm1, [ebp+128]
+ vaesenc xmm1, xmm1, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ vmovdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
+ vaesenc xmm1, xmm1, xmm5
+ vaesenc xmm1, xmm1, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ vmovdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
+ vaesenc xmm1, xmm1, xmm5
+ vaesenc xmm1, xmm1, [ebp+208]
+ vmovdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm1, xmm1, xmm5
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ cmp edx, 0
+ mov ecx, 0
+ je L_AES_GCM_encrypt_avx1_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_encrypt_avx1_calc_iv_lt16
+ and edx, 4294967280
+L_AES_GCM_encrypt_avx1_calc_iv_16_loop:
+ vmovdqu xmm4, OWORD PTR [esi+ecx]
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm4
+ ; ghash_gfmul_avx
+ vpshufd xmm5, xmm0, 78
+ vpshufd xmm6, xmm1, 78
+ vpclmulqdq xmm7, xmm1, xmm0, 17
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpxor xmm5, xmm5, xmm0
+ vpxor xmm6, xmm6, xmm1
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vmovdqa xmm3, xmm4
+ vmovdqa xmm0, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm0, xmm0, xmm5
+ vpsrld xmm4, xmm3, 31
+ vpsrld xmm5, xmm0, 31
+ vpslld xmm3, xmm3, 1
+ vpslld xmm0, xmm0, 1
+ vpsrldq xmm6, xmm4, 12
+ vpslldq xmm4, xmm4, 4
+ vpslldq xmm5, xmm5, 4
+ vpor xmm0, xmm0, xmm6
+ vpor xmm3, xmm3, xmm4
+ vpor xmm0, xmm0, xmm5
+ vpslld xmm4, xmm3, 31
+ vpslld xmm5, xmm3, 30
+ vpslld xmm6, xmm3, 25
+ vpxor xmm4, xmm4, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vmovdqa xmm5, xmm4
+ vpsrldq xmm5, xmm5, 4
+ vpslldq xmm4, xmm4, 12
+ vpxor xmm3, xmm3, xmm4
+ vpsrld xmm6, xmm3, 1
+ vpsrld xmm7, xmm3, 2
+ vpsrld xmm4, xmm3, 7
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm0, xmm0, xmm6
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx1_calc_iv_16_loop
+ mov edx, DWORD PTR [esp+160]
+ cmp ecx, edx
+ je L_AES_GCM_encrypt_avx1_calc_iv_done
+L_AES_GCM_encrypt_avx1_calc_iv_lt16:
+ sub esp, 16
+ vpxor xmm4, xmm4, xmm4
+ xor ebx, ebx
+ vmovdqu OWORD PTR [esp], xmm4
+L_AES_GCM_encrypt_avx1_calc_iv_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx1_calc_iv_loop
+ vmovdqu xmm4, OWORD PTR [esp]
+ add esp, 16
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm4
+ ; ghash_gfmul_avx
+ vpshufd xmm5, xmm0, 78
+ vpshufd xmm6, xmm1, 78
+ vpclmulqdq xmm7, xmm1, xmm0, 17
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpxor xmm5, xmm5, xmm0
+ vpxor xmm6, xmm6, xmm1
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vmovdqa xmm3, xmm4
+ vmovdqa xmm0, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm0, xmm0, xmm5
+ vpsrld xmm4, xmm3, 31
+ vpsrld xmm5, xmm0, 31
+ vpslld xmm3, xmm3, 1
+ vpslld xmm0, xmm0, 1
+ vpsrldq xmm6, xmm4, 12
+ vpslldq xmm4, xmm4, 4
+ vpslldq xmm5, xmm5, 4
+ vpor xmm0, xmm0, xmm6
+ vpor xmm3, xmm3, xmm4
+ vpor xmm0, xmm0, xmm5
+ vpslld xmm4, xmm3, 31
+ vpslld xmm5, xmm3, 30
+ vpslld xmm6, xmm3, 25
+ vpxor xmm4, xmm4, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vmovdqa xmm5, xmm4
+ vpsrldq xmm5, xmm5, 4
+ vpslldq xmm4, xmm4, 12
+ vpxor xmm3, xmm3, xmm4
+ vpsrld xmm6, xmm3, 1
+ vpsrld xmm7, xmm3, 2
+ vpsrld xmm4, xmm3, 7
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm0, xmm0, xmm6
+L_AES_GCM_encrypt_avx1_calc_iv_done:
+ ; T = Encrypt counter
+ vpxor xmm4, xmm4, xmm4
+ shl edx, 3
+ vpinsrd xmm4, xmm4, edx, 0
+ vpxor xmm0, xmm0, xmm4
+ ; ghash_gfmul_avx
+ vpshufd xmm5, xmm0, 78
+ vpshufd xmm6, xmm1, 78
+ vpclmulqdq xmm7, xmm1, xmm0, 17
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpxor xmm5, xmm5, xmm0
+ vpxor xmm6, xmm6, xmm1
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vmovdqa xmm3, xmm4
+ vmovdqa xmm0, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm0, xmm0, xmm5
+ vpsrld xmm4, xmm3, 31
+ vpsrld xmm5, xmm0, 31
+ vpslld xmm3, xmm3, 1
+ vpslld xmm0, xmm0, 1
+ vpsrldq xmm6, xmm4, 12
+ vpslldq xmm4, xmm4, 4
+ vpslldq xmm5, xmm5, 4
+ vpor xmm0, xmm0, xmm6
+ vpor xmm3, xmm3, xmm4
+ vpor xmm0, xmm0, xmm5
+ vpslld xmm4, xmm3, 31
+ vpslld xmm5, xmm3, 30
+ vpslld xmm6, xmm3, 25
+ vpxor xmm4, xmm4, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vmovdqa xmm5, xmm4
+ vpsrldq xmm5, xmm5, 4
+ vpslldq xmm4, xmm4, 12
+ vpxor xmm3, xmm3, xmm4
+ vpsrld xmm6, xmm3, 1
+ vpsrld xmm7, xmm3, 2
+ vpsrld xmm4, xmm3, 7
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm0, xmm0, xmm6
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ ; Encrypt counter
+ vmovdqa xmm4, OWORD PTR [ebp]
+ vpxor xmm4, xmm4, xmm0
+ vaesenc xmm4, xmm4, [ebp+16]
+ vaesenc xmm4, xmm4, [ebp+32]
+ vaesenc xmm4, xmm4, [ebp+48]
+ vaesenc xmm4, xmm4, [ebp+64]
+ vaesenc xmm4, xmm4, [ebp+80]
+ vaesenc xmm4, xmm4, [ebp+96]
+ vaesenc xmm4, xmm4, [ebp+112]
+ vaesenc xmm4, xmm4, [ebp+128]
+ vaesenc xmm4, xmm4, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ vmovdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm5
+ vaesenc xmm4, xmm4, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ vmovdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm5
+ vaesenc xmm4, xmm4, [ebp+208]
+ vmovdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm4, xmm4, xmm5
+ vmovdqu OWORD PTR [esp+80], xmm4
+L_AES_GCM_encrypt_avx1_iv_done:
+ mov esi, DWORD PTR [esp+140]
+ ; Additional authentication data
+ mov edx, DWORD PTR [esp+156]
+ cmp edx, 0
+ je L_AES_GCM_encrypt_avx1_calc_aad_done
+ xor ecx, ecx
+ cmp edx, 16
+ jl L_AES_GCM_encrypt_avx1_calc_aad_lt16
+ and edx, 4294967280
+L_AES_GCM_encrypt_avx1_calc_aad_16_loop:
+ vmovdqu xmm4, OWORD PTR [esi+ecx]
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm2, xmm2, xmm4
+ ; ghash_gfmul_avx
+ vpshufd xmm5, xmm2, 78
+ vpshufd xmm6, xmm1, 78
+ vpclmulqdq xmm7, xmm1, xmm2, 17
+ vpclmulqdq xmm4, xmm1, xmm2, 0
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vmovdqa xmm3, xmm4
+ vmovdqa xmm2, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm2, xmm2, xmm5
+ vpsrld xmm4, xmm3, 31
+ vpsrld xmm5, xmm2, 31
+ vpslld xmm3, xmm3, 1
+ vpslld xmm2, xmm2, 1
+ vpsrldq xmm6, xmm4, 12
+ vpslldq xmm4, xmm4, 4
+ vpslldq xmm5, xmm5, 4
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm4
+ vpor xmm2, xmm2, xmm5
+ vpslld xmm4, xmm3, 31
+ vpslld xmm5, xmm3, 30
+ vpslld xmm6, xmm3, 25
+ vpxor xmm4, xmm4, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vmovdqa xmm5, xmm4
+ vpsrldq xmm5, xmm5, 4
+ vpslldq xmm4, xmm4, 12
+ vpxor xmm3, xmm3, xmm4
+ vpsrld xmm6, xmm3, 1
+ vpsrld xmm7, xmm3, 2
+ vpsrld xmm4, xmm3, 7
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm2, xmm2, xmm6
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx1_calc_aad_16_loop
+ mov edx, DWORD PTR [esp+156]
+ cmp ecx, edx
+ je L_AES_GCM_encrypt_avx1_calc_aad_done
+L_AES_GCM_encrypt_avx1_calc_aad_lt16:
+ sub esp, 16
+ vpxor xmm4, xmm4, xmm4
+ xor ebx, ebx
+ vmovdqu OWORD PTR [esp], xmm4
+L_AES_GCM_encrypt_avx1_calc_aad_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx1_calc_aad_loop
+ vmovdqu xmm4, OWORD PTR [esp]
+ add esp, 16
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm2, xmm2, xmm4
+ ; ghash_gfmul_avx
+ vpshufd xmm5, xmm2, 78
+ vpshufd xmm6, xmm1, 78
+ vpclmulqdq xmm7, xmm1, xmm2, 17
+ vpclmulqdq xmm4, xmm1, xmm2, 0
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vmovdqa xmm3, xmm4
+ vmovdqa xmm2, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm2, xmm2, xmm5
+ vpsrld xmm4, xmm3, 31
+ vpsrld xmm5, xmm2, 31
+ vpslld xmm3, xmm3, 1
+ vpslld xmm2, xmm2, 1
+ vpsrldq xmm6, xmm4, 12
+ vpslldq xmm4, xmm4, 4
+ vpslldq xmm5, xmm5, 4
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm4
+ vpor xmm2, xmm2, xmm5
+ vpslld xmm4, xmm3, 31
+ vpslld xmm5, xmm3, 30
+ vpslld xmm6, xmm3, 25
+ vpxor xmm4, xmm4, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vmovdqa xmm5, xmm4
+ vpsrldq xmm5, xmm5, 4
+ vpslldq xmm4, xmm4, 12
+ vpxor xmm3, xmm3, xmm4
+ vpsrld xmm6, xmm3, 1
+ vpsrld xmm7, xmm3, 2
+ vpsrld xmm4, xmm3, 7
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm2, xmm2, xmm6
+L_AES_GCM_encrypt_avx1_calc_aad_done:
+ vmovdqu OWORD PTR [esp+96], xmm2
+ mov esi, DWORD PTR [esp+132]
+ mov edi, DWORD PTR [esp+136]
+ ; Calculate counter and H
+ vpsrlq xmm5, xmm1, 63
+ vpsllq xmm4, xmm1, 1
+ vpslldq xmm5, xmm5, 8
+ vpor xmm4, xmm4, xmm5
+ vpshufd xmm1, xmm1, 255
+ vpsrad xmm1, xmm1, 31
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpand xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_mod2_128
+ vpaddd xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_one
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [esp+64], xmm0
+ xor ebx, ebx
+ cmp DWORD PTR [esp+152], 64
+ mov eax, DWORD PTR [esp+152]
+ jl L_AES_GCM_encrypt_avx1_done_64
+ and eax, 4294967232
+ vmovdqa xmm6, xmm2
+ ; H ^ 1
+ vmovdqu OWORD PTR [esp], xmm1
+ ; H ^ 2
+ vpclmulqdq xmm4, xmm1, xmm1, 0
+ vpclmulqdq xmm0, xmm1, xmm1, 17
+ vpslld xmm5, xmm4, 31
+ vpslld xmm6, xmm4, 30
+ vpslld xmm7, xmm4, 25
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vpsrldq xmm7, xmm5, 4
+ vpslldq xmm5, xmm5, 12
+ vpxor xmm4, xmm4, xmm5
+ vpsrld xmm5, xmm4, 1
+ vpsrld xmm6, xmm4, 2
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm4
+ vpsrld xmm4, xmm4, 7
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm0, xmm0, xmm5
+ vmovdqu OWORD PTR [esp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ vpshufd xmm5, xmm1, 78
+ vpshufd xmm6, xmm0, 78
+ vpclmulqdq xmm7, xmm0, xmm1, 17
+ vpclmulqdq xmm4, xmm0, xmm1, 0
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm6, xmm6, xmm0
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm4, xmm4, xmm6
+ vpxor xmm3, xmm7, xmm5
+ vpslld xmm5, xmm4, 31
+ vpslld xmm6, xmm4, 30
+ vpslld xmm7, xmm4, 25
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vpsrldq xmm7, xmm5, 4
+ vpslldq xmm5, xmm5, 12
+ vpxor xmm4, xmm4, xmm5
+ vpsrld xmm5, xmm4, 1
+ vpsrld xmm6, xmm4, 2
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm4
+ vpsrld xmm4, xmm4, 7
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm3, xmm3, xmm5
+ vmovdqu OWORD PTR [esp+32], xmm3
+ ; H ^ 4
+ vpclmulqdq xmm4, xmm0, xmm0, 0
+ vpclmulqdq xmm3, xmm0, xmm0, 17
+ vpslld xmm5, xmm4, 31
+ vpslld xmm6, xmm4, 30
+ vpslld xmm7, xmm4, 25
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vpsrldq xmm7, xmm5, 4
+ vpslldq xmm5, xmm5, 12
+ vpxor xmm4, xmm4, xmm5
+ vpsrld xmm5, xmm4, 1
+ vpsrld xmm6, xmm4, 2
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm4
+ vpsrld xmm4, xmm4, 7
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm3, xmm3, xmm5
+ vmovdqu OWORD PTR [esp+48], xmm3
+ ; First 64 bytes of input
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx1_four
+ vmovdqu OWORD PTR [esp+64], xmm3
+ vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm5, xmm4, OWORD PTR L_aes_gcm_avx1_one
+ vpshufb xmm5, xmm5, xmm3
+ vpaddd xmm6, xmm4, OWORD PTR L_aes_gcm_avx1_two
+ vpshufb xmm6, xmm6, xmm3
+ vpaddd xmm7, xmm4, OWORD PTR L_aes_gcm_avx1_three
+ vpshufb xmm7, xmm7, xmm3
+ vpshufb xmm4, xmm4, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp]
+ vpxor xmm4, xmm4, xmm3
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+16]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+32]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+48]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+64]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+80]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+96]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+112]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+128]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+144]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ cmp DWORD PTR [esp+172], 11
+ vmovdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx1_aesenc_64_enc_done
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+176]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ cmp DWORD PTR [esp+172], 13
+ vmovdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx1_aesenc_64_enc_done
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+208]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx1_aesenc_64_enc_done:
+ vaesenclast xmm4, xmm4, xmm3
+ vaesenclast xmm5, xmm5, xmm3
+ vmovdqu xmm0, OWORD PTR [esi]
+ vmovdqu xmm1, OWORD PTR [esi+16]
+ vpxor xmm4, xmm4, xmm0
+ vpxor xmm5, xmm5, xmm1
+ vmovdqu OWORD PTR [edi], xmm4
+ vmovdqu OWORD PTR [edi+16], xmm5
+ vaesenclast xmm6, xmm6, xmm3
+ vaesenclast xmm7, xmm7, xmm3
+ vmovdqu xmm0, OWORD PTR [esi+32]
+ vmovdqu xmm1, OWORD PTR [esi+48]
+ vpxor xmm6, xmm6, xmm0
+ vpxor xmm7, xmm7, xmm1
+ vmovdqu OWORD PTR [edi+32], xmm6
+ vmovdqu OWORD PTR [edi+48], xmm7
+ cmp eax, 64
+ mov ebx, 64
+ mov ecx, esi
+ mov edx, edi
+ jle L_AES_GCM_encrypt_avx1_end_64
+ ; More 64 bytes of input
+L_AES_GCM_encrypt_avx1_ghash_64:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx1_four
+ vmovdqu OWORD PTR [esp+64], xmm3
+ vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm5, xmm4, OWORD PTR L_aes_gcm_avx1_one
+ vpshufb xmm5, xmm5, xmm3
+ vpaddd xmm6, xmm4, OWORD PTR L_aes_gcm_avx1_two
+ vpshufb xmm6, xmm6, xmm3
+ vpaddd xmm7, xmm4, OWORD PTR L_aes_gcm_avx1_three
+ vpshufb xmm7, xmm7, xmm3
+ vpshufb xmm4, xmm4, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp]
+ vpxor xmm4, xmm4, xmm3
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+16]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+32]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+48]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+64]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+80]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+96]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+112]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+128]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+144]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ cmp DWORD PTR [esp+172], 11
+ vmovdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+176]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ cmp DWORD PTR [esp+172], 13
+ vmovdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+208]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
+ vaesenclast xmm4, xmm4, xmm3
+ vaesenclast xmm5, xmm5, xmm3
+ vmovdqu xmm0, OWORD PTR [ecx]
+ vmovdqu xmm1, OWORD PTR [ecx+16]
+ vpxor xmm4, xmm4, xmm0
+ vpxor xmm5, xmm5, xmm1
+ vmovdqu OWORD PTR [edx], xmm4
+ vmovdqu OWORD PTR [edx+16], xmm5
+ vaesenclast xmm6, xmm6, xmm3
+ vaesenclast xmm7, xmm7, xmm3
+ vmovdqu xmm0, OWORD PTR [ecx+32]
+ vmovdqu xmm1, OWORD PTR [ecx+48]
+ vpxor xmm6, xmm6, xmm0
+ vpxor xmm7, xmm7, xmm1
+ vmovdqu OWORD PTR [edx+32], xmm6
+ vmovdqu OWORD PTR [edx+48], xmm7
+ ; ghash encrypted counter
+ vmovdqu xmm6, OWORD PTR [esp+96]
+ vmovdqu xmm3, OWORD PTR [esp+48]
+ vmovdqu xmm4, OWORD PTR [edx+-64]
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm6
+ vpshufd xmm5, xmm3, 78
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm7, xmm4, xmm3, 17
+ vpclmulqdq xmm6, xmm4, xmm3, 0
+ vpclmulqdq xmm5, xmm5, xmm1, 0
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vmovdqu xmm3, OWORD PTR [esp+32]
+ vmovdqu xmm4, OWORD PTR [edx+-48]
+ vpshufd xmm0, xmm3, 78
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm3
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm2, xmm4, xmm3, 17
+ vpclmulqdq xmm3, xmm4, xmm3, 0
+ vpclmulqdq xmm0, xmm0, xmm1, 0
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm5, xmm5, xmm0
+ vmovdqu xmm3, OWORD PTR [esp+16]
+ vmovdqu xmm4, OWORD PTR [edx+-32]
+ vpshufd xmm0, xmm3, 78
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm3
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm2, xmm4, xmm3, 17
+ vpclmulqdq xmm3, xmm4, xmm3, 0
+ vpclmulqdq xmm0, xmm0, xmm1, 0
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm5, xmm5, xmm0
+ vmovdqu xmm3, OWORD PTR [esp]
+ vmovdqu xmm4, OWORD PTR [edx+-16]
+ vpshufd xmm0, xmm3, 78
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm3
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm2, xmm4, xmm3, 17
+ vpclmulqdq xmm3, xmm4, xmm3, 0
+ vpclmulqdq xmm0, xmm0, xmm1, 0
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm5, xmm5, xmm0
+ vpslldq xmm1, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm7, xmm7, xmm5
+ vpslld xmm3, xmm6, 31
+ vpslld xmm0, xmm6, 30
+ vpslld xmm1, xmm6, 25
+ vpxor xmm3, xmm3, xmm0
+ vpxor xmm3, xmm3, xmm1
+ vpsrldq xmm0, xmm3, 4
+ vpslldq xmm3, xmm3, 12
+ vpxor xmm6, xmm6, xmm3
+ vpsrld xmm1, xmm6, 1
+ vpsrld xmm5, xmm6, 2
+ vpsrld xmm4, xmm6, 7
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm6, xmm6, xmm7
+ vmovdqu OWORD PTR [esp+96], xmm6
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_avx1_ghash_64
+L_AES_GCM_encrypt_avx1_end_64:
+ vmovdqu xmm2, OWORD PTR [esp+96]
+ ; Block 1
+ vmovdqa xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vmovdqu xmm1, OWORD PTR [edx]
+ vpshufb xmm1, xmm1, xmm4
+ vmovdqu xmm3, OWORD PTR [esp+48]
+ vpxor xmm1, xmm1, xmm2
+ ; ghash_gfmul_avx
+ vpshufd xmm5, xmm1, 78
+ vpshufd xmm6, xmm3, 78
+ vpclmulqdq xmm7, xmm3, xmm1, 17
+ vpclmulqdq xmm4, xmm3, xmm1, 0
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm6, xmm6, xmm3
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vmovdqa xmm0, xmm4
+ vmovdqa xmm2, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm0, xmm0, xmm6
+ vpxor xmm2, xmm2, xmm5
+ ; Block 2
+ vmovdqa xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vmovdqu xmm1, OWORD PTR [edx+16]
+ vpshufb xmm1, xmm1, xmm4
+ vmovdqu xmm3, OWORD PTR [esp+32]
+ ; ghash_gfmul_xor_avx
+ vpshufd xmm5, xmm1, 78
+ vpshufd xmm6, xmm3, 78
+ vpclmulqdq xmm7, xmm3, xmm1, 17
+ vpclmulqdq xmm4, xmm3, xmm1, 0
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm6, xmm6, xmm3
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm2, xmm2, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm0, xmm0, xmm6
+ vpxor xmm2, xmm2, xmm5
+ ; Block 3
+ vmovdqa xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vmovdqu xmm1, OWORD PTR [edx+32]
+ vpshufb xmm1, xmm1, xmm4
+ vmovdqu xmm3, OWORD PTR [esp+16]
+ ; ghash_gfmul_xor_avx
+ vpshufd xmm5, xmm1, 78
+ vpshufd xmm6, xmm3, 78
+ vpclmulqdq xmm7, xmm3, xmm1, 17
+ vpclmulqdq xmm4, xmm3, xmm1, 0
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm6, xmm6, xmm3
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm2, xmm2, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm0, xmm0, xmm6
+ vpxor xmm2, xmm2, xmm5
+ ; Block 4
+ vmovdqa xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vmovdqu xmm1, OWORD PTR [edx+48]
+ vpshufb xmm1, xmm1, xmm4
+ vmovdqu xmm3, OWORD PTR [esp]
+ ; ghash_gfmul_xor_avx
+ vpshufd xmm5, xmm1, 78
+ vpshufd xmm6, xmm3, 78
+ vpclmulqdq xmm7, xmm3, xmm1, 17
+ vpclmulqdq xmm4, xmm3, xmm1, 0
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm6, xmm6, xmm3
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm2, xmm2, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm0, xmm0, xmm6
+ vpxor xmm2, xmm2, xmm5
+ vpslld xmm4, xmm0, 31
+ vpslld xmm5, xmm0, 30
+ vpslld xmm6, xmm0, 25
+ vpxor xmm4, xmm4, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vmovdqa xmm5, xmm4
+ vpsrldq xmm5, xmm5, 4
+ vpslldq xmm4, xmm4, 12
+ vpxor xmm0, xmm0, xmm4
+ vpsrld xmm6, xmm0, 1
+ vpsrld xmm7, xmm0, 2
+ vpsrld xmm4, xmm0, 7
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm6, xmm6, xmm0
+ vpxor xmm2, xmm2, xmm6
+ vmovdqu xmm1, OWORD PTR [esp]
+L_AES_GCM_encrypt_avx1_done_64:
+ mov edx, DWORD PTR [esp+152]
+ cmp ebx, edx
+ jge L_AES_GCM_encrypt_avx1_done_enc
+ mov eax, DWORD PTR [esp+152]
+ and eax, 4294967280
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_avx1_last_block_done
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ vmovdqu xmm5, OWORD PTR [esp+64]
+ vpshufb xmm4, xmm5, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_one
+ vmovdqu OWORD PTR [esp+64], xmm5
+ vpxor xmm4, xmm4, [ebp]
+ vaesenc xmm4, xmm4, [ebp+16]
+ vaesenc xmm4, xmm4, [ebp+32]
+ vaesenc xmm4, xmm4, [ebp+48]
+ vaesenc xmm4, xmm4, [ebp+64]
+ vaesenc xmm4, xmm4, [ebp+80]
+ vaesenc xmm4, xmm4, [ebp+96]
+ vaesenc xmm4, xmm4, [ebp+112]
+ vaesenc xmm4, xmm4, [ebp+128]
+ vaesenc xmm4, xmm4, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ vmovdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm5
+ vaesenc xmm4, xmm4, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ vmovdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm5
+ vaesenc xmm4, xmm4, [ebp+208]
+ vmovdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last:
+ vaesenclast xmm4, xmm4, xmm5
+ vmovdqu xmm5, OWORD PTR [ecx]
+ vpxor xmm4, xmm4, xmm5
+ vmovdqu OWORD PTR [edx], xmm4
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm2, xmm2, xmm4
+ add ebx, 16
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_avx1_last_block_ghash
+L_AES_GCM_encrypt_avx1_last_block_start:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ vmovdqu xmm5, OWORD PTR [esp+64]
+ vmovdqu xmm7, xmm2
+ vpshufb xmm4, xmm5, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_one
+ vmovdqu OWORD PTR [esp+64], xmm5
+ vpxor xmm4, xmm4, [ebp]
+ vpclmulqdq xmm0, xmm7, xmm1, 16
+ vaesenc xmm4, xmm4, [ebp+16]
+ vaesenc xmm4, xmm4, [ebp+32]
+ vpclmulqdq xmm3, xmm7, xmm1, 1
+ vaesenc xmm4, xmm4, [ebp+48]
+ vaesenc xmm4, xmm4, [ebp+64]
+ vaesenc xmm4, xmm4, [ebp+80]
+ vpclmulqdq xmm5, xmm7, xmm1, 17
+ vaesenc xmm4, xmm4, [ebp+96]
+ vpxor xmm0, xmm0, xmm3
+ vpslldq xmm6, xmm0, 8
+ vpsrldq xmm0, xmm0, 8
+ vaesenc xmm4, xmm4, [ebp+112]
+ vpclmulqdq xmm3, xmm7, xmm1, 0
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm0
+ vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_mod2_128
+ vpclmulqdq xmm3, xmm6, xmm7, 16
+ vaesenc xmm4, xmm4, [ebp+128]
+ vpshufd xmm0, xmm6, 78
+ vpxor xmm0, xmm0, xmm3
+ vpclmulqdq xmm3, xmm0, xmm7, 16
+ vaesenc xmm4, xmm4, [ebp+144]
+ vpshufd xmm2, xmm0, 78
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm5
+ cmp DWORD PTR [esp+172], 11
+ vmovdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
+ vaesenc xmm4, xmm4, xmm5
+ vaesenc xmm4, xmm4, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ vmovdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
+ vaesenc xmm4, xmm4, xmm5
+ vaesenc xmm4, xmm4, [ebp+208]
+ vmovdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx1_aesenc_gfmul_last:
+ vaesenclast xmm4, xmm4, xmm5
+ vmovdqu xmm5, OWORD PTR [ecx]
+ vpxor xmm4, xmm4, xmm5
+ vmovdqu OWORD PTR [edx], xmm4
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ add ebx, 16
+ vpxor xmm2, xmm2, xmm4
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_avx1_last_block_start
+L_AES_GCM_encrypt_avx1_last_block_ghash:
+ ; ghash_gfmul_red_avx
+ vpshufd xmm5, xmm1, 78
+ vpshufd xmm6, xmm2, 78
+ vpclmulqdq xmm7, xmm2, xmm1, 17
+ vpclmulqdq xmm4, xmm2, xmm1, 0
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm6, xmm6, xmm2
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm4, xmm4, xmm6
+ vpxor xmm2, xmm7, xmm5
+ vpslld xmm5, xmm4, 31
+ vpslld xmm6, xmm4, 30
+ vpslld xmm7, xmm4, 25
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vpsrldq xmm7, xmm5, 4
+ vpslldq xmm5, xmm5, 12
+ vpxor xmm4, xmm4, xmm5
+ vpsrld xmm5, xmm4, 1
+ vpsrld xmm6, xmm4, 2
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm4
+ vpsrld xmm4, xmm4, 7
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm2, xmm5
+L_AES_GCM_encrypt_avx1_last_block_done:
+ mov ecx, DWORD PTR [esp+152]
+ mov edx, ecx
+ and ecx, 15
+ jz L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done
+ vmovdqu xmm0, OWORD PTR [esp+64]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpxor xmm0, xmm0, [ebp]
+ vaesenc xmm0, xmm0, [ebp+16]
+ vaesenc xmm0, xmm0, [ebp+32]
+ vaesenc xmm0, xmm0, [ebp+48]
+ vaesenc xmm0, xmm0, [ebp+64]
+ vaesenc xmm0, xmm0, [ebp+80]
+ vaesenc xmm0, xmm0, [ebp+96]
+ vaesenc xmm0, xmm0, [ebp+112]
+ vaesenc xmm0, xmm0, [ebp+128]
+ vaesenc xmm0, xmm0, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ vmovdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm5
+ vaesenc xmm0, xmm0, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ vmovdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm5
+ vaesenc xmm0, xmm0, [ebp+208]
+ vmovdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last:
+ vaesenclast xmm0, xmm0, xmm5
+ sub esp, 16
+ xor ecx, ecx
+ vmovdqu OWORD PTR [esp], xmm0
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop:
+ movzx eax, BYTE PTR [esi+ebx]
+ xor al, BYTE PTR [esp+ecx]
+ mov BYTE PTR [edi+ebx], al
+ mov BYTE PTR [esp+ecx], al
+ inc ebx
+ inc ecx
+ cmp ebx, edx
+ jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop
+ xor eax, eax
+ cmp ecx, 16
+ je L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop:
+ mov BYTE PTR [esp+ecx], al
+ inc ecx
+ cmp ecx, 16
+ jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc:
+ vmovdqu xmm0, OWORD PTR [esp]
+ add esp, 16
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm2, xmm2, xmm0
+ ; ghash_gfmul_red_avx
+ vpshufd xmm5, xmm1, 78
+ vpshufd xmm6, xmm2, 78
+ vpclmulqdq xmm7, xmm2, xmm1, 17
+ vpclmulqdq xmm4, xmm2, xmm1, 0
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm6, xmm6, xmm2
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm4, xmm4, xmm6
+ vpxor xmm2, xmm7, xmm5
+ vpslld xmm5, xmm4, 31
+ vpslld xmm6, xmm4, 30
+ vpslld xmm7, xmm4, 25
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vpsrldq xmm7, xmm5, 4
+ vpslldq xmm5, xmm5, 12
+ vpxor xmm4, xmm4, xmm5
+ vpsrld xmm5, xmm4, 1
+ vpsrld xmm6, xmm4, 2
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm4
+ vpsrld xmm4, xmm4, 7
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm2, xmm5
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done:
+L_AES_GCM_encrypt_avx1_done_enc:
+ mov edi, DWORD PTR [esp+148]
+ mov ebx, DWORD PTR [esp+164]
+ mov edx, DWORD PTR [esp+152]
+ mov ecx, DWORD PTR [esp+156]
+ shl edx, 3
+ shl ecx, 3
+ vpinsrd xmm4, xmm4, edx, 0
+ vpinsrd xmm4, xmm4, ecx, 2
+ mov edx, DWORD PTR [esp+152]
+ mov ecx, DWORD PTR [esp+156]
+ shr edx, 29
+ shr ecx, 29
+ vpinsrd xmm4, xmm4, edx, 1
+ vpinsrd xmm4, xmm4, ecx, 3
+ vpxor xmm2, xmm2, xmm4
+ ; ghash_gfmul_red_avx
+ vpshufd xmm5, xmm1, 78
+ vpshufd xmm6, xmm2, 78
+ vpclmulqdq xmm7, xmm2, xmm1, 17
+ vpclmulqdq xmm4, xmm2, xmm1, 0
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm6, xmm6, xmm2
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm4, xmm4, xmm6
+ vpxor xmm2, xmm7, xmm5
+ vpslld xmm5, xmm4, 31
+ vpslld xmm6, xmm4, 30
+ vpslld xmm7, xmm4, 25
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vpsrldq xmm7, xmm5, 4
+ vpslldq xmm5, xmm5, 12
+ vpxor xmm4, xmm4, xmm5
+ vpsrld xmm5, xmm4, 1
+ vpsrld xmm6, xmm4, 2
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm4
+ vpsrld xmm4, xmm4, 7
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpshufb xmm2, xmm2, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm2, [esp+80]
+ cmp ebx, 16
+ je L_AES_GCM_encrypt_avx1_store_tag_16
+ xor ecx, ecx
+ vmovdqu OWORD PTR [esp], xmm4
+L_AES_GCM_encrypt_avx1_store_tag_loop:
+ movzx eax, BYTE PTR [esp+ecx]
+ mov BYTE PTR [edi+ecx], al
+ inc ecx
+ cmp ecx, ebx
+ jne L_AES_GCM_encrypt_avx1_store_tag_loop
+ jmp L_AES_GCM_encrypt_avx1_store_tag_done
+L_AES_GCM_encrypt_avx1_store_tag_16:
+ vmovdqu OWORD PTR [edi], xmm4
+L_AES_GCM_encrypt_avx1_store_tag_done:
+ add esp, 112
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_encrypt_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_avx1 PROC
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 176
+ mov esi, DWORD PTR [esp+208]
+ mov ebp, DWORD PTR [esp+232]
+ mov edx, DWORD PTR [esp+224]
+ vpxor xmm0, xmm0, xmm0
+ vpxor xmm2, xmm2, xmm2
+ cmp edx, 12
+ jne L_AES_GCM_decrypt_avx1_iv_not_12
+ ; Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ mov ecx, 16777216
+ vpinsrd xmm0, xmm0, DWORD PTR [esi], 0
+ vpinsrd xmm0, xmm0, DWORD PTR [esi+4], 1
+ vpinsrd xmm0, xmm0, DWORD PTR [esi+8], 2
+ vpinsrd xmm0, xmm0, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqa xmm1, OWORD PTR [ebp]
+ vpxor xmm5, xmm0, xmm1
+ vmovdqa xmm3, OWORD PTR [ebp+16]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+32]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+48]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+64]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+80]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+96]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+112]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+128]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+144]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ cmp DWORD PTR [esp+236], 11
+ vmovdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx1_calc_iv_12_last
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+176]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ cmp DWORD PTR [esp+236], 13
+ vmovdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx1_calc_iv_12_last
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+208]
+ vaesenc xmm1, xmm1, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx1_calc_iv_12_last:
+ vaesenclast xmm1, xmm1, xmm3
+ vaesenclast xmm5, xmm5, xmm3
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vmovdqu OWORD PTR [esp+80], xmm5
+ jmp L_AES_GCM_decrypt_avx1_iv_done
+L_AES_GCM_decrypt_avx1_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqa xmm1, OWORD PTR [ebp]
+ vaesenc xmm1, xmm1, [ebp+16]
+ vaesenc xmm1, xmm1, [ebp+32]
+ vaesenc xmm1, xmm1, [ebp+48]
+ vaesenc xmm1, xmm1, [ebp+64]
+ vaesenc xmm1, xmm1, [ebp+80]
+ vaesenc xmm1, xmm1, [ebp+96]
+ vaesenc xmm1, xmm1, [ebp+112]
+ vaesenc xmm1, xmm1, [ebp+128]
+ vaesenc xmm1, xmm1, [ebp+144]
+ cmp DWORD PTR [esp+236], 11
+ vmovdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
+ vaesenc xmm1, xmm1, xmm5
+ vaesenc xmm1, xmm1, [ebp+176]
+ cmp DWORD PTR [esp+236], 13
+ vmovdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
+ vaesenc xmm1, xmm1, xmm5
+ vaesenc xmm1, xmm1, [ebp+208]
+ vmovdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm1, xmm1, xmm5
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ cmp edx, 0
+ mov ecx, 0
+ je L_AES_GCM_decrypt_avx1_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_decrypt_avx1_calc_iv_lt16
+ and edx, 4294967280
+L_AES_GCM_decrypt_avx1_calc_iv_16_loop:
+ vmovdqu xmm4, OWORD PTR [esi+ecx]
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm4
+ ; ghash_gfmul_avx
+ vpshufd xmm5, xmm0, 78
+ vpshufd xmm6, xmm1, 78
+ vpclmulqdq xmm7, xmm1, xmm0, 17
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpxor xmm5, xmm5, xmm0
+ vpxor xmm6, xmm6, xmm1
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vmovdqa xmm3, xmm4
+ vmovdqa xmm0, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm0, xmm0, xmm5
+ vpsrld xmm4, xmm3, 31
+ vpsrld xmm5, xmm0, 31
+ vpslld xmm3, xmm3, 1
+ vpslld xmm0, xmm0, 1
+ vpsrldq xmm6, xmm4, 12
+ vpslldq xmm4, xmm4, 4
+ vpslldq xmm5, xmm5, 4
+ vpor xmm0, xmm0, xmm6
+ vpor xmm3, xmm3, xmm4
+ vpor xmm0, xmm0, xmm5
+ vpslld xmm4, xmm3, 31
+ vpslld xmm5, xmm3, 30
+ vpslld xmm6, xmm3, 25
+ vpxor xmm4, xmm4, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vmovdqa xmm5, xmm4
+ vpsrldq xmm5, xmm5, 4
+ vpslldq xmm4, xmm4, 12
+ vpxor xmm3, xmm3, xmm4
+ vpsrld xmm6, xmm3, 1
+ vpsrld xmm7, xmm3, 2
+ vpsrld xmm4, xmm3, 7
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm0, xmm0, xmm6
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx1_calc_iv_16_loop
+ mov edx, DWORD PTR [esp+224]
+ cmp ecx, edx
+ je L_AES_GCM_decrypt_avx1_calc_iv_done
+L_AES_GCM_decrypt_avx1_calc_iv_lt16:
+ sub esp, 16
+ vpxor xmm4, xmm4, xmm4
+ xor ebx, ebx
+ vmovdqu OWORD PTR [esp], xmm4
+L_AES_GCM_decrypt_avx1_calc_iv_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx1_calc_iv_loop
+ vmovdqu xmm4, OWORD PTR [esp]
+ add esp, 16
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm4
+ ; ghash_gfmul_avx
+ vpshufd xmm5, xmm0, 78
+ vpshufd xmm6, xmm1, 78
+ vpclmulqdq xmm7, xmm1, xmm0, 17
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpxor xmm5, xmm5, xmm0
+ vpxor xmm6, xmm6, xmm1
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vmovdqa xmm3, xmm4
+ vmovdqa xmm0, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm0, xmm0, xmm5
+ vpsrld xmm4, xmm3, 31
+ vpsrld xmm5, xmm0, 31
+ vpslld xmm3, xmm3, 1
+ vpslld xmm0, xmm0, 1
+ vpsrldq xmm6, xmm4, 12
+ vpslldq xmm4, xmm4, 4
+ vpslldq xmm5, xmm5, 4
+ vpor xmm0, xmm0, xmm6
+ vpor xmm3, xmm3, xmm4
+ vpor xmm0, xmm0, xmm5
+ vpslld xmm4, xmm3, 31
+ vpslld xmm5, xmm3, 30
+ vpslld xmm6, xmm3, 25
+ vpxor xmm4, xmm4, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vmovdqa xmm5, xmm4
+ vpsrldq xmm5, xmm5, 4
+ vpslldq xmm4, xmm4, 12
+ vpxor xmm3, xmm3, xmm4
+ vpsrld xmm6, xmm3, 1
+ vpsrld xmm7, xmm3, 2
+ vpsrld xmm4, xmm3, 7
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm0, xmm0, xmm6
+L_AES_GCM_decrypt_avx1_calc_iv_done:
+ ; T = Encrypt counter
+ vpxor xmm4, xmm4, xmm4
+ shl edx, 3
+ vpinsrd xmm4, xmm4, edx, 0
+ vpxor xmm0, xmm0, xmm4
+ ; ghash_gfmul_avx
+ vpshufd xmm5, xmm0, 78
+ vpshufd xmm6, xmm1, 78
+ vpclmulqdq xmm7, xmm1, xmm0, 17
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpxor xmm5, xmm5, xmm0
+ vpxor xmm6, xmm6, xmm1
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vmovdqa xmm3, xmm4
+ vmovdqa xmm0, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm0, xmm0, xmm5
+ vpsrld xmm4, xmm3, 31
+ vpsrld xmm5, xmm0, 31
+ vpslld xmm3, xmm3, 1
+ vpslld xmm0, xmm0, 1
+ vpsrldq xmm6, xmm4, 12
+ vpslldq xmm4, xmm4, 4
+ vpslldq xmm5, xmm5, 4
+ vpor xmm0, xmm0, xmm6
+ vpor xmm3, xmm3, xmm4
+ vpor xmm0, xmm0, xmm5
+ vpslld xmm4, xmm3, 31
+ vpslld xmm5, xmm3, 30
+ vpslld xmm6, xmm3, 25
+ vpxor xmm4, xmm4, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vmovdqa xmm5, xmm4
+ vpsrldq xmm5, xmm5, 4
+ vpslldq xmm4, xmm4, 12
+ vpxor xmm3, xmm3, xmm4
+ vpsrld xmm6, xmm3, 1
+ vpsrld xmm7, xmm3, 2
+ vpsrld xmm4, xmm3, 7
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm0, xmm0, xmm6
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ ; Encrypt counter
+ vmovdqa xmm4, OWORD PTR [ebp]
+ vpxor xmm4, xmm4, xmm0
+ vaesenc xmm4, xmm4, [ebp+16]
+ vaesenc xmm4, xmm4, [ebp+32]
+ vaesenc xmm4, xmm4, [ebp+48]
+ vaesenc xmm4, xmm4, [ebp+64]
+ vaesenc xmm4, xmm4, [ebp+80]
+ vaesenc xmm4, xmm4, [ebp+96]
+ vaesenc xmm4, xmm4, [ebp+112]
+ vaesenc xmm4, xmm4, [ebp+128]
+ vaesenc xmm4, xmm4, [ebp+144]
+ cmp DWORD PTR [esp+236], 11
+ vmovdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm5
+ vaesenc xmm4, xmm4, [ebp+176]
+ cmp DWORD PTR [esp+236], 13
+ vmovdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm5
+ vaesenc xmm4, xmm4, [ebp+208]
+ vmovdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm4, xmm4, xmm5
+ vmovdqu OWORD PTR [esp+80], xmm4
+L_AES_GCM_decrypt_avx1_iv_done:
+ mov esi, DWORD PTR [esp+204]
+ ; Additional authentication data
+ mov edx, DWORD PTR [esp+220]
+ cmp edx, 0
+ je L_AES_GCM_decrypt_avx1_calc_aad_done
+ xor ecx, ecx
+ cmp edx, 16
+ jl L_AES_GCM_decrypt_avx1_calc_aad_lt16
+ and edx, 4294967280
+L_AES_GCM_decrypt_avx1_calc_aad_16_loop:
+ vmovdqu xmm4, OWORD PTR [esi+ecx]
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm2, xmm2, xmm4
+ ; ghash_gfmul_avx
+ vpshufd xmm5, xmm2, 78
+ vpshufd xmm6, xmm1, 78
+ vpclmulqdq xmm7, xmm1, xmm2, 17
+ vpclmulqdq xmm4, xmm1, xmm2, 0
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vmovdqa xmm3, xmm4
+ vmovdqa xmm2, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm2, xmm2, xmm5
+ vpsrld xmm4, xmm3, 31
+ vpsrld xmm5, xmm2, 31
+ vpslld xmm3, xmm3, 1
+ vpslld xmm2, xmm2, 1
+ vpsrldq xmm6, xmm4, 12
+ vpslldq xmm4, xmm4, 4
+ vpslldq xmm5, xmm5, 4
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm4
+ vpor xmm2, xmm2, xmm5
+ vpslld xmm4, xmm3, 31
+ vpslld xmm5, xmm3, 30
+ vpslld xmm6, xmm3, 25
+ vpxor xmm4, xmm4, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vmovdqa xmm5, xmm4
+ vpsrldq xmm5, xmm5, 4
+ vpslldq xmm4, xmm4, 12
+ vpxor xmm3, xmm3, xmm4
+ vpsrld xmm6, xmm3, 1
+ vpsrld xmm7, xmm3, 2
+ vpsrld xmm4, xmm3, 7
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm2, xmm2, xmm6
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx1_calc_aad_16_loop
+ mov edx, DWORD PTR [esp+220]
+ cmp ecx, edx
+ je L_AES_GCM_decrypt_avx1_calc_aad_done
+L_AES_GCM_decrypt_avx1_calc_aad_lt16:
+ sub esp, 16
+ vpxor xmm4, xmm4, xmm4
+ xor ebx, ebx
+ vmovdqu OWORD PTR [esp], xmm4
+L_AES_GCM_decrypt_avx1_calc_aad_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx1_calc_aad_loop
+ vmovdqu xmm4, OWORD PTR [esp]
+ add esp, 16
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm2, xmm2, xmm4
+ ; ghash_gfmul_avx
+ vpshufd xmm5, xmm2, 78
+ vpshufd xmm6, xmm1, 78
+ vpclmulqdq xmm7, xmm1, xmm2, 17
+ vpclmulqdq xmm4, xmm1, xmm2, 0
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vmovdqa xmm3, xmm4
+ vmovdqa xmm2, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm2, xmm2, xmm5
+ vpsrld xmm4, xmm3, 31
+ vpsrld xmm5, xmm2, 31
+ vpslld xmm3, xmm3, 1
+ vpslld xmm2, xmm2, 1
+ vpsrldq xmm6, xmm4, 12
+ vpslldq xmm4, xmm4, 4
+ vpslldq xmm5, xmm5, 4
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm4
+ vpor xmm2, xmm2, xmm5
+ vpslld xmm4, xmm3, 31
+ vpslld xmm5, xmm3, 30
+ vpslld xmm6, xmm3, 25
+ vpxor xmm4, xmm4, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vmovdqa xmm5, xmm4
+ vpsrldq xmm5, xmm5, 4
+ vpslldq xmm4, xmm4, 12
+ vpxor xmm3, xmm3, xmm4
+ vpsrld xmm6, xmm3, 1
+ vpsrld xmm7, xmm3, 2
+ vpsrld xmm4, xmm3, 7
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm2, xmm2, xmm6
+L_AES_GCM_decrypt_avx1_calc_aad_done:
+ vmovdqu OWORD PTR [esp+96], xmm2
+ mov esi, DWORD PTR [esp+196]
+ mov edi, DWORD PTR [esp+200]
+ ; Calculate counter and H
+ vpsrlq xmm5, xmm1, 63
+ vpsllq xmm4, xmm1, 1
+ vpslldq xmm5, xmm5, 8
+ vpor xmm4, xmm4, xmm5
+ vpshufd xmm1, xmm1, 255
+ vpsrad xmm1, xmm1, 31
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpand xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_mod2_128
+ vpaddd xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_one
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [esp+64], xmm0
+ xor ebx, ebx
+ cmp DWORD PTR [esp+216], 64
+ mov eax, DWORD PTR [esp+216]
+ jl L_AES_GCM_decrypt_avx1_done_64
+ and eax, 4294967232
+ vmovdqa xmm6, xmm2
+ ; H ^ 1
+ vmovdqu OWORD PTR [esp], xmm1
+ ; H ^ 2
+ vpclmulqdq xmm4, xmm1, xmm1, 0
+ vpclmulqdq xmm0, xmm1, xmm1, 17
+ vpslld xmm5, xmm4, 31
+ vpslld xmm6, xmm4, 30
+ vpslld xmm7, xmm4, 25
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vpsrldq xmm7, xmm5, 4
+ vpslldq xmm5, xmm5, 12
+ vpxor xmm4, xmm4, xmm5
+ vpsrld xmm5, xmm4, 1
+ vpsrld xmm6, xmm4, 2
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm4
+ vpsrld xmm4, xmm4, 7
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm0, xmm0, xmm5
+ vmovdqu OWORD PTR [esp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ vpshufd xmm5, xmm1, 78
+ vpshufd xmm6, xmm0, 78
+ vpclmulqdq xmm7, xmm0, xmm1, 17
+ vpclmulqdq xmm4, xmm0, xmm1, 0
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm6, xmm6, xmm0
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm4, xmm4, xmm6
+ vpxor xmm3, xmm7, xmm5
+ vpslld xmm5, xmm4, 31
+ vpslld xmm6, xmm4, 30
+ vpslld xmm7, xmm4, 25
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vpsrldq xmm7, xmm5, 4
+ vpslldq xmm5, xmm5, 12
+ vpxor xmm4, xmm4, xmm5
+ vpsrld xmm5, xmm4, 1
+ vpsrld xmm6, xmm4, 2
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm4
+ vpsrld xmm4, xmm4, 7
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm3, xmm3, xmm5
+ vmovdqu OWORD PTR [esp+32], xmm3
+ ; H ^ 4
+ vpclmulqdq xmm4, xmm0, xmm0, 0
+ vpclmulqdq xmm3, xmm0, xmm0, 17
+ vpslld xmm5, xmm4, 31
+ vpslld xmm6, xmm4, 30
+ vpslld xmm7, xmm4, 25
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vpsrldq xmm7, xmm5, 4
+ vpslldq xmm5, xmm5, 12
+ vpxor xmm4, xmm4, xmm5
+ vpsrld xmm5, xmm4, 1
+ vpsrld xmm6, xmm4, 2
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm4
+ vpsrld xmm4, xmm4, 7
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm3, xmm3, xmm5
+ vmovdqu OWORD PTR [esp+48], xmm3
+ cmp edi, esi
+ jne L_AES_GCM_decrypt_avx1_ghash_64
+L_AES_GCM_decrypt_avx1_ghash_64_inplace:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx1_four
+ vmovdqu OWORD PTR [esp+64], xmm3
+ vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm5, xmm4, OWORD PTR L_aes_gcm_avx1_one
+ vpshufb xmm5, xmm5, xmm3
+ vpaddd xmm6, xmm4, OWORD PTR L_aes_gcm_avx1_two
+ vpshufb xmm6, xmm6, xmm3
+ vpaddd xmm7, xmm4, OWORD PTR L_aes_gcm_avx1_three
+ vpshufb xmm7, xmm7, xmm3
+ vpshufb xmm4, xmm4, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp]
+ vpxor xmm4, xmm4, xmm3
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+16]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+32]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+48]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+64]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+80]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+96]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+112]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+128]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+144]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ cmp DWORD PTR [esp+236], 11
+ vmovdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+176]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ cmp DWORD PTR [esp+236], 13
+ vmovdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+208]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done:
+ vaesenclast xmm4, xmm4, xmm3
+ vaesenclast xmm5, xmm5, xmm3
+ vmovdqu xmm0, OWORD PTR [ecx]
+ vmovdqu xmm1, OWORD PTR [ecx+16]
+ vpxor xmm4, xmm4, xmm0
+ vpxor xmm5, xmm5, xmm1
+ vmovdqu OWORD PTR [esp+112], xmm0
+ vmovdqu OWORD PTR [esp+128], xmm1
+ vmovdqu OWORD PTR [edx], xmm4
+ vmovdqu OWORD PTR [edx+16], xmm5
+ vaesenclast xmm6, xmm6, xmm3
+ vaesenclast xmm7, xmm7, xmm3
+ vmovdqu xmm0, OWORD PTR [ecx+32]
+ vmovdqu xmm1, OWORD PTR [ecx+48]
+ vpxor xmm6, xmm6, xmm0
+ vpxor xmm7, xmm7, xmm1
+ vmovdqu OWORD PTR [esp+144], xmm0
+ vmovdqu OWORD PTR [esp+160], xmm1
+ vmovdqu OWORD PTR [edx+32], xmm6
+ vmovdqu OWORD PTR [edx+48], xmm7
+ ; GHASH the four input blocks saved at [esp+112]..[esp+160] before the in-place store
+ vmovdqu xmm6, OWORD PTR [esp+96]
+ vmovdqu xmm3, OWORD PTR [esp+48]
+ vmovdqu xmm4, OWORD PTR [esp+112]
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm6
+ vpshufd xmm5, xmm3, 78
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm7, xmm4, xmm3, 17
+ vpclmulqdq xmm6, xmm4, xmm3, 0
+ vpclmulqdq xmm5, xmm5, xmm1, 0
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vmovdqu xmm3, OWORD PTR [esp+32]
+ vmovdqu xmm4, OWORD PTR [esp+128]
+ vpshufd xmm0, xmm3, 78
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm3
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm2, xmm4, xmm3, 17
+ vpclmulqdq xmm3, xmm4, xmm3, 0
+ vpclmulqdq xmm0, xmm0, xmm1, 0
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm5, xmm5, xmm0
+ vmovdqu xmm3, OWORD PTR [esp+16]
+ vmovdqu xmm4, OWORD PTR [esp+144]
+ vpshufd xmm0, xmm3, 78
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm3
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm2, xmm4, xmm3, 17
+ vpclmulqdq xmm3, xmm4, xmm3, 0
+ vpclmulqdq xmm0, xmm0, xmm1, 0
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm5, xmm5, xmm0
+ vmovdqu xmm3, OWORD PTR [esp]
+ vmovdqu xmm4, OWORD PTR [esp+160]
+ vpshufd xmm0, xmm3, 78
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm3
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm2, xmm4, xmm3, 17
+ vpclmulqdq xmm3, xmm4, xmm3, 0
+ vpclmulqdq xmm0, xmm0, xmm1, 0
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm5, xmm5, xmm0
+ vpslldq xmm1, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm7, xmm7, xmm5
+ vpslld xmm3, xmm6, 31
+ vpslld xmm0, xmm6, 30
+ vpslld xmm1, xmm6, 25
+ vpxor xmm3, xmm3, xmm0
+ vpxor xmm3, xmm3, xmm1
+ vpsrldq xmm0, xmm3, 4
+ vpslldq xmm3, xmm3, 12
+ vpxor xmm6, xmm6, xmm3
+ vpsrld xmm1, xmm6, 1
+ vpsrld xmm5, xmm6, 2
+ vpsrld xmm4, xmm6, 7
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm6, xmm6, xmm7
+ vmovdqu OWORD PTR [esp+96], xmm6
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_avx1_ghash_64_inplace
+ jmp L_AES_GCM_decrypt_avx1_ghash_64_done
+L_AES_GCM_decrypt_avx1_ghash_64:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx1_four
+ vmovdqu OWORD PTR [esp+64], xmm3
+ vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm5, xmm4, OWORD PTR L_aes_gcm_avx1_one
+ vpshufb xmm5, xmm5, xmm3
+ vpaddd xmm6, xmm4, OWORD PTR L_aes_gcm_avx1_two
+ vpshufb xmm6, xmm6, xmm3
+ vpaddd xmm7, xmm4, OWORD PTR L_aes_gcm_avx1_three
+ vpshufb xmm7, xmm7, xmm3
+ vpshufb xmm4, xmm4, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp]
+ vpxor xmm4, xmm4, xmm3
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+16]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+32]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+48]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+64]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+80]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+96]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+112]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+128]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+144]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ cmp DWORD PTR [esp+236], 11
+ vmovdqa xmm3, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+176]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ cmp DWORD PTR [esp+236], 13
+ vmovdqa xmm3, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+208]
+ vaesenc xmm4, xmm4, xmm3
+ vaesenc xmm5, xmm5, xmm3
+ vaesenc xmm6, xmm6, xmm3
+ vaesenc xmm7, xmm7, xmm3
+ vmovdqa xmm3, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
+ vaesenclast xmm4, xmm4, xmm3
+ vaesenclast xmm5, xmm5, xmm3
+ vmovdqu xmm0, OWORD PTR [ecx]
+ vmovdqu xmm1, OWORD PTR [ecx+16]
+ vpxor xmm4, xmm4, xmm0
+ vpxor xmm5, xmm5, xmm1
+ vmovdqu OWORD PTR [edx], xmm4
+ vmovdqu OWORD PTR [edx+16], xmm5
+ vaesenclast xmm6, xmm6, xmm3
+ vaesenclast xmm7, xmm7, xmm3
+ vmovdqu xmm0, OWORD PTR [ecx+32]
+ vmovdqu xmm1, OWORD PTR [ecx+48]
+ vpxor xmm6, xmm6, xmm0
+ vpxor xmm7, xmm7, xmm1
+ vmovdqu OWORD PTR [edx+32], xmm6
+ vmovdqu OWORD PTR [edx+48], xmm7
+ ; GHASH the four input blocks read from [ecx]..[ecx+48]
+ vmovdqu xmm6, OWORD PTR [esp+96]
+ vmovdqu xmm3, OWORD PTR [esp+48]
+ vmovdqu xmm4, OWORD PTR [ecx]
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm6
+ vpshufd xmm5, xmm3, 78
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm7, xmm4, xmm3, 17
+ vpclmulqdq xmm6, xmm4, xmm3, 0
+ vpclmulqdq xmm5, xmm5, xmm1, 0
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vmovdqu xmm3, OWORD PTR [esp+32]
+ vmovdqu xmm4, OWORD PTR [ecx+16]
+ vpshufd xmm0, xmm3, 78
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm3
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm2, xmm4, xmm3, 17
+ vpclmulqdq xmm3, xmm4, xmm3, 0
+ vpclmulqdq xmm0, xmm0, xmm1, 0
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm5, xmm5, xmm0
+ vmovdqu xmm3, OWORD PTR [esp+16]
+ vmovdqu xmm4, OWORD PTR [ecx+32]
+ vpshufd xmm0, xmm3, 78
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm3
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm2, xmm4, xmm3, 17
+ vpclmulqdq xmm3, xmm4, xmm3, 0
+ vpclmulqdq xmm0, xmm0, xmm1, 0
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm5, xmm5, xmm0
+ vmovdqu xmm3, OWORD PTR [esp]
+ vmovdqu xmm4, OWORD PTR [ecx+48]
+ vpshufd xmm0, xmm3, 78
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm3
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm1, xmm1, xmm4
+ vpclmulqdq xmm2, xmm4, xmm3, 17
+ vpclmulqdq xmm3, xmm4, xmm3, 0
+ vpclmulqdq xmm0, xmm0, xmm1, 0
+ vpxor xmm5, xmm5, xmm3
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm5, xmm5, xmm0
+ vpslldq xmm1, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm7, xmm7, xmm5
+ vpslld xmm3, xmm6, 31
+ vpslld xmm0, xmm6, 30
+ vpslld xmm1, xmm6, 25
+ vpxor xmm3, xmm3, xmm0
+ vpxor xmm3, xmm3, xmm1
+ vpsrldq xmm0, xmm3, 4
+ vpslldq xmm3, xmm3, 12
+ vpxor xmm6, xmm6, xmm3
+ vpsrld xmm1, xmm6, 1
+ vpsrld xmm5, xmm6, 2
+ vpsrld xmm4, xmm6, 7
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm6, xmm6, xmm7
+ vmovdqu OWORD PTR [esp+96], xmm6
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_avx1_ghash_64
+L_AES_GCM_decrypt_avx1_ghash_64_done:
+ vmovdqa xmm2, xmm6
+ vmovdqu xmm1, OWORD PTR [esp]
+L_AES_GCM_decrypt_avx1_done_64:
+ mov edx, DWORD PTR [esp+216]
+ cmp ebx, edx
+ jge L_AES_GCM_decrypt_avx1_done_dec
+ mov eax, DWORD PTR [esp+216]
+ and eax, 4294967280
+ cmp ebx, eax
+ jge L_AES_GCM_decrypt_avx1_last_block_done
+L_AES_GCM_decrypt_avx1_last_block_start:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ vmovdqu xmm7, OWORD PTR [ecx]
+ pshufb xmm7, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ pxor xmm7, xmm2
+ vmovdqu xmm5, OWORD PTR [esp+64]
+ vpshufb xmm4, xmm5, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_one
+ vmovdqu OWORD PTR [esp+64], xmm5
+ vpxor xmm4, xmm4, [ebp]
+ vpclmulqdq xmm0, xmm7, xmm1, 16
+ vaesenc xmm4, xmm4, [ebp+16]
+ vaesenc xmm4, xmm4, [ebp+32]
+ vpclmulqdq xmm3, xmm7, xmm1, 1
+ vaesenc xmm4, xmm4, [ebp+48]
+ vaesenc xmm4, xmm4, [ebp+64]
+ vaesenc xmm4, xmm4, [ebp+80]
+ vpclmulqdq xmm5, xmm7, xmm1, 17
+ vaesenc xmm4, xmm4, [ebp+96]
+ vpxor xmm0, xmm0, xmm3
+ vpslldq xmm6, xmm0, 8
+ vpsrldq xmm0, xmm0, 8
+ vaesenc xmm4, xmm4, [ebp+112]
+ vpclmulqdq xmm3, xmm7, xmm1, 0
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm0
+ vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_mod2_128
+ vpclmulqdq xmm3, xmm6, xmm7, 16
+ vaesenc xmm4, xmm4, [ebp+128]
+ vpshufd xmm0, xmm6, 78
+ vpxor xmm0, xmm0, xmm3
+ vpclmulqdq xmm3, xmm0, xmm7, 16
+ vaesenc xmm4, xmm4, [ebp+144]
+ vpshufd xmm2, xmm0, 78
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm5
+ cmp DWORD PTR [esp+236], 11
+ vmovdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
+ vaesenc xmm4, xmm4, xmm5
+ vaesenc xmm4, xmm4, [ebp+176]
+ cmp DWORD PTR [esp+236], 13
+ vmovdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
+ vaesenc xmm4, xmm4, xmm5
+ vaesenc xmm4, xmm4, [ebp+208]
+ vmovdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx1_aesenc_gfmul_last:
+ vaesenclast xmm4, xmm4, xmm5
+ vmovdqu xmm5, OWORD PTR [ecx]
+ vpxor xmm4, xmm4, xmm5
+ vmovdqu OWORD PTR [edx], xmm4
+ add ebx, 16
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_avx1_last_block_start
+L_AES_GCM_decrypt_avx1_last_block_done:
+ mov ecx, DWORD PTR [esp+216]
+ mov edx, ecx
+ and ecx, 15
+ jz L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done
+ vmovdqu xmm0, OWORD PTR [esp+64]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpxor xmm0, xmm0, [ebp]
+ vaesenc xmm0, xmm0, [ebp+16]
+ vaesenc xmm0, xmm0, [ebp+32]
+ vaesenc xmm0, xmm0, [ebp+48]
+ vaesenc xmm0, xmm0, [ebp+64]
+ vaesenc xmm0, xmm0, [ebp+80]
+ vaesenc xmm0, xmm0, [ebp+96]
+ vaesenc xmm0, xmm0, [ebp+112]
+ vaesenc xmm0, xmm0, [ebp+128]
+ vaesenc xmm0, xmm0, [ebp+144]
+ cmp DWORD PTR [esp+236], 11
+ vmovdqa xmm5, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm5
+ vaesenc xmm0, xmm0, [ebp+176]
+ cmp DWORD PTR [esp+236], 13
+ vmovdqa xmm5, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm5
+ vaesenc xmm0, xmm0, [ebp+208]
+ vmovdqa xmm5, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last:
+ vaesenclast xmm0, xmm0, xmm5
+ sub esp, 32
+ xor ecx, ecx
+ vmovdqu OWORD PTR [esp], xmm0
+ vpxor xmm4, xmm4, xmm4
+ vmovdqu OWORD PTR [esp+16], xmm4
+L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop:
+ movzx eax, BYTE PTR [esi+ebx]
+ mov BYTE PTR [esp+ecx+16], al
+ xor al, BYTE PTR [esp+ecx]
+ mov BYTE PTR [edi+ebx], al
+ inc ebx
+ inc ecx
+ cmp ebx, edx
+ jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop
+ vmovdqu xmm0, OWORD PTR [esp+16]
+ add esp, 32
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm2, xmm2, xmm0
+ ; ghash_gfmul_red_avx
+ vpshufd xmm5, xmm1, 78
+ vpshufd xmm6, xmm2, 78
+ vpclmulqdq xmm7, xmm2, xmm1, 17
+ vpclmulqdq xmm4, xmm2, xmm1, 0
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm6, xmm6, xmm2
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm4, xmm4, xmm6
+ vpxor xmm2, xmm7, xmm5
+ vpslld xmm5, xmm4, 31
+ vpslld xmm6, xmm4, 30
+ vpslld xmm7, xmm4, 25
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vpsrldq xmm7, xmm5, 4
+ vpslldq xmm5, xmm5, 12
+ vpxor xmm4, xmm4, xmm5
+ vpsrld xmm5, xmm4, 1
+ vpsrld xmm6, xmm4, 2
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm4
+ vpsrld xmm4, xmm4, 7
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm2, xmm5
+L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done:
+L_AES_GCM_decrypt_avx1_done_dec:
+ mov esi, DWORD PTR [esp+212]
+ mov ebp, DWORD PTR [esp+228]
+ mov edx, DWORD PTR [esp+216]
+ mov ecx, DWORD PTR [esp+220]
+ shl edx, 3
+ shl ecx, 3
+ vpinsrd xmm4, xmm4, edx, 0
+ vpinsrd xmm4, xmm4, ecx, 2
+ mov edx, DWORD PTR [esp+216]
+ mov ecx, DWORD PTR [esp+220]
+ shr edx, 29
+ shr ecx, 29
+ vpinsrd xmm4, xmm4, edx, 1
+ vpinsrd xmm4, xmm4, ecx, 3
+ vpxor xmm2, xmm2, xmm4
+ ; ghash_gfmul_red_avx
+ vpshufd xmm5, xmm1, 78
+ vpshufd xmm6, xmm2, 78
+ vpclmulqdq xmm7, xmm2, xmm1, 17
+ vpclmulqdq xmm4, xmm2, xmm1, 0
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm6, xmm6, xmm2
+ vpclmulqdq xmm5, xmm5, xmm6, 0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm5, xmm5, xmm7
+ vpslldq xmm6, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm4, xmm4, xmm6
+ vpxor xmm2, xmm7, xmm5
+ vpslld xmm5, xmm4, 31
+ vpslld xmm6, xmm4, 30
+ vpslld xmm7, xmm4, 25
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm7
+ vpsrldq xmm7, xmm5, 4
+ vpslldq xmm5, xmm5, 12
+ vpxor xmm4, xmm4, xmm5
+ vpsrld xmm5, xmm4, 1
+ vpsrld xmm6, xmm4, 2
+ vpxor xmm5, xmm5, xmm6
+ vpxor xmm5, xmm5, xmm4
+ vpsrld xmm4, xmm4, 7
+ vpxor xmm5, xmm5, xmm7
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpshufb xmm2, xmm2, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm2, [esp+80]
+ mov edi, DWORD PTR [esp+240]
+ cmp ebp, 16
+ je L_AES_GCM_decrypt_avx1_cmp_tag_16
+ sub esp, 16
+ xor ecx, ecx
+ xor ebx, ebx
+ vmovdqu OWORD PTR [esp], xmm4
+L_AES_GCM_decrypt_avx1_cmp_tag_loop:
+ movzx eax, BYTE PTR [esp+ecx]
+ xor al, BYTE PTR [esi+ecx]
+ or bl, al
+ inc ecx
+ cmp ecx, ebp
+ jne L_AES_GCM_decrypt_avx1_cmp_tag_loop
+ cmp bl, 0
+ sete bl
+ add esp, 16
+ xor ecx, ecx
+ jmp L_AES_GCM_decrypt_avx1_cmp_tag_done
+L_AES_GCM_decrypt_avx1_cmp_tag_16:
+ vmovdqu xmm5, OWORD PTR [esi]
+ vpcmpeqb xmm4, xmm4, xmm5
+ vpmovmskb edx, xmm4
+ ; %%edx == 0xFFFF then return 1 else => return 0
+ xor ebx, ebx
+ cmp edx, 65535
+ sete bl
+L_AES_GCM_decrypt_avx1_cmp_tag_done:
+ mov DWORD PTR [edi], ebx
+ add esp, 176
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_decrypt_avx1 ENDP
+_TEXT ENDS
+IFDEF WOLFSSL_AESGCM_STREAM
+_TEXT SEGMENT READONLY PARA
+AES_GCM_init_avx1 PROC ; Computes the hash key H, the next counter block and the encrypted initial counter block from the round keys and the IV; all arguments are read from the stack
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 16 ; four pushes + these 16 bytes + the return address put the first stack argument at [esp+36]
+ mov ebp, DWORD PTR [esp+36] ; ebp = base of the AES round keys (read below as [ebp+16*i])
+ mov esi, DWORD PTR [esp+44] ; esi = IV bytes
+ mov edi, DWORD PTR [esp+60] ; edi = output for the encrypted initial counter block
+ vpxor xmm4, xmm4, xmm4 ; xmm4 = 0: becomes the counter block (12-byte IV) or the hash accumulator (any other IV length)
+ mov edx, DWORD PTR [esp+48] ; edx = IV length in bytes
+ cmp edx, 12
+ jne L_AES_GCM_init_avx1_iv_not_12
+ ; Calculate values when IV is 12 bytes
+ ; Set counter based on IV: the counter block is the IV followed by a fixed 4-byte tail
+ mov ecx, 16777216 ; 0x01000000, which is stored little-endian as the bytes 00 00 00 01
+ vpinsrd xmm4, xmm4, DWORD PTR [esi], 0
+ vpinsrd xmm4, xmm4, DWORD PTR [esi+4], 1
+ vpinsrd xmm4, xmm4, DWORD PTR [esi+8], 2
+ vpinsrd xmm4, xmm4, ecx, 3 ; xmm4 = IV[0..11] followed by 00 00 00 01
+ ; H = Encrypt X(=0) and T = Encrypt counter, two AES states interleaved round by round
+ vmovdqa xmm5, OWORD PTR [ebp] ; round key 0 equals (0 XOR round key 0), the starting state for encrypting the zero block
+ vpxor xmm1, xmm4, xmm5 ; starting state for encrypting the counter block
+ vmovdqa xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ cmp DWORD PTR [esp+40], 11 ; [esp+40] = number of AES rounds
+ vmovdqa xmm7, OWORD PTR [ebp+160] ; the load leaves EFLAGS untouched, so the jl below still tests the cmp above
+ jl L_AES_GCM_init_avx1_calc_iv_12_last ; fewer than 11 rounds: [ebp+160] is the final round key
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ cmp DWORD PTR [esp+40], 13
+ vmovdqa xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_init_avx1_calc_iv_12_last ; fewer than 13 rounds: [ebp+192] is the final round key
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+224] ; otherwise the final round key is at [ebp+224]
+L_AES_GCM_init_avx1_calc_iv_12_last:
+ vaesenclast xmm5, xmm5, xmm7 ; xmm5 = encryption of the all-zero block
+ vaesenclast xmm1, xmm1, xmm7 ; xmm1 = encryption of the counter block
+ vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_bswap_mask ; H after the bswap_mask shuffle (presumably a full 16-byte reversal; the table is defined outside this view)
+ vmovdqu OWORD PTR [edi], xmm1 ; write the encrypted initial counter block
+ jmp L_AES_GCM_init_avx1_iv_done
+L_AES_GCM_init_avx1_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqa xmm5, OWORD PTR [ebp] ; round key 0 equals (0 XOR round key 0)
+ vaesenc xmm5, xmm5, [ebp+16]
+ vaesenc xmm5, xmm5, [ebp+32]
+ vaesenc xmm5, xmm5, [ebp+48]
+ vaesenc xmm5, xmm5, [ebp+64]
+ vaesenc xmm5, xmm5, [ebp+80]
+ vaesenc xmm5, xmm5, [ebp+96]
+ vaesenc xmm5, xmm5, [ebp+112]
+ vaesenc xmm5, xmm5, [ebp+128]
+ vaesenc xmm5, xmm5, [ebp+144]
+ cmp DWORD PTR [esp+40], 11 ; same round-count dispatch as the 12-byte path, with xmm1 holding the candidate final round key
+ vmovdqa xmm1, OWORD PTR [ebp+160]
+ jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm1
+ vaesenc xmm5, xmm5, [ebp+176]
+ cmp DWORD PTR [esp+40], 13
+ vmovdqa xmm1, OWORD PTR [ebp+192]
+ jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm1
+ vaesenc xmm5, xmm5, [ebp+208]
+ vmovdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm5, xmm5, xmm1
+ vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_bswap_mask ; H after the bswap_mask shuffle
+ ; Calc counter by hashing the IV with H; xmm4 (zero so far) is the accumulator
+ ; Initialization vector
+ cmp edx, 0
+ mov ecx, 0 ; ecx = byte offset into the IV; mov (unlike xor) keeps the flags of the cmp for the je
+ je L_AES_GCM_init_avx1_calc_iv_done ; empty IV: only the length block is hashed
+ cmp edx, 16
+ jl L_AES_GCM_init_avx1_calc_iv_lt16 ; fewer than 16 bytes: only a partial block to hash
+ and edx, 4294967280 ; 0xFFFFFFF0: number of bytes covered by whole 16-byte blocks
+L_AES_GCM_init_avx1_calc_iv_16_loop:
+ vmovdqu xmm0, OWORD PTR [esi+ecx] ; next 16 IV bytes
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm0 ; XOR the block into the accumulator
+ ; ghash_gfmul_avx: xmm4 = reduced carry-less product of xmm4 and H (xmm5)
+ vpshufd xmm1, xmm4, 78 ; 78 = 01001110b: swap the two 64-bit halves
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17 ; 0x11: high qword * high qword
+ vpclmulqdq xmm0, xmm5, xmm4, 0 ; low qword * low qword
+ vpxor xmm1, xmm1, xmm4 ; (low XOR high) of the accumulator
+ vpxor xmm2, xmm2, xmm5 ; (low XOR high) of H
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3 ; xmm1 = middle (cross) term of the Karatsuba product
+ vmovdqa xmm7, xmm0 ; xmm7 = low 128 bits of the product
+ vmovdqa xmm4, xmm3 ; xmm4 = high 128 bits of the product
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2 ; fold the middle term into both halves
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm7, 31 ; shift the 256-bit product xmm4:xmm7 left by one bit; first collect the top bit of every dword
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12 ; carry out of the low half into the high half
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm7, 31 ; reduction of the low half using dword shifts by 31/30/25 and then 1/2/7
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm4, xmm4, xmm2 ; xmm4 = updated 128-bit accumulator
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_init_avx1_calc_iv_16_loop
+ mov edx, DWORD PTR [esp+48] ; reload the full IV length (edx was rounded down above)
+ cmp ecx, edx
+ je L_AES_GCM_init_avx1_calc_iv_done ; no partial block left
+L_AES_GCM_init_avx1_calc_iv_lt16:
+ sub esp, 16 ; 16-byte temporary for the zero-padded tail; esp-relative argument offsets are shifted by 16 until the matching add
+ vpxor xmm0, xmm0, xmm0
+ xor ebx, ebx ; ebx = index into the temporary
+ vmovdqu OWORD PTR [esp], xmm0 ; zero the temporary so unused bytes act as padding
+L_AES_GCM_init_avx1_calc_iv_loop:
+ movzx eax, BYTE PTR [esi+ecx] ; copy the remaining IV bytes one at a time
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_init_avx1_calc_iv_loop
+ vmovdqu xmm0, OWORD PTR [esp] ; xmm0 = tail bytes followed by zeros
+ add esp, 16 ; release the temporary
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm0 ; XOR the padded block into the accumulator
+ ; ghash_gfmul_avx: same multiply-and-reduce sequence as in the 16-byte loop above
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm4, xmm4, xmm2
+L_AES_GCM_init_avx1_calc_iv_done:
+ ; Hash the IV bit length into the accumulator to finish the counter block, then T = Encrypt counter
+ vpxor xmm0, xmm0, xmm0
+ shl edx, 3 ; edx holds the IV byte length on every path here; *8 gives the bit length (low 32 bits only)
+ vpinsrd xmm0, xmm0, edx, 0
+ vpxor xmm4, xmm4, xmm0 ; XOR the length block into the accumulator
+ ; ghash_gfmul_avx: same multiply-and-reduce sequence as in the 16-byte loop above
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm4, xmm4, xmm2
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask ; shuffle the hash back with bswap_mask: xmm4 is the counter block for this IV
+ ; Encrypt counter
+ vmovdqa xmm0, OWORD PTR [ebp]
+ vpxor xmm0, xmm0, xmm4 ; round 0 on a copy; xmm4 itself is still needed after the encryption
+ vaesenc xmm0, xmm0, [ebp+16]
+ vaesenc xmm0, xmm0, [ebp+32]
+ vaesenc xmm0, xmm0, [ebp+48]
+ vaesenc xmm0, xmm0, [ebp+64]
+ vaesenc xmm0, xmm0, [ebp+80]
+ vaesenc xmm0, xmm0, [ebp+96]
+ vaesenc xmm0, xmm0, [ebp+112]
+ vaesenc xmm0, xmm0, [ebp+128]
+ vaesenc xmm0, xmm0, [ebp+144]
+ cmp DWORD PTR [esp+40], 11 ; same round-count dispatch as above
+ vmovdqa xmm1, OWORD PTR [ebp+160]
+ jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [ebp+176]
+ cmp DWORD PTR [esp+40], 13
+ vmovdqa xmm1, OWORD PTR [ebp+192]
+ jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [ebp+208]
+ vmovdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm0, xmm0, xmm1
+ vmovdqu OWORD PTR [edi], xmm0 ; write the encrypted initial counter block
+L_AES_GCM_init_avx1_iv_done:
+ mov ebp, DWORD PTR [esp+52] ; ebp = output for H (the round-key pointer is no longer needed)
+ mov edi, DWORD PTR [esp+56] ; edi = output for the next counter block
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_epi64 ; presumably swaps bytes within each 64-bit lane so vpaddd can increment the counter; table not in view, TODO confirm
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_one ; step past the initial counter block (assumes the constant adds 1 to the counter dword; confirm)
+ vmovdqa OWORD PTR [ebp], xmm5 ; aligned store: the H output must be 16-byte aligned
+ vmovdqa OWORD PTR [edi], xmm4 ; aligned store: the counter output must be 16-byte aligned
+ add esp, 16
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_init_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_aad_update_avx1 PROC ; Folds 16-byte blocks of input into a running hash value kept in memory, using the hash key H; arguments are read from the stack
+ push esi
+ push edi
+ mov esi, DWORD PTR [esp+12] ; esi = input data (two pushes + return address put the first argument at [esp+12])
+ mov edx, DWORD PTR [esp+16] ; edx = input length in bytes
+ mov edi, DWORD PTR [esp+20] ; edi = running hash value, read here and written back at the end
+ mov eax, DWORD PTR [esp+24] ; eax = hash key H
+ vmovdqa xmm5, OWORD PTR [edi] ; aligned loads: both pointers must be 16-byte aligned
+ vmovdqa xmm6, OWORD PTR [eax]
+ xor ecx, ecx ; ecx = byte offset into the input
+L_AES_GCM_aad_update_avx1_16_loop:
+ vmovdqu xmm0, OWORD PTR [esi+ecx] ; the length test is at the bottom, so one block is always read; NOTE(review): callers presumably pass a positive multiple of 16, confirm
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask ; presumably a full 16-byte reversal; the table is defined outside this view
+ vpxor xmm5, xmm5, xmm0 ; XOR the block into the running value
+ ; ghash_gfmul_avx: xmm5 = reduced carry-less product of xmm5 and H (xmm6)
+ vpshufd xmm1, xmm5, 78 ; 78 = 01001110b: swap the two 64-bit halves
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17 ; 0x11: high qword * high qword
+ vpclmulqdq xmm0, xmm6, xmm5, 0 ; low qword * low qword
+ vpxor xmm1, xmm1, xmm5 ; (low XOR high) of the running value
+ vpxor xmm2, xmm2, xmm6 ; (low XOR high) of H
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3 ; xmm1 = middle (cross) term of the Karatsuba product
+ vmovdqa xmm4, xmm0 ; xmm4 = low 128 bits of the product
+ vmovdqa xmm5, xmm3 ; xmm5 = high 128 bits of the product
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2 ; fold the middle term into both halves
+ vpxor xmm5, xmm5, xmm1
+ vpsrld xmm0, xmm4, 31 ; shift the 256-bit product xmm5:xmm4 left by one bit; first collect the top bit of every dword
+ vpsrld xmm1, xmm5, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm5, xmm5, 1
+ vpsrldq xmm2, xmm0, 12 ; carry out of the low half into the high half
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm5, xmm5, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm5, xmm5, xmm1
+ vpslld xmm0, xmm4, 31 ; reduction of the low half using dword shifts by 31/30/25 and then 1/2/7
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm5, xmm5, xmm2 ; xmm5 = updated 128-bit running value
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_aad_update_avx1_16_loop ; signed compare of the offset against the length
+ vmovdqa OWORD PTR [edi], xmm5 ; write the running value back (aligned store)
+ pop edi
+ pop esi
+ ret
+AES_GCM_aad_update_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_block_avx1 PROC ; Encrypts one 16-byte block in counter mode and advances the counter stored in memory; arguments are read from the stack
+ push esi
+ push edi
+ mov ecx, DWORD PTR [esp+12] ; ecx = base of the AES round keys (two pushes + return address put the first argument at [esp+12])
+ mov eax, DWORD PTR [esp+16] ; eax = number of AES rounds
+ mov edi, DWORD PTR [esp+20] ; edi = output block
+ mov esi, DWORD PTR [esp+24] ; esi = input block
+ mov edx, DWORD PTR [esp+28] ; edx = counter block in memory
+ vmovdqu xmm1, OWORD PTR [edx]
+ vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_epi64 ; xmm0 = shuffled copy of the current counter, the block that gets encrypted (presumably a byte swap within each 64-bit lane; table not in view)
+ vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_one ; next counter value (assumes the constant adds 1 to the counter dword; confirm)
+ vmovdqu OWORD PTR [edx], xmm1 ; store the advanced counter for the next call
+ vpxor xmm0, xmm0, [ecx] ; round 0: XOR with the first round key
+ vaesenc xmm0, xmm0, [ecx+16]
+ vaesenc xmm0, xmm0, [ecx+32]
+ vaesenc xmm0, xmm0, [ecx+48]
+ vaesenc xmm0, xmm0, [ecx+64]
+ vaesenc xmm0, xmm0, [ecx+80]
+ vaesenc xmm0, xmm0, [ecx+96]
+ vaesenc xmm0, xmm0, [ecx+112]
+ vaesenc xmm0, xmm0, [ecx+128]
+ vaesenc xmm0, xmm0, [ecx+144]
+ cmp eax, 11
+ vmovdqa xmm1, OWORD PTR [ecx+160] ; the load leaves EFLAGS untouched, so the jl below still tests the cmp above
+ jl L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last ; fewer than 11 rounds: [ecx+160] is the final round key
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [ecx+176]
+ cmp eax, 13
+ vmovdqa xmm1, OWORD PTR [ecx+192]
+ jl L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last ; fewer than 13 rounds: [ecx+192] is the final round key
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [ecx+208]
+ vmovdqa xmm1, OWORD PTR [ecx+224] ; otherwise the final round key is at [ecx+224]
+L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last:
+ vaesenclast xmm0, xmm0, xmm1 ; xmm0 = encrypted counter (keystream block)
+ vmovdqu xmm1, OWORD PTR [esi]
+ vpxor xmm0, xmm0, xmm1 ; input XOR keystream
+ vmovdqu OWORD PTR [edi], xmm0 ; write the output block
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask ; result stays in xmm0 and is not stored before ret; NOTE(review): looks unused unless a caller reads xmm0, confirm
+ pop edi
+ pop esi
+ ret
+AES_GCM_encrypt_block_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_ghash_block_avx1 PROC ; Folds one 16-byte block into a hash value kept in memory: XORs the block in, then multiplies by the hash key H and reduces; arguments are read from the stack
+ mov edx, DWORD PTR [esp+4] ; edx = input block (no pushes, so the first argument is at [esp+4])
+ mov eax, DWORD PTR [esp+8] ; eax = hash value, read here and written back at the end
+ mov ecx, DWORD PTR [esp+12] ; ecx = hash key H
+ vmovdqa xmm4, OWORD PTR [eax] ; aligned loads: both pointers must be 16-byte aligned
+ vmovdqa xmm5, OWORD PTR [ecx]
+ vmovdqu xmm0, OWORD PTR [edx]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask ; presumably a full 16-byte reversal; the table is defined outside this view
+ vpxor xmm4, xmm4, xmm0 ; XOR the block into the hash value
+ ; ghash_gfmul_avx: xmm4 = reduced carry-less product of xmm4 and H (xmm5)
+ vpshufd xmm1, xmm4, 78 ; 78 = 01001110b: swap the two 64-bit halves
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17 ; 0x11: high qword * high qword
+ vpclmulqdq xmm0, xmm5, xmm4, 0 ; low qword * low qword
+ vpxor xmm1, xmm1, xmm4 ; (low XOR high) of the hash value
+ vpxor xmm2, xmm2, xmm5 ; (low XOR high) of H
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3 ; xmm1 = middle (cross) term of the Karatsuba product
+ vmovdqa xmm6, xmm0 ; xmm6 = low 128 bits of the product
+ vmovdqa xmm4, xmm3 ; xmm4 = high 128 bits of the product
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2 ; fold the middle term into both halves
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm6, 31 ; shift the 256-bit product xmm4:xmm6 left by one bit; first collect the top bit of every dword
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12 ; carry out of the low half into the high half
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm6, 31 ; reduction of the low half using dword shifts by 31/30/25 and then 1/2/7
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2 ; xmm4 = updated 128-bit hash value
+ vmovdqa OWORD PTR [eax], xmm4 ; write the hash value back (aligned store)
+ ret
+AES_GCM_ghash_block_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_update_avx1 PROC
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 96
+ mov esi, DWORD PTR [esp+144]
+ vmovdqa xmm4, OWORD PTR [esi]
+ vmovdqu OWORD PTR [esp+64], xmm4
+ mov esi, DWORD PTR [esp+136]
+ mov ebp, DWORD PTR [esp+140]
+ vmovdqa xmm6, OWORD PTR [esi]
+ vmovdqa xmm5, OWORD PTR [ebp]
+ vmovdqu OWORD PTR [esp+80], xmm6
+ mov ebp, DWORD PTR [esp+116]
+ mov edi, DWORD PTR [esp+124]
+ mov esi, DWORD PTR [esp+128]
+ vpsrlq xmm1, xmm5, 63
+ vpsllq xmm0, xmm5, 1
+ vpslldq xmm1, xmm1, 8
+ vpor xmm0, xmm0, xmm1
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_mod2_128
+ vpxor xmm5, xmm5, xmm0
+ xor ebx, ebx
+ cmp DWORD PTR [esp+132], 64
+ mov eax, DWORD PTR [esp+132]
+ jl L_AES_GCM_encrypt_update_avx1_done_64
+ and eax, 4294967232
+ vmovdqa xmm2, xmm6
+ ; H ^ 1
+ vmovdqu OWORD PTR [esp], xmm5
+ ; H ^ 2
+ vpclmulqdq xmm0, xmm5, xmm5, 0
+ vpclmulqdq xmm4, xmm5, xmm5, 17
+ vpslld xmm1, xmm0, 31
+ vpslld xmm2, xmm0, 30
+ vpslld xmm3, xmm0, 25
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vpsrldq xmm3, xmm1, 4
+ vpslldq xmm1, xmm1, 12
+ vpxor xmm0, xmm0, xmm1
+ vpsrld xmm1, xmm0, 1
+ vpsrld xmm2, xmm0, 2
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm0
+ vpsrld xmm0, xmm0, 7
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+ vmovdqu OWORD PTR [esp+16], xmm4
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm4, 78
+ vpclmulqdq xmm3, xmm4, xmm5, 17
+ vpclmulqdq xmm0, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm4
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm7, xmm3, xmm1
+ vpslld xmm1, xmm0, 31
+ vpslld xmm2, xmm0, 30
+ vpslld xmm3, xmm0, 25
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vpsrldq xmm3, xmm1, 4
+ vpslldq xmm1, xmm1, 12
+ vpxor xmm0, xmm0, xmm1
+ vpsrld xmm1, xmm0, 1
+ vpsrld xmm2, xmm0, 2
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm0
+ vpsrld xmm0, xmm0, 7
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm7, xmm7, xmm1
+ vmovdqu OWORD PTR [esp+32], xmm7
+ ; H ^ 4
+ vpclmulqdq xmm0, xmm4, xmm4, 0
+ vpclmulqdq xmm7, xmm4, xmm4, 17
+ vpslld xmm1, xmm0, 31
+ vpslld xmm2, xmm0, 30
+ vpslld xmm3, xmm0, 25
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vpsrldq xmm3, xmm1, 4
+ vpslldq xmm1, xmm1, 12
+ vpxor xmm0, xmm0, xmm1
+ vpsrld xmm1, xmm0, 1
+ vpsrld xmm2, xmm0, 2
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm0
+ vpsrld xmm0, xmm0, 7
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm7, xmm7, xmm1
+ vmovdqu OWORD PTR [esp+48], xmm7
+ ; First 64 bytes of input
+ vmovdqu xmm0, OWORD PTR [esp+64]
+ vpaddd xmm7, xmm0, OWORD PTR L_aes_gcm_avx1_four
+ vmovdqu OWORD PTR [esp+64], xmm7
+ vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm1, xmm0, OWORD PTR L_aes_gcm_avx1_one
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm2, xmm0, OWORD PTR L_aes_gcm_avx1_two
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm3, xmm0, OWORD PTR L_aes_gcm_avx1_three
+ vpshufb xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+120], 11
+ vmovdqa xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+120], 13
+ vmovdqa xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done:
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vmovdqu xmm4, OWORD PTR [esi]
+ vmovdqu xmm5, OWORD PTR [esi+16]
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm1, xmm1, xmm5
+ vmovdqu OWORD PTR [edi], xmm0
+ vmovdqu OWORD PTR [edi+16], xmm1
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm4, OWORD PTR [esi+32]
+ vmovdqu xmm5, OWORD PTR [esi+48]
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm3, xmm3, xmm5
+ vmovdqu OWORD PTR [edi+32], xmm2
+ vmovdqu OWORD PTR [edi+48], xmm3
+ cmp eax, 64
+ mov ebx, 64
+ mov ecx, esi
+ mov edx, edi
+ jle L_AES_GCM_encrypt_update_avx1_end_64
+ ; More 64 bytes of input
+L_AES_GCM_encrypt_update_avx1_ghash_64:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ vmovdqu xmm0, OWORD PTR [esp+64]
+ vpaddd xmm7, xmm0, OWORD PTR L_aes_gcm_avx1_four
+ vmovdqu OWORD PTR [esp+64], xmm7
+ vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm1, xmm0, OWORD PTR L_aes_gcm_avx1_one
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm2, xmm0, OWORD PTR L_aes_gcm_avx1_two
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm3, xmm0, OWORD PTR L_aes_gcm_avx1_three
+ vpshufb xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+120], 11
+ vmovdqa xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+120], 13
+ vmovdqa xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vmovdqu xmm4, OWORD PTR [ecx]
+ vmovdqu xmm5, OWORD PTR [ecx+16]
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm1, xmm1, xmm5
+ vmovdqu OWORD PTR [edx], xmm0
+ vmovdqu OWORD PTR [edx+16], xmm1
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm4, OWORD PTR [ecx+32]
+ vmovdqu xmm5, OWORD PTR [ecx+48]
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm3, xmm3, xmm5
+ vmovdqu OWORD PTR [edx+32], xmm2
+ vmovdqu OWORD PTR [edx+48], xmm3
+ ; ghash encrypted counter
+ vmovdqu xmm2, OWORD PTR [esp+80]
+ vmovdqu xmm7, OWORD PTR [esp+48]
+ vmovdqu xmm0, OWORD PTR [edx+-64]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm2
+ vpshufd xmm1, xmm7, 78
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm3, xmm0, xmm7, 17
+ vpclmulqdq xmm2, xmm0, xmm7, 0
+ vpclmulqdq xmm1, xmm1, xmm5, 0
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vmovdqu xmm7, OWORD PTR [esp+32]
+ vmovdqu xmm0, OWORD PTR [edx+-48]
+ vpshufd xmm4, xmm7, 78
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm6, xmm0, xmm7, 17
+ vpclmulqdq xmm7, xmm0, xmm7, 0
+ vpclmulqdq xmm4, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu xmm7, OWORD PTR [esp+16]
+ vmovdqu xmm0, OWORD PTR [edx+-32]
+ vpshufd xmm4, xmm7, 78
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm6, xmm0, xmm7, 17
+ vpclmulqdq xmm7, xmm0, xmm7, 0
+ vpclmulqdq xmm4, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu xmm7, OWORD PTR [esp]
+ vmovdqu xmm0, OWORD PTR [edx+-16]
+ vpshufd xmm4, xmm7, 78
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm6, xmm0, xmm7, 17
+ vpclmulqdq xmm7, xmm0, xmm7, 0
+ vpclmulqdq xmm4, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm1, xmm1, xmm4
+ vpslldq xmm5, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm2, xmm2, xmm5
+ vpxor xmm3, xmm3, xmm1
+ vpslld xmm7, xmm2, 31
+ vpslld xmm4, xmm2, 30
+ vpslld xmm5, xmm2, 25
+ vpxor xmm7, xmm7, xmm4
+ vpxor xmm7, xmm7, xmm5
+ vpsrldq xmm4, xmm7, 4
+ vpslldq xmm7, xmm7, 12
+ vpxor xmm2, xmm2, xmm7
+ vpsrld xmm5, xmm2, 1
+ vpsrld xmm1, xmm2, 2
+ vpsrld xmm0, xmm2, 7
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm5, xmm5, xmm0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpxor xmm2, xmm2, xmm3
+ vmovdqu OWORD PTR [esp+80], xmm2
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_update_avx1_ghash_64
+L_AES_GCM_encrypt_update_avx1_end_64:
+ movdqu xmm6, OWORD PTR [esp+80]
+ ; Block 1
+ vmovdqa xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vmovdqu xmm5, OWORD PTR [edx]
+ pshufb xmm5, xmm0
+ vmovdqu xmm7, OWORD PTR [esp+48]
+ pxor xmm5, xmm6
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm7, 78
+ vpclmulqdq xmm3, xmm7, xmm5, 17
+ vpclmulqdq xmm0, xmm7, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm7
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm6, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm6, xmm6, xmm1
+ ; Block 2
+ vmovdqa xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vmovdqu xmm5, OWORD PTR [edx+16]
+ pshufb xmm5, xmm0
+ vmovdqu xmm7, OWORD PTR [esp+32]
+ ; ghash_gfmul_xor_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm7, 78
+ vpclmulqdq xmm3, xmm7, xmm5, 17
+ vpclmulqdq xmm0, xmm7, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm7
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm4, xmm4, xmm0
+ vpxor xmm6, xmm6, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm6, xmm6, xmm1
+ ; Block 3
+ vmovdqa xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vmovdqu xmm5, OWORD PTR [edx+32]
+ pshufb xmm5, xmm0
+ vmovdqu xmm7, OWORD PTR [esp+16]
+ ; ghash_gfmul_xor_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm7, 78
+ vpclmulqdq xmm3, xmm7, xmm5, 17
+ vpclmulqdq xmm0, xmm7, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm7
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm4, xmm4, xmm0
+ vpxor xmm6, xmm6, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm6, xmm6, xmm1
+ ; Block 4
+ vmovdqa xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vmovdqu xmm5, OWORD PTR [edx+48]
+ pshufb xmm5, xmm0
+ vmovdqu xmm7, OWORD PTR [esp]
+ ; ghash_gfmul_xor_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm7, 78
+ vpclmulqdq xmm3, xmm7, xmm5, 17
+ vpclmulqdq xmm0, xmm7, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm7
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm4, xmm4, xmm0
+ vpxor xmm6, xmm6, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm6, xmm6, xmm2
+ vmovdqu xmm5, OWORD PTR [esp]
+L_AES_GCM_encrypt_update_avx1_done_64:
+ mov edx, DWORD PTR [esp+132]
+ cmp ebx, edx
+ jge L_AES_GCM_encrypt_update_avx1_done_enc
+ mov eax, DWORD PTR [esp+132]
+ and eax, 4294967280
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_update_avx1_last_block_done
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ vmovdqu xmm1, OWORD PTR [esp+64]
+ vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_one
+ vmovdqu OWORD PTR [esp+64], xmm1
+ vpxor xmm0, xmm0, [ebp]
+ vaesenc xmm0, xmm0, [ebp+16]
+ vaesenc xmm0, xmm0, [ebp+32]
+ vaesenc xmm0, xmm0, [ebp+48]
+ vaesenc xmm0, xmm0, [ebp+64]
+ vaesenc xmm0, xmm0, [ebp+80]
+ vaesenc xmm0, xmm0, [ebp+96]
+ vaesenc xmm0, xmm0, [ebp+112]
+ vaesenc xmm0, xmm0, [ebp+128]
+ vaesenc xmm0, xmm0, [ebp+144]
+ cmp DWORD PTR [esp+120], 11
+ vmovdqa xmm1, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [ebp+176]
+ cmp DWORD PTR [esp+120], 13
+ vmovdqa xmm1, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [ebp+208]
+ vmovdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last:
+ vaesenclast xmm0, xmm0, xmm1
+ vmovdqu xmm1, OWORD PTR [ecx]
+ vpxor xmm0, xmm0, xmm1
+ vmovdqu OWORD PTR [edx], xmm0
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm6, xmm6, xmm0
+ add ebx, 16
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_update_avx1_last_block_ghash
+L_AES_GCM_encrypt_update_avx1_last_block_start:
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ vmovdqu xmm1, OWORD PTR [esp+64]
+ vmovdqu xmm3, xmm6
+ vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_one
+ vmovdqu OWORD PTR [esp+64], xmm1
+ vpxor xmm0, xmm0, [ebp]
+ vpclmulqdq xmm4, xmm3, xmm5, 16
+ vaesenc xmm0, xmm0, [ebp+16]
+ vaesenc xmm0, xmm0, [ebp+32]
+ vpclmulqdq xmm7, xmm3, xmm5, 1
+ vaesenc xmm0, xmm0, [ebp+48]
+ vaesenc xmm0, xmm0, [ebp+64]
+ vaesenc xmm0, xmm0, [ebp+80]
+ vpclmulqdq xmm1, xmm3, xmm5, 17
+ vaesenc xmm0, xmm0, [ebp+96]
+ vpxor xmm4, xmm4, xmm7
+ vpslldq xmm2, xmm4, 8
+ vpsrldq xmm4, xmm4, 8
+ vaesenc xmm0, xmm0, [ebp+112]
+ vpclmulqdq xmm7, xmm3, xmm5, 0
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm1, xmm1, xmm4
+ vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_mod2_128
+ vpclmulqdq xmm7, xmm2, xmm3, 16
+ vaesenc xmm0, xmm0, [ebp+128]
+ vpshufd xmm4, xmm2, 78
+ vpxor xmm4, xmm4, xmm7
+ vpclmulqdq xmm7, xmm4, xmm3, 16
+ vaesenc xmm0, xmm0, [ebp+144]
+ vpshufd xmm6, xmm4, 78
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm1
+ cmp DWORD PTR [esp+120], 11
+ vmovdqa xmm1, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [ebp+176]
+ cmp DWORD PTR [esp+120], 13
+ vmovdqa xmm1, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [ebp+208]
+ vmovdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last:
+ vaesenclast xmm0, xmm0, xmm1
+ vmovdqu xmm1, OWORD PTR [ecx]
+ vpxor xmm0, xmm0, xmm1
+ vmovdqu OWORD PTR [edx], xmm0
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ add ebx, 16
+ vpxor xmm6, xmm6, xmm0
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_update_avx1_last_block_start
+L_AES_GCM_encrypt_update_avx1_last_block_ghash:
+ ; ghash_gfmul_red_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm6, xmm3, xmm1
+ vpslld xmm1, xmm0, 31
+ vpslld xmm2, xmm0, 30
+ vpslld xmm3, xmm0, 25
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vpsrldq xmm3, xmm1, 4
+ vpslldq xmm1, xmm1, 12
+ vpxor xmm0, xmm0, xmm1
+ vpsrld xmm1, xmm0, 1
+ vpsrld xmm2, xmm0, 2
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm0
+ vpsrld xmm0, xmm0, 7
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm6, xmm6, xmm1
+L_AES_GCM_encrypt_update_avx1_last_block_done:
+L_AES_GCM_encrypt_update_avx1_done_enc:
+ mov esi, DWORD PTR [esp+136]
+ mov edi, DWORD PTR [esp+144]
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vmovdqa OWORD PTR [esi], xmm6
+ vmovdqu OWORD PTR [edi], xmm4
+ add esp, 96
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_encrypt_update_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_final_avx1 PROC
+ ; Finish a GHASH computation and write the resulting tag bytes.
+ ;
+ ; Stack arguments, addressed after the prologue below (3 pushes, a
+ ; 16-byte scratch area and the return address = 32 bytes):
+ ;   [esp+32]  pointer to the 16-byte running GHASH state (vmovdqa load)
+ ;   [esp+36]  pointer to the output tag buffer
+ ;   [esp+40]  number of tag bytes to write
+ ;   [esp+44]  byte count placed in the low qword of the length block
+ ;             (presumably the ciphertext length - confirm with caller)
+ ;   [esp+48]  byte count placed in the high qword of the length block
+ ;             (presumably the AAD length - confirm with caller)
+ ;   [esp+52]  pointer to the 16-byte hash key H (vmovdqa load)
+ ;   [esp+56]  pointer to a 16-byte block XORed into the final value
+ ;             (presumably the encrypted initial counter - confirm)
+ ; The three vmovdqa loads fault unless those pointers are 16-byte aligned.
+ ; eax, ecx and edx are used as scratch; esi, edi and ebp are preserved.
+ push esi
+ push edi
+ push ebp
+ sub esp, 16
+ mov ebp, DWORD PTR [esp+32]
+ mov esi, DWORD PTR [esp+52]
+ mov edi, DWORD PTR [esp+56]
+ vmovdqa xmm4, OWORD PTR [ebp]
+ vmovdqa xmm5, OWORD PTR [esi]
+ vmovdqa xmm6, OWORD PTR [edi]
+ ; xmm5 = H shifted left by one bit across all 128 bits; when the top
+ ; bit of H was set, the mod2_128 constant is XORed in as well.
+ vpsrlq xmm1, xmm5, 63
+ vpsllq xmm0, xmm5, 1
+ vpslldq xmm1, xmm1, 8
+ vpor xmm0, xmm0, xmm1
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_mod2_128
+ vpxor xmm5, xmm5, xmm0
+ ; Build the length block in xmm0: each byte count becomes a 64-bit bit
+ ; count (low dword = count << 3, high dword = count >> 29). All four
+ ; dwords are overwritten, so the stale contents of xmm0 do not matter.
+ mov edx, DWORD PTR [esp+44]
+ mov ecx, DWORD PTR [esp+48]
+ shl edx, 3
+ shl ecx, 3
+ vpinsrd xmm0, xmm0, edx, 0
+ vpinsrd xmm0, xmm0, ecx, 2
+ mov edx, DWORD PTR [esp+44]
+ mov ecx, DWORD PTR [esp+48]
+ shr edx, 29
+ shr ecx, 29
+ vpinsrd xmm0, xmm0, edx, 1
+ vpinsrd xmm0, xmm0, ecx, 3
+ ; Fold the length block into the running state.
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_red_avx
+ ; xmm4 = xmm4 * xmm5: three carry-less multiplies (high halves, low
+ ; halves, and the XOR of the halves) are combined into a 256-bit
+ ; product held in xmm4:xmm0.
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm4, 78
+ vpclmulqdq xmm3, xmm4, xmm5, 17
+ vpclmulqdq xmm0, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm4
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm4, xmm3, xmm1
+ ; Reduce the low half (xmm0) with shift-and-XOR steps and fold the
+ ; result into the high half (xmm4).
+ vpslld xmm1, xmm0, 31
+ vpslld xmm2, xmm0, 30
+ vpslld xmm3, xmm0, 25
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vpsrldq xmm3, xmm1, 4
+ vpslldq xmm1, xmm1, 12
+ vpxor xmm0, xmm0, xmm1
+ vpsrld xmm1, xmm0, 1
+ vpsrld xmm2, xmm0, 2
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm0
+ vpsrld xmm0, xmm0, 7
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+ ; Byte-shuffle the result with bswap_mask, then XOR in the block that
+ ; was loaded from [esp+56]; xmm0 now holds the full 16-byte tag.
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm4, xmm6
+ mov edi, DWORD PTR [esp+36]
+ cmp DWORD PTR [esp+40], 16
+ je L_AES_GCM_encrypt_final_avx1_store_tag_16
+ ; Tag length is not 16: spill the tag to the scratch area at [esp] and
+ ; copy the requested number of bytes one at a time.
+ ; NOTE(review): the loop exits only on equality with [esp+40], so a
+ ; length of 0 or above 16 is not handled here - callers must validate.
+ xor ecx, ecx
+ vmovdqu OWORD PTR [esp], xmm0
+L_AES_GCM_encrypt_final_avx1_store_tag_loop:
+ movzx eax, BYTE PTR [esp+ecx]
+ mov BYTE PTR [edi+ecx], al
+ inc ecx
+ cmp ecx, DWORD PTR [esp+40]
+ jne L_AES_GCM_encrypt_final_avx1_store_tag_loop
+ jmp L_AES_GCM_encrypt_final_avx1_store_tag_done
+L_AES_GCM_encrypt_final_avx1_store_tag_16:
+ ; Full-size tag: a single unaligned 16-byte store.
+ vmovdqu OWORD PTR [edi], xmm0
+L_AES_GCM_encrypt_final_avx1_store_tag_done:
+ add esp, 16
+ pop ebp
+ pop edi
+ pop esi
+ ret
+AES_GCM_encrypt_final_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_update_avx1 PROC
+ ; Process whole 16-byte blocks of input: fold each input block into the
+ ; running GHASH state and XOR it with the AES-encrypted counter to
+ ; produce the output. Bytes beyond the last whole 16-byte block are not
+ ; touched by this routine.
+ ;
+ ; Stack arguments, addressed after the prologue below (4 pushes, a
+ ; 160-byte frame and the return address = 180 bytes):
+ ;   [esp+180] pointer to the AES round keys (read with vmovdqa at
+ ;             16-byte strides, so it must be 16-byte aligned)
+ ;   [esp+184] round count; values >= 11 and >= 13 enable the extra
+ ;             round pairs at key offsets 160/176 and 192/208
+ ;   [esp+188] output pointer (edi)
+ ;   [esp+192] input pointer (esi)
+ ;   [esp+196] number of bytes
+ ;   [esp+200] pointer to the 16-byte GHASH state, read and written
+ ;             with vmovdqa
+ ;   [esp+204] pointer to the 16-byte hash key H, read with vmovdqa
+ ;   [esp+208] pointer to the 16-byte counter block, read with vmovdqa
+ ;             and written back with vmovdqu
+ ;
+ ; Frame layout:
+ ;   [esp+0]   H^1      [esp+16]  H^2      [esp+32]  H^3      [esp+48] H^4
+ ;   [esp+64]  counter  [esp+80]  GHASH state
+ ;   [esp+96] .. [esp+159]  copies of four input blocks (in-place path)
+ ; ebx is the byte offset already processed; eax is the loop limit.
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 160
+ mov esi, DWORD PTR [esp+208]
+ vmovdqa xmm4, OWORD PTR [esi]
+ vmovdqu OWORD PTR [esp+64], xmm4
+ mov esi, DWORD PTR [esp+200]
+ mov ebp, DWORD PTR [esp+204]
+ vmovdqa xmm6, OWORD PTR [esi]
+ vmovdqa xmm5, OWORD PTR [ebp]
+ vmovdqu OWORD PTR [esp+80], xmm6
+ mov ebp, DWORD PTR [esp+180]
+ mov edi, DWORD PTR [esp+188]
+ mov esi, DWORD PTR [esp+192]
+ ; xmm5 = H shifted left by one bit across all 128 bits; when the top
+ ; bit of H was set, the mod2_128 constant is XORed in as well.
+ vpsrlq xmm1, xmm5, 63
+ vpsllq xmm0, xmm5, 1
+ vpslldq xmm1, xmm1, 8
+ vpor xmm0, xmm0, xmm1
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_mod2_128
+ vpxor xmm5, xmm5, xmm0
+ xor ebx, ebx
+ ; Fewer than 64 bytes (signed compare): skip the 4-block path.
+ cmp DWORD PTR [esp+196], 64
+ mov eax, DWORD PTR [esp+196]
+ jl L_AES_GCM_decrypt_update_avx1_done_64
+ ; eax = byte count rounded down to a multiple of 64 (mask 0xFFFFFFC0).
+ and eax, 4294967232
+ vmovdqa xmm2, xmm6
+ ; H ^ 1
+ vmovdqu OWORD PTR [esp], xmm5
+ ; H ^ 2
+ ; Squaring needs only the low*low and high*high products.
+ vpclmulqdq xmm0, xmm5, xmm5, 0
+ vpclmulqdq xmm4, xmm5, xmm5, 17
+ vpslld xmm1, xmm0, 31
+ vpslld xmm2, xmm0, 30
+ vpslld xmm3, xmm0, 25
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vpsrldq xmm3, xmm1, 4
+ vpslldq xmm1, xmm1, 12
+ vpxor xmm0, xmm0, xmm1
+ vpsrld xmm1, xmm0, 1
+ vpsrld xmm2, xmm0, 2
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm0
+ vpsrld xmm0, xmm0, 7
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+ vmovdqu OWORD PTR [esp+16], xmm4
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ ; xmm7 = xmm4 (H^2) * xmm5 (H), multiplied and reduced.
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm4, 78
+ vpclmulqdq xmm3, xmm4, xmm5, 17
+ vpclmulqdq xmm0, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm4
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm7, xmm3, xmm1
+ vpslld xmm1, xmm0, 31
+ vpslld xmm2, xmm0, 30
+ vpslld xmm3, xmm0, 25
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vpsrldq xmm3, xmm1, 4
+ vpslldq xmm1, xmm1, 12
+ vpxor xmm0, xmm0, xmm1
+ vpsrld xmm1, xmm0, 1
+ vpsrld xmm2, xmm0, 2
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm0
+ vpsrld xmm0, xmm0, 7
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm7, xmm7, xmm1
+ vmovdqu OWORD PTR [esp+32], xmm7
+ ; H ^ 4
+ ; Square of H^2 (xmm4).
+ vpclmulqdq xmm0, xmm4, xmm4, 0
+ vpclmulqdq xmm7, xmm4, xmm4, 17
+ vpslld xmm1, xmm0, 31
+ vpslld xmm2, xmm0, 30
+ vpslld xmm3, xmm0, 25
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vpsrldq xmm3, xmm1, 4
+ vpslldq xmm1, xmm1, 12
+ vpxor xmm0, xmm0, xmm1
+ vpsrld xmm1, xmm0, 1
+ vpsrld xmm2, xmm0, 2
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm0
+ vpsrld xmm0, xmm0, 7
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm7, xmm7, xmm1
+ vmovdqu OWORD PTR [esp+48], xmm7
+ ; Output and input pointers equal: take the in-place loop, which saves
+ ; the input blocks on the stack before they are overwritten.
+ cmp edi, esi
+ jne L_AES_GCM_decrypt_update_avx1_ghash_64
+L_AES_GCM_decrypt_update_avx1_ghash_64_inplace:
+ ; ecx = current input address, edx = current output address.
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; Build four consecutive counter blocks in xmm0..xmm3 and store
+ ; counter + 4 back to the frame for the next iteration.
+ vmovdqu xmm0, OWORD PTR [esp+64]
+ vpaddd xmm7, xmm0, OWORD PTR L_aes_gcm_avx1_four
+ vmovdqu OWORD PTR [esp+64], xmm7
+ vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm1, xmm0, OWORD PTR L_aes_gcm_avx1_one
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm2, xmm0, OWORD PTR L_aes_gcm_avx1_two
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm3, xmm0, OWORD PTR L_aes_gcm_avx1_three
+ vpshufb xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm7
+ ; AES rounds on all four blocks, one round key at a time in xmm7.
+ vmovdqa xmm7, OWORD PTR [ebp]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ ; The round count selects where the key schedule ends; xmm7 always
+ ; holds the final round key when control reaches the enc_done label.
+ cmp DWORD PTR [esp+184], 11
+ vmovdqa xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+184], 13
+ vmovdqa xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done:
+ ; Final round, then XOR with the input. Each input block is copied to
+ ; [esp+96..159] before the store to [edx] overwrites it.
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vmovdqu xmm4, OWORD PTR [ecx]
+ vmovdqu xmm5, OWORD PTR [ecx+16]
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm1, xmm1, xmm5
+ vmovdqu OWORD PTR [esp+96], xmm4
+ vmovdqu OWORD PTR [esp+112], xmm5
+ vmovdqu OWORD PTR [edx], xmm0
+ vmovdqu OWORD PTR [edx+16], xmm1
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm4, OWORD PTR [ecx+32]
+ vmovdqu xmm5, OWORD PTR [ecx+48]
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm3, xmm3, xmm5
+ vmovdqu OWORD PTR [esp+128], xmm4
+ vmovdqu OWORD PTR [esp+144], xmm5
+ vmovdqu OWORD PTR [edx+32], xmm2
+ vmovdqu OWORD PTR [edx+48], xmm3
+ ; ghash encrypted counter
+ ; Fold the four saved input blocks into the GHASH state: block 0
+ ; (XORed with the state) times H^4, block 1 times H^3, block 2 times
+ ; H^2, block 3 times H^1. Partial products accumulate in xmm2 (low),
+ ; xmm3 (high) and xmm1 (middle); one reduction follows at the end.
+ vmovdqu xmm2, OWORD PTR [esp+80]
+ vmovdqu xmm7, OWORD PTR [esp+48]
+ vmovdqu xmm0, OWORD PTR [esp+96]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm2
+ vpshufd xmm1, xmm7, 78
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm3, xmm0, xmm7, 17
+ vpclmulqdq xmm2, xmm0, xmm7, 0
+ vpclmulqdq xmm1, xmm1, xmm5, 0
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vmovdqu xmm7, OWORD PTR [esp+32]
+ vmovdqu xmm0, OWORD PTR [esp+112]
+ vpshufd xmm4, xmm7, 78
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm6, xmm0, xmm7, 17
+ vpclmulqdq xmm7, xmm0, xmm7, 0
+ vpclmulqdq xmm4, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu xmm7, OWORD PTR [esp+16]
+ vmovdqu xmm0, OWORD PTR [esp+128]
+ vpshufd xmm4, xmm7, 78
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm6, xmm0, xmm7, 17
+ vpclmulqdq xmm7, xmm0, xmm7, 0
+ vpclmulqdq xmm4, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu xmm7, OWORD PTR [esp]
+ vmovdqu xmm0, OWORD PTR [esp+144]
+ vpshufd xmm4, xmm7, 78
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm6, xmm0, xmm7, 17
+ vpclmulqdq xmm7, xmm0, xmm7, 0
+ vpclmulqdq xmm4, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm1, xmm1, xmm4
+ ; Split the middle term across the low/high halves, then reduce.
+ vpslldq xmm5, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm2, xmm2, xmm5
+ vpxor xmm3, xmm3, xmm1
+ vpslld xmm7, xmm2, 31
+ vpslld xmm4, xmm2, 30
+ vpslld xmm5, xmm2, 25
+ vpxor xmm7, xmm7, xmm4
+ vpxor xmm7, xmm7, xmm5
+ vpsrldq xmm4, xmm7, 4
+ vpslldq xmm7, xmm7, 12
+ vpxor xmm2, xmm2, xmm7
+ vpsrld xmm5, xmm2, 1
+ vpsrld xmm1, xmm2, 2
+ vpsrld xmm0, xmm2, 7
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm5, xmm5, xmm0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpxor xmm2, xmm2, xmm3
+ ; Save the updated GHASH state and advance by 64 bytes.
+ vmovdqu OWORD PTR [esp+80], xmm2
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_update_avx1_ghash_64_inplace
+ jmp L_AES_GCM_decrypt_update_avx1_ghash_64_done
+L_AES_GCM_decrypt_update_avx1_ghash_64:
+ ; Same 4-block step for distinct input and output pointers: the input
+ ; is re-read from [ecx] for GHASH instead of being copied to the stack.
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ vmovdqu xmm0, OWORD PTR [esp+64]
+ vpaddd xmm7, xmm0, OWORD PTR L_aes_gcm_avx1_four
+ vmovdqu OWORD PTR [esp+64], xmm7
+ vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm1, xmm0, OWORD PTR L_aes_gcm_avx1_one
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm2, xmm0, OWORD PTR L_aes_gcm_avx1_two
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm3, xmm0, OWORD PTR L_aes_gcm_avx1_three
+ vpshufb xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+184], 11
+ vmovdqa xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+184], 13
+ vmovdqa xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqa xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vmovdqu xmm4, OWORD PTR [ecx]
+ vmovdqu xmm5, OWORD PTR [ecx+16]
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm1, xmm1, xmm5
+ vmovdqu OWORD PTR [edx], xmm0
+ vmovdqu OWORD PTR [edx+16], xmm1
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm4, OWORD PTR [ecx+32]
+ vmovdqu xmm5, OWORD PTR [ecx+48]
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm3, xmm3, xmm5
+ vmovdqu OWORD PTR [edx+32], xmm2
+ vmovdqu OWORD PTR [edx+48], xmm3
+ ; ghash encrypted counter
+ ; Same accumulation as the in-place loop, reading the input blocks
+ ; from [ecx] .. [ecx+48].
+ vmovdqu xmm2, OWORD PTR [esp+80]
+ vmovdqu xmm7, OWORD PTR [esp+48]
+ vmovdqu xmm0, OWORD PTR [ecx]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm0, xmm2
+ vpshufd xmm1, xmm7, 78
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm3, xmm0, xmm7, 17
+ vpclmulqdq xmm2, xmm0, xmm7, 0
+ vpclmulqdq xmm1, xmm1, xmm5, 0
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vmovdqu xmm7, OWORD PTR [esp+32]
+ vmovdqu xmm0, OWORD PTR [ecx+16]
+ vpshufd xmm4, xmm7, 78
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm6, xmm0, xmm7, 17
+ vpclmulqdq xmm7, xmm0, xmm7, 0
+ vpclmulqdq xmm4, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu xmm7, OWORD PTR [esp+16]
+ vmovdqu xmm0, OWORD PTR [ecx+32]
+ vpshufd xmm4, xmm7, 78
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm6, xmm0, xmm7, 17
+ vpclmulqdq xmm7, xmm0, xmm7, 0
+ vpclmulqdq xmm4, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu xmm7, OWORD PTR [esp]
+ vmovdqu xmm0, OWORD PTR [ecx+48]
+ vpshufd xmm4, xmm7, 78
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ vpshufd xmm5, xmm0, 78
+ vpxor xmm5, xmm5, xmm0
+ vpclmulqdq xmm6, xmm0, xmm7, 17
+ vpclmulqdq xmm7, xmm0, xmm7, 0
+ vpclmulqdq xmm4, xmm4, xmm5, 0
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm3, xmm3, xmm6
+ vpxor xmm1, xmm1, xmm4
+ vpslldq xmm5, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm2, xmm2, xmm5
+ vpxor xmm3, xmm3, xmm1
+ vpslld xmm7, xmm2, 31
+ vpslld xmm4, xmm2, 30
+ vpslld xmm5, xmm2, 25
+ vpxor xmm7, xmm7, xmm4
+ vpxor xmm7, xmm7, xmm5
+ vpsrldq xmm4, xmm7, 4
+ vpslldq xmm7, xmm7, 12
+ vpxor xmm2, xmm2, xmm7
+ vpsrld xmm5, xmm2, 1
+ vpsrld xmm1, xmm2, 2
+ vpsrld xmm0, xmm2, 7
+ vpxor xmm5, xmm5, xmm1
+ vpxor xmm5, xmm5, xmm0
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpxor xmm2, xmm2, xmm3
+ vmovdqu OWORD PTR [esp+80], xmm2
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_update_avx1_ghash_64
+L_AES_GCM_decrypt_update_avx1_ghash_64_done:
+ ; The 4-block loops clobber xmm5/xmm6: restore the GHASH state into
+ ; xmm6 and H^1 into xmm5 for the single-block loop below.
+ vmovdqa xmm6, xmm2
+ vmovdqu xmm5, OWORD PTR [esp]
+L_AES_GCM_decrypt_update_avx1_done_64:
+ mov edx, DWORD PTR [esp+196]
+ cmp ebx, edx
+ jge L_AES_GCM_decrypt_update_avx1_done_dec
+ ; eax = byte count rounded down to a multiple of 16 (mask 0xFFFFFFF0).
+ mov eax, DWORD PTR [esp+196]
+ and eax, 4294967280
+ cmp ebx, eax
+ jge L_AES_GCM_decrypt_update_avx1_last_block_done
+L_AES_GCM_decrypt_update_avx1_last_block_start:
+ ; One block per iteration: the GHASH multiply of (input ^ state) by H
+ ; is interleaved with the AES rounds on the counter block in xmm0.
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ vmovdqu xmm3, OWORD PTR [ecx]
+ vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm3, xmm3, xmm6
+ vmovdqu xmm1, OWORD PTR [esp+64]
+ vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+ vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_one
+ vmovdqu OWORD PTR [esp+64], xmm1
+ vpxor xmm0, xmm0, [ebp]
+ vpclmulqdq xmm4, xmm3, xmm5, 16
+ vaesenc xmm0, xmm0, [ebp+16]
+ vaesenc xmm0, xmm0, [ebp+32]
+ vpclmulqdq xmm7, xmm3, xmm5, 1
+ vaesenc xmm0, xmm0, [ebp+48]
+ vaesenc xmm0, xmm0, [ebp+64]
+ vaesenc xmm0, xmm0, [ebp+80]
+ vpclmulqdq xmm1, xmm3, xmm5, 17
+ vaesenc xmm0, xmm0, [ebp+96]
+ vpxor xmm4, xmm4, xmm7
+ vpslldq xmm2, xmm4, 8
+ vpsrldq xmm4, xmm4, 8
+ vaesenc xmm0, xmm0, [ebp+112]
+ vpclmulqdq xmm7, xmm3, xmm5, 0
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm1, xmm1, xmm4
+ ; Reduction using two carry-less multiplies by the mod2_128 constant.
+ vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_mod2_128
+ vpclmulqdq xmm7, xmm2, xmm3, 16
+ vaesenc xmm0, xmm0, [ebp+128]
+ vpshufd xmm4, xmm2, 78
+ vpxor xmm4, xmm4, xmm7
+ vpclmulqdq xmm7, xmm4, xmm3, 16
+ vaesenc xmm0, xmm0, [ebp+144]
+ vpshufd xmm6, xmm4, 78
+ vpxor xmm6, xmm6, xmm7
+ vpxor xmm6, xmm6, xmm1
+ cmp DWORD PTR [esp+184], 11
+ vmovdqa xmm1, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [ebp+176]
+ cmp DWORD PTR [esp+184], 13
+ vmovdqa xmm1, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [ebp+208]
+ vmovdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last:
+ ; Final AES round, then XOR with the input block to form the output.
+ vaesenclast xmm0, xmm0, xmm1
+ vmovdqu xmm1, OWORD PTR [ecx]
+ vpxor xmm0, xmm0, xmm1
+ vmovdqu OWORD PTR [edx], xmm0
+ add ebx, 16
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_update_avx1_last_block_start
+L_AES_GCM_decrypt_update_avx1_last_block_done:
+L_AES_GCM_decrypt_update_avx1_done_dec:
+ ; Write the GHASH state (xmm6) and the advanced counter back to the
+ ; caller's buffers at [esp+200] and [esp+208].
+ mov esi, DWORD PTR [esp+200]
+ mov edi, DWORD PTR [esp+208]
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vmovdqa OWORD PTR [esi], xmm6
+ vmovdqu OWORD PTR [edi], xmm4
+ add esp, 160
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_decrypt_update_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_final_avx1 PROC
+ ; Finish a GHASH computation, compare the resulting tag with the
+ ; caller-supplied tag and store 1 (match) or 0 (mismatch) as a dword.
+ ;
+ ; Stack arguments, addressed after the prologue below (4 pushes, a
+ ; 16-byte scratch area and the return address = 36 bytes):
+ ;   [esp+36]  pointer to the 16-byte running GHASH state (vmovdqa load)
+ ;   [esp+40]  pointer to the tag to compare against
+ ;   [esp+44]  number of tag bytes to compare
+ ;   [esp+48]  byte count placed in the low qword of the length block
+ ;             (presumably the ciphertext length - confirm with caller)
+ ;   [esp+52]  byte count placed in the high qword of the length block
+ ;             (presumably the AAD length - confirm with caller)
+ ;   [esp+56]  pointer to the 16-byte hash key H (vmovdqa load)
+ ;   [esp+60]  pointer to a 16-byte block XORed into the final value
+ ;             (presumably the encrypted initial counter - confirm)
+ ;   [esp+64]  pointer to the dword that receives the result
+ ; The three vmovdqa loads fault unless those pointers are 16-byte aligned.
+ ; eax, ecx and edx are used as scratch; ebx, esi, edi, ebp are preserved.
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 16
+ mov ebp, DWORD PTR [esp+36]
+ mov esi, DWORD PTR [esp+56]
+ mov edi, DWORD PTR [esp+60]
+ vmovdqa xmm6, OWORD PTR [ebp]
+ vmovdqa xmm5, OWORD PTR [esi]
+ vmovdqa xmm7, OWORD PTR [edi]
+ ; xmm5 = H shifted left by one bit across all 128 bits; when the top
+ ; bit of H was set, the mod2_128 constant is XORed in as well.
+ vpsrlq xmm1, xmm5, 63
+ vpsllq xmm0, xmm5, 1
+ vpslldq xmm1, xmm1, 8
+ vpor xmm0, xmm0, xmm1
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_mod2_128
+ vpxor xmm5, xmm5, xmm0
+ ; Build the length block in xmm0: each byte count becomes a 64-bit bit
+ ; count (low dword = count << 3, high dword = count >> 29). All four
+ ; dwords are overwritten, so the stale contents of xmm0 do not matter.
+ mov edx, DWORD PTR [esp+48]
+ mov ecx, DWORD PTR [esp+52]
+ shl edx, 3
+ shl ecx, 3
+ vpinsrd xmm0, xmm0, edx, 0
+ vpinsrd xmm0, xmm0, ecx, 2
+ mov edx, DWORD PTR [esp+48]
+ mov ecx, DWORD PTR [esp+52]
+ shr edx, 29
+ shr ecx, 29
+ vpinsrd xmm0, xmm0, edx, 1
+ vpinsrd xmm0, xmm0, ecx, 3
+ ; Fold the length block into the running state.
+ vpxor xmm6, xmm6, xmm0
+ ; ghash_gfmul_red_avx
+ ; xmm6 = xmm6 * xmm5: three carry-less multiplies (high halves, low
+ ; halves, and the XOR of the halves) are combined into a 256-bit
+ ; product held in xmm6:xmm0.
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm6, xmm3, xmm1
+ ; Reduce the low half (xmm0) with shift-and-XOR steps and fold the
+ ; result into the high half (xmm6).
+ vpslld xmm1, xmm0, 31
+ vpslld xmm2, xmm0, 30
+ vpslld xmm3, xmm0, 25
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vpsrldq xmm3, xmm1, 4
+ vpslldq xmm1, xmm1, 12
+ vpxor xmm0, xmm0, xmm1
+ vpsrld xmm1, xmm0, 1
+ vpsrld xmm2, xmm0, 2
+ vpxor xmm1, xmm1, xmm2
+ vpxor xmm1, xmm1, xmm0
+ vpsrld xmm0, xmm0, 7
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm6, xmm6, xmm1
+ ; Byte-shuffle the result with bswap_mask, then XOR in the block that
+ ; was loaded from [esp+60]; xmm0 now holds the computed 16-byte tag.
+ vpshufb xmm6, xmm6, OWORD PTR L_aes_gcm_avx1_bswap_mask
+ vpxor xmm0, xmm6, xmm7
+ mov esi, DWORD PTR [esp+40]
+ mov edi, DWORD PTR [esp+64]
+ cmp DWORD PTR [esp+44], 16
+ je L_AES_GCM_decrypt_final_avx1_cmp_tag_16
+ ; Tag length is not 16: spill the computed tag to the 16-byte scratch
+ ; area the prologue already reserved at [esp] and compare byte by byte.
+ ; esp must NOT be moved again here: the loop bound below is read
+ ; relative to esp, and a further "sub esp, 16" would make [esp+44]
+ ; address the saved ebx slot instead of the tag length.
+ xor ecx, ecx
+ xor ebx, ebx
+ vmovdqu OWORD PTR [esp], xmm0
+L_AES_GCM_decrypt_final_avx1_cmp_tag_loop:
+ ; OR all byte differences into bl so the loop does not exit early on
+ ; the first mismatching byte.
+ ; NOTE(review): the loop exits only on equality with [esp+44], so a
+ ; length of 0 or above 16 is not handled here - callers must validate.
+ movzx eax, BYTE PTR [esp+ecx]
+ xor al, BYTE PTR [esi+ecx]
+ or bl, al
+ inc ecx
+ cmp ecx, DWORD PTR [esp+44]
+ jne L_AES_GCM_decrypt_final_avx1_cmp_tag_loop
+ ; ebx = 1 when no byte differed, otherwise 0 (upper bits were zeroed).
+ cmp bl, 0
+ sete bl
+ xor ecx, ecx
+ jmp L_AES_GCM_decrypt_final_avx1_cmp_tag_done
+L_AES_GCM_decrypt_final_avx1_cmp_tag_16:
+ ; Full-size tag: compare all 16 bytes at once.
+ vmovdqu xmm1, OWORD PTR [esi]
+ vpcmpeqb xmm0, xmm0, xmm1
+ vpmovmskb edx, xmm0
+ ; edx == 0xFFFF (all 16 bytes equal) => result 1, otherwise 0
+ xor ebx, ebx
+ cmp edx, 65535
+ sete bl
+L_AES_GCM_decrypt_final_avx1_cmp_tag_done:
+ mov DWORD PTR [edi], ebx
+ add esp, 16
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_decrypt_final_avx1 ENDP
+_TEXT ENDS
+ENDIF
+ENDIF
+IFDEF HAVE_INTEL_AVX2
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_avx2 PROC
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 112
+ mov esi, DWORD PTR [esp+144]
+ mov ebp, DWORD PTR [esp+168]
+ mov edx, DWORD PTR [esp+160]
+ vpxor xmm4, xmm4, xmm4
+ cmp edx, 12
+ je L_AES_GCM_encrypt_avx2_iv_12
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqu xmm5, OWORD PTR [ebp]
+ vaesenc xmm5, xmm5, [ebp+16]
+ vaesenc xmm5, xmm5, [ebp+32]
+ vaesenc xmm5, xmm5, [ebp+48]
+ vaesenc xmm5, xmm5, [ebp+64]
+ vaesenc xmm5, xmm5, [ebp+80]
+ vaesenc xmm5, xmm5, [ebp+96]
+ vaesenc xmm5, xmm5, [ebp+112]
+ vaesenc xmm5, xmm5, [ebp+128]
+ vaesenc xmm5, xmm5, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm5, xmm5, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm5, xmm5, [ebp+208]
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm5, xmm5, xmm0
+ vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ cmp edx, 0
+ mov ecx, 0
+ je L_AES_GCM_encrypt_avx2_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_encrypt_avx2_calc_iv_lt16
+ and edx, 4294967280
+L_AES_GCM_encrypt_avx2_calc_iv_16_loop:
+ vmovdqu xmm0, OWORD PTR [esi+ecx]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx
+ vpclmulqdq xmm2, xmm5, xmm4, 16
+ vpclmulqdq xmm1, xmm5, xmm4, 1
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm7, xmm0, xmm1
+ vpxor xmm4, xmm3, xmm2
+ ; ghash_mid
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; ghash_red
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm7, xmm2, 16
+ vpshufd xmm1, xmm7, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx2_calc_iv_16_loop
+ mov edx, DWORD PTR [esp+160]
+ cmp ecx, edx
+ je L_AES_GCM_encrypt_avx2_calc_iv_done
+L_AES_GCM_encrypt_avx2_calc_iv_lt16:
+ vpxor xmm0, xmm0, xmm0
+ xor ebx, ebx
+ vmovdqu OWORD PTR [esp], xmm0
+L_AES_GCM_encrypt_avx2_calc_iv_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx2_calc_iv_loop
+ vmovdqu xmm0, OWORD PTR [esp]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx
+ vpclmulqdq xmm2, xmm5, xmm4, 16
+ vpclmulqdq xmm1, xmm5, xmm4, 1
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm7, xmm0, xmm1
+ vpxor xmm4, xmm3, xmm2
+ ; ghash_mid
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; ghash_red
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm7, xmm2, 16
+ vpshufd xmm1, xmm7, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+L_AES_GCM_encrypt_avx2_calc_iv_done:
+ ; T = Encrypt counter
+ vpxor xmm0, xmm0, xmm0
+ shl edx, 3
+ vpinsrd xmm0, xmm0, edx, 0
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx
+ vpclmulqdq xmm2, xmm5, xmm4, 16
+ vpclmulqdq xmm1, xmm5, xmm4, 1
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm7, xmm0, xmm1
+ vpxor xmm4, xmm3, xmm2
+ ; ghash_mid
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; ghash_red
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm7, xmm2, 16
+ vpshufd xmm1, xmm7, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ ; Encrypt counter
+ vmovdqu xmm6, OWORD PTR [ebp]
+ vpxor xmm6, xmm6, xmm4
+ vaesenc xmm6, xmm6, [ebp+16]
+ vaesenc xmm6, xmm6, [ebp+32]
+ vaesenc xmm6, xmm6, [ebp+48]
+ vaesenc xmm6, xmm6, [ebp+64]
+ vaesenc xmm6, xmm6, [ebp+80]
+ vaesenc xmm6, xmm6, [ebp+96]
+ vaesenc xmm6, xmm6, [ebp+112]
+ vaesenc xmm6, xmm6, [ebp+128]
+ vaesenc xmm6, xmm6, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
+ vaesenc xmm6, xmm6, xmm0
+ vaesenc xmm6, xmm6, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
+ vaesenc xmm6, xmm6, xmm0
+ vaesenc xmm6, xmm6, [ebp+208]
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm6, xmm6, xmm0
+ jmp L_AES_GCM_encrypt_avx2_iv_done
+L_AES_GCM_encrypt_avx2_iv_12:
+ ; # Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ vmovdqu xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
+ vmovdqu xmm5, OWORD PTR [ebp]
+ vpblendd xmm4, xmm4, [esi], 7
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqu xmm7, OWORD PTR [ebp+16]
+ vpxor xmm6, xmm4, xmm5
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm6, xmm6, xmm7
+ vmovdqu xmm0, OWORD PTR [ebp+32]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+48]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+64]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+80]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+96]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+112]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+128]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+144]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ cmp DWORD PTR [esp+172], 11
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx2_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+176]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ cmp DWORD PTR [esp+172], 13
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx2_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+208]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx2_calc_iv_12_last:
+ vaesenclast xmm5, xmm5, xmm0
+ vaesenclast xmm6, xmm6, xmm0
+ vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask
+L_AES_GCM_encrypt_avx2_iv_done:
+ vmovdqu OWORD PTR [esp+80], xmm6
+ vpxor xmm6, xmm6, xmm6
+ mov esi, DWORD PTR [esp+140]
+ ; Additional authentication data
+ mov edx, DWORD PTR [esp+156]
+ cmp edx, 0
+ je L_AES_GCM_encrypt_avx2_calc_aad_done
+ xor ecx, ecx
+ cmp edx, 16
+ jl L_AES_GCM_encrypt_avx2_calc_aad_lt16
+ and edx, 4294967280
+L_AES_GCM_encrypt_avx2_calc_aad_16_loop:
+ vmovdqu xmm0, OWORD PTR [esi+ecx]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm6, xmm6, xmm0
+ ; ghash_gfmul_avx
+ vpclmulqdq xmm2, xmm5, xmm6, 16
+ vpclmulqdq xmm1, xmm5, xmm6, 1
+ vpclmulqdq xmm0, xmm5, xmm6, 0
+ vpclmulqdq xmm3, xmm5, xmm6, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm7, xmm0, xmm1
+ vpxor xmm6, xmm3, xmm2
+ ; ghash_mid
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm6, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm6, xmm6, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm6, xmm6, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm6, xmm6, xmm1
+ ; ghash_red
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm7, xmm2, 16
+ vpshufd xmm1, xmm7, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm6, xmm6, xmm1
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx2_calc_aad_16_loop
+ mov edx, DWORD PTR [esp+156]
+ cmp ecx, edx
+ je L_AES_GCM_encrypt_avx2_calc_aad_done
+L_AES_GCM_encrypt_avx2_calc_aad_lt16:
+ vpxor xmm0, xmm0, xmm0
+ xor ebx, ebx
+ vmovdqu OWORD PTR [esp], xmm0
+L_AES_GCM_encrypt_avx2_calc_aad_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx2_calc_aad_loop
+ vmovdqu xmm0, OWORD PTR [esp]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm6, xmm6, xmm0
+ ; ghash_gfmul_avx
+ vpclmulqdq xmm2, xmm5, xmm6, 16
+ vpclmulqdq xmm1, xmm5, xmm6, 1
+ vpclmulqdq xmm0, xmm5, xmm6, 0
+ vpclmulqdq xmm3, xmm5, xmm6, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm7, xmm0, xmm1
+ vpxor xmm6, xmm3, xmm2
+ ; ghash_mid
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm6, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm6, xmm6, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm6, xmm6, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm6, xmm6, xmm1
+ ; ghash_red
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm7, xmm2, 16
+ vpshufd xmm1, xmm7, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm6, xmm6, xmm1
+L_AES_GCM_encrypt_avx2_calc_aad_done:
+ mov esi, DWORD PTR [esp+132]
+ mov edi, DWORD PTR [esp+136]
+ ; Calculate counter and H
+ vpsrlq xmm1, xmm5, 63
+ vpsllq xmm0, xmm5, 1
+ vpslldq xmm1, xmm1, 8
+ vpor xmm0, xmm0, xmm1
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vpxor xmm5, xmm5, xmm0
+ xor ebx, ebx
+ cmp DWORD PTR [esp+152], 64
+ mov eax, DWORD PTR [esp+152]
+ jl L_AES_GCM_encrypt_avx2_done_64
+ and eax, 4294967232
+ vmovdqu OWORD PTR [esp+64], xmm4
+ vmovdqu OWORD PTR [esp+96], xmm6
+ vmovdqu xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128
+ ; H ^ 1
+ vmovdqu OWORD PTR [esp], xmm5
+ vmovdqu xmm2, xmm5
+ ; H ^ 2
+ vpclmulqdq xmm5, xmm2, xmm2, 0
+ vpclmulqdq xmm6, xmm2, xmm2, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm0, xmm6, xmm5
+ vmovdqu OWORD PTR [esp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red
+ vpclmulqdq xmm6, xmm2, xmm0, 16
+ vpclmulqdq xmm5, xmm2, xmm0, 1
+ vpclmulqdq xmm4, xmm2, xmm0, 0
+ vpxor xmm6, xmm6, xmm5
+ vpslldq xmm5, xmm6, 8
+ vpsrldq xmm6, xmm6, 8
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm1, xmm2, xmm0, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [esp+32], xmm1
+ ; H ^ 4
+ vpclmulqdq xmm5, xmm0, xmm0, 0
+ vpclmulqdq xmm6, xmm0, xmm0, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm6, xmm5
+ vmovdqu OWORD PTR [esp+48], xmm2
+ vmovdqu xmm6, OWORD PTR [esp+96]
+ ; First 64 bytes of input
+ ; aesenc_64
+ ; aesenc_ctr
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vpshufb xmm0, xmm4, xmm7
+ vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four
+ vpshufb xmm3, xmm3, xmm7
+ ; aesenc_xor
+ vmovdqu xmm7, OWORD PTR [ebp]
+ vmovdqu OWORD PTR [esp+64], xmm4
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+172], 11
+ vmovdqu xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx2_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+172], 13
+ vmovdqu xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx2_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx2_aesenc_64_enc_done:
+ ; aesenc_last
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [esi]
+ vmovdqu xmm4, OWORD PTR [esi+16]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [edi], xmm0
+ vmovdqu OWORD PTR [edi+16], xmm1
+ vmovdqu xmm7, OWORD PTR [esi+32]
+ vmovdqu xmm4, OWORD PTR [esi+48]
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm4
+ vmovdqu OWORD PTR [edi+32], xmm2
+ vmovdqu OWORD PTR [edi+48], xmm3
+ cmp eax, 64
+ mov ebx, 64
+ mov ecx, esi
+ mov edx, edi
+ jle L_AES_GCM_encrypt_avx2_end_64
+ ; More 64 bytes of input
+L_AES_GCM_encrypt_avx2_ghash_64:
+ ; aesenc_64_ghash
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; aesenc_64
+ ; aesenc_ctr
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vpshufb xmm0, xmm4, xmm7
+ vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four
+ vpshufb xmm3, xmm3, xmm7
+ ; aesenc_xor
+ vmovdqu xmm7, OWORD PTR [ebp]
+ vmovdqu OWORD PTR [esp+64], xmm4
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+172], 11
+ vmovdqu xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+172], 13
+ vmovdqu xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done:
+ ; aesenc_last
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ecx]
+ vmovdqu xmm4, OWORD PTR [ecx+16]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [edx], xmm0
+ vmovdqu OWORD PTR [edx+16], xmm1
+ vmovdqu xmm7, OWORD PTR [ecx+32]
+ vmovdqu xmm4, OWORD PTR [ecx+48]
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm4
+ vmovdqu OWORD PTR [edx+32], xmm2
+ vmovdqu OWORD PTR [edx+48], xmm3
+ ; pclmul_1
+ vmovdqu xmm1, OWORD PTR [edx+-64]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vmovdqu xmm2, OWORD PTR [esp+48]
+ vpxor xmm1, xmm1, xmm6
+ vpclmulqdq xmm5, xmm1, xmm2, 16
+ vpclmulqdq xmm3, xmm1, xmm2, 1
+ vpclmulqdq xmm6, xmm1, xmm2, 0
+ vpclmulqdq xmm7, xmm1, xmm2, 17
+ ; pclmul_2
+ vmovdqu xmm1, OWORD PTR [edx+-48]
+ vmovdqu xmm0, OWORD PTR [esp+32]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n
+ vmovdqu xmm1, OWORD PTR [edx+-32]
+ vmovdqu xmm0, OWORD PTR [esp+16]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n
+ vmovdqu xmm1, OWORD PTR [edx+-16]
+ vmovdqu xmm0, OWORD PTR [esp]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; aesenc_pclmul_l
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm5, xmm5, xmm3
+ vpslldq xmm1, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm7, xmm7, xmm5
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm6, xmm6, xmm7
+ ; aesenc_64_ghash - end
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_avx2_ghash_64
+L_AES_GCM_encrypt_avx2_end_64:
+ vmovdqu OWORD PTR [esp+96], xmm6
+ vmovdqu xmm3, OWORD PTR [edx+48]
+ vmovdqu xmm7, OWORD PTR [esp]
+ vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpclmulqdq xmm5, xmm7, xmm3, 16
+ vpclmulqdq xmm1, xmm7, xmm3, 1
+ vpclmulqdq xmm4, xmm7, xmm3, 0
+ vpclmulqdq xmm6, xmm7, xmm3, 17
+ vpxor xmm5, xmm5, xmm1
+ vmovdqu xmm3, OWORD PTR [edx+32]
+ vmovdqu xmm7, OWORD PTR [esp+16]
+ vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpclmulqdq xmm2, xmm7, xmm3, 16
+ vpclmulqdq xmm1, xmm7, xmm3, 1
+ vpclmulqdq xmm0, xmm7, xmm3, 0
+ vpclmulqdq xmm3, xmm7, xmm3, 17
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm4, xmm4, xmm0
+ vmovdqu xmm3, OWORD PTR [edx+16]
+ vmovdqu xmm7, OWORD PTR [esp+32]
+ vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpclmulqdq xmm2, xmm7, xmm3, 16
+ vpclmulqdq xmm1, xmm7, xmm3, 1
+ vpclmulqdq xmm0, xmm7, xmm3, 0
+ vpclmulqdq xmm3, xmm7, xmm3, 17
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm4, xmm4, xmm0
+ vmovdqu xmm0, OWORD PTR [esp+96]
+ vmovdqu xmm3, OWORD PTR [edx]
+ vmovdqu xmm7, OWORD PTR [esp+48]
+ vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm3, xmm3, xmm0
+ vpclmulqdq xmm2, xmm7, xmm3, 16
+ vpclmulqdq xmm1, xmm7, xmm3, 1
+ vpclmulqdq xmm0, xmm7, xmm3, 0
+ vpclmulqdq xmm3, xmm7, xmm3, 17
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm4, xmm4, xmm0
+ vpslldq xmm7, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm4, xmm4, xmm7
+ vpxor xmm6, xmm6, xmm5
+ ; ghash_red
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm4, xmm2, 16
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm6, xmm6, xmm1
+ vmovdqu xmm5, OWORD PTR [esp]
+ vmovdqu xmm4, OWORD PTR [esp+64]
+L_AES_GCM_encrypt_avx2_done_64:
+ cmp ebx, DWORD PTR [esp+152]
+ je L_AES_GCM_encrypt_avx2_done_enc
+ mov eax, DWORD PTR [esp+152]
+ and eax, 4294967280
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_avx2_last_block_done
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; aesenc_block
+ vmovdqu xmm1, xmm4
+ vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_one
+ vpxor xmm0, xmm0, [ebp]
+ vaesenc xmm0, xmm0, [ebp+16]
+ vaesenc xmm0, xmm0, [ebp+32]
+ vaesenc xmm0, xmm0, [ebp+48]
+ vaesenc xmm0, xmm0, [ebp+64]
+ vaesenc xmm0, xmm0, [ebp+80]
+ vaesenc xmm0, xmm0, [ebp+96]
+ vaesenc xmm0, xmm0, [ebp+112]
+ vaesenc xmm0, xmm0, [ebp+128]
+ vaesenc xmm0, xmm0, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ vmovdqu xmm2, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm2
+ vaesenc xmm0, xmm0, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ vmovdqu xmm2, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm2
+ vaesenc xmm0, xmm0, [ebp+208]
+ vmovdqu xmm2, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last:
+ vaesenclast xmm0, xmm0, xmm2
+ vmovdqu xmm4, xmm1
+ vmovdqu xmm1, OWORD PTR [ecx]
+ vpxor xmm0, xmm0, xmm1
+ vmovdqu OWORD PTR [edx], xmm0
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm6, xmm6, xmm0
+ add ebx, 16
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_avx2_last_block_ghash
+L_AES_GCM_encrypt_avx2_last_block_start:
+ vpshufb xmm7, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vmovdqu OWORD PTR [esp+64], xmm4
+ ; aesenc_gfmul_sb
+ vpclmulqdq xmm2, xmm6, xmm5, 1
+ vpclmulqdq xmm3, xmm6, xmm5, 16
+ vpclmulqdq xmm1, xmm6, xmm5, 0
+ vpclmulqdq xmm4, xmm6, xmm5, 17
+ vpxor xmm7, xmm7, [ebp]
+ vaesenc xmm7, xmm7, [ebp+16]
+ vpxor xmm3, xmm3, xmm2
+ vpslldq xmm2, xmm3, 8
+ vpsrldq xmm3, xmm3, 8
+ vaesenc xmm7, xmm7, [ebp+32]
+ vpxor xmm2, xmm2, xmm1
+ vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vaesenc xmm7, xmm7, [ebp+48]
+ vaesenc xmm7, xmm7, [ebp+64]
+ vaesenc xmm7, xmm7, [ebp+80]
+ vpshufd xmm2, xmm2, 78
+ vpxor xmm2, xmm2, xmm1
+ vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vaesenc xmm7, xmm7, [ebp+96]
+ vaesenc xmm7, xmm7, [ebp+112]
+ vaesenc xmm7, xmm7, [ebp+128]
+ vpshufd xmm2, xmm2, 78
+ vaesenc xmm7, xmm7, [ebp+144]
+ vpxor xmm4, xmm4, xmm3
+ vpxor xmm2, xmm2, xmm4
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ cmp DWORD PTR [esp+172], 11
+ jl L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
+ vaesenc xmm7, xmm7, xmm0
+ vaesenc xmm7, xmm7, [ebp+176]
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ cmp DWORD PTR [esp+172], 13
+ jl L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
+ vaesenc xmm7, xmm7, xmm0
+ vaesenc xmm7, xmm7, [ebp+208]
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last:
+ vaesenclast xmm7, xmm7, xmm0
+ vmovdqu xmm3, OWORD PTR [esi+ebx]
+ vpxor xmm6, xmm2, xmm1
+ vpxor xmm7, xmm7, xmm3
+ vmovdqu OWORD PTR [edi+ebx], xmm7
+ vpshufb xmm7, xmm7, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm6, xmm6, xmm7
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ add ebx, 16
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_avx2_last_block_start
+L_AES_GCM_encrypt_avx2_last_block_ghash:
+ ; ghash_gfmul_red
+ vpclmulqdq xmm2, xmm6, xmm5, 16
+ vpclmulqdq xmm1, xmm6, xmm5, 1
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm6, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm6, xmm6, xmm0
+L_AES_GCM_encrypt_avx2_last_block_done:
+ mov ecx, DWORD PTR [esp+152]
+ mov edx, DWORD PTR [esp+152]
+ and ecx, 15
+ jz L_AES_GCM_encrypt_avx2_done_enc
+ ; aesenc_last15_enc
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpxor xmm4, xmm4, [ebp]
+ vaesenc xmm4, xmm4, [ebp+16]
+ vaesenc xmm4, xmm4, [ebp+32]
+ vaesenc xmm4, xmm4, [ebp+48]
+ vaesenc xmm4, xmm4, [ebp+64]
+ vaesenc xmm4, xmm4, [ebp+80]
+ vaesenc xmm4, xmm4, [ebp+96]
+ vaesenc xmm4, xmm4, [ebp+112]
+ vaesenc xmm4, xmm4, [ebp+128]
+ vaesenc xmm4, xmm4, [ebp+144]
+ cmp DWORD PTR [esp+172], 11
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm0
+ vaesenc xmm4, xmm4, [ebp+176]
+ cmp DWORD PTR [esp+172], 13
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm0
+ vaesenc xmm4, xmm4, [ebp+208]
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last:
+ vaesenclast xmm4, xmm4, xmm0
+ xor ecx, ecx
+ vpxor xmm0, xmm0, xmm0
+ vmovdqu OWORD PTR [esp], xmm4
+ vmovdqu OWORD PTR [esp+16], xmm0
+L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop:
+ movzx eax, BYTE PTR [esi+ebx]
+ xor al, BYTE PTR [esp+ecx]
+ mov BYTE PTR [esp+ecx+16], al
+ mov BYTE PTR [edi+ebx], al
+ inc ebx
+ inc ecx
+ cmp ebx, edx
+ jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop
+L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_finish_enc:
+ vmovdqu xmm4, OWORD PTR [esp+16]
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm6, xmm6, xmm4
+ ; ghash_gfmul_red
+ vpclmulqdq xmm2, xmm6, xmm5, 16
+ vpclmulqdq xmm1, xmm6, xmm5, 1
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm6, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm6, xmm6, xmm0
+L_AES_GCM_encrypt_avx2_done_enc:
+ vmovdqu xmm7, OWORD PTR [esp+80]
+ ; calc_tag
+ mov ecx, DWORD PTR [esp+152]
+ shl ecx, 3
+ vpinsrd xmm0, xmm0, ecx, 0
+ mov ecx, DWORD PTR [esp+156]
+ shl ecx, 3
+ vpinsrd xmm0, xmm0, ecx, 2
+ mov ecx, DWORD PTR [esp+152]
+ shr ecx, 29
+ vpinsrd xmm0, xmm0, ecx, 1
+ mov ecx, DWORD PTR [esp+156]
+ shr ecx, 29
+ vpinsrd xmm0, xmm0, ecx, 3
+ vpxor xmm0, xmm0, xmm6
+ ; ghash_gfmul_red
+ vpclmulqdq xmm4, xmm0, xmm5, 16
+ vpclmulqdq xmm3, xmm0, xmm5, 1
+ vpclmulqdq xmm2, xmm0, xmm5, 0
+ vpxor xmm4, xmm4, xmm3
+ vpslldq xmm3, xmm4, 8
+ vpsrldq xmm4, xmm4, 8
+ vpxor xmm3, xmm3, xmm2
+ vpclmulqdq xmm0, xmm0, xmm5, 17
+ vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm3, xmm3, 78
+ vpxor xmm3, xmm3, xmm2
+ vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm3, xmm3, 78
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm0, xmm0, xmm3
+ vpxor xmm0, xmm0, xmm2
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm0, xmm0, xmm7
+ mov edi, DWORD PTR [esp+148]
+ mov ebx, DWORD PTR [esp+164]
+ ; store_tag
+ cmp ebx, 16
+ je L_AES_GCM_encrypt_avx2_store_tag_16
+ xor ecx, ecx
+ vmovdqu OWORD PTR [esp], xmm0
+L_AES_GCM_encrypt_avx2_store_tag_loop:
+ movzx eax, BYTE PTR [esp+ecx]
+ mov BYTE PTR [edi+ecx], al
+ inc ecx
+ cmp ecx, ebx
+ jne L_AES_GCM_encrypt_avx2_store_tag_loop
+ jmp L_AES_GCM_encrypt_avx2_store_tag_done
+L_AES_GCM_encrypt_avx2_store_tag_16:
+ vmovdqu OWORD PTR [edi], xmm0
+L_AES_GCM_encrypt_avx2_store_tag_done:
+ add esp, 112
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_encrypt_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_avx2 PROC
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 176
+ mov esi, DWORD PTR [esp+208]
+ mov ebp, DWORD PTR [esp+232]
+ vpxor xmm4, xmm4, xmm4
+ mov edx, DWORD PTR [esp+224]
+ cmp edx, 12
+ je L_AES_GCM_decrypt_avx2_iv_12
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqu xmm5, OWORD PTR [ebp]
+ vaesenc xmm5, xmm5, [ebp+16]
+ vaesenc xmm5, xmm5, [ebp+32]
+ vaesenc xmm5, xmm5, [ebp+48]
+ vaesenc xmm5, xmm5, [ebp+64]
+ vaesenc xmm5, xmm5, [ebp+80]
+ vaesenc xmm5, xmm5, [ebp+96]
+ vaesenc xmm5, xmm5, [ebp+112]
+ vaesenc xmm5, xmm5, [ebp+128]
+ vaesenc xmm5, xmm5, [ebp+144]
+ cmp DWORD PTR [esp+236], 11
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm5, xmm5, [ebp+176]
+ cmp DWORD PTR [esp+236], 13
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm5, xmm5, [ebp+208]
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm5, xmm5, xmm0
+ vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ cmp edx, 0
+ mov ecx, 0
+ je L_AES_GCM_decrypt_avx2_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_decrypt_avx2_calc_iv_lt16
+ and edx, 4294967280
+L_AES_GCM_decrypt_avx2_calc_iv_16_loop:
+ vmovdqu xmm0, OWORD PTR [esi+ecx]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx
+ vpclmulqdq xmm2, xmm5, xmm4, 16
+ vpclmulqdq xmm1, xmm5, xmm4, 1
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm7, xmm0, xmm1
+ vpxor xmm4, xmm3, xmm2
+ ; ghash_mid
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; ghash_red
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm7, xmm2, 16
+ vpshufd xmm1, xmm7, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx2_calc_iv_16_loop
+ mov edx, DWORD PTR [esp+224]
+ cmp ecx, edx
+ je L_AES_GCM_decrypt_avx2_calc_iv_done
+L_AES_GCM_decrypt_avx2_calc_iv_lt16:
+ vpxor xmm0, xmm0, xmm0
+ xor ebx, ebx
+ vmovdqu OWORD PTR [esp], xmm0
+L_AES_GCM_decrypt_avx2_calc_iv_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx2_calc_iv_loop
+ vmovdqu xmm0, OWORD PTR [esp]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx
+ vpclmulqdq xmm2, xmm5, xmm4, 16
+ vpclmulqdq xmm1, xmm5, xmm4, 1
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm7, xmm0, xmm1
+ vpxor xmm4, xmm3, xmm2
+ ; ghash_mid
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; ghash_red
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm7, xmm2, 16
+ vpshufd xmm1, xmm7, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+L_AES_GCM_decrypt_avx2_calc_iv_done:
+ ; T = Encrypt counter
+ vpxor xmm0, xmm0, xmm0
+ shl edx, 3
+ vpinsrd xmm0, xmm0, edx, 0
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx
+ vpclmulqdq xmm2, xmm5, xmm4, 16
+ vpclmulqdq xmm1, xmm5, xmm4, 1
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm7, xmm0, xmm1
+ vpxor xmm4, xmm3, xmm2
+ ; ghash_mid
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; ghash_red
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm7, xmm2, 16
+ vpshufd xmm1, xmm7, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ ; Encrypt counter
+ vmovdqu xmm6, OWORD PTR [ebp]
+ vpxor xmm6, xmm6, xmm4
+ vaesenc xmm6, xmm6, [ebp+16]
+ vaesenc xmm6, xmm6, [ebp+32]
+ vaesenc xmm6, xmm6, [ebp+48]
+ vaesenc xmm6, xmm6, [ebp+64]
+ vaesenc xmm6, xmm6, [ebp+80]
+ vaesenc xmm6, xmm6, [ebp+96]
+ vaesenc xmm6, xmm6, [ebp+112]
+ vaesenc xmm6, xmm6, [ebp+128]
+ vaesenc xmm6, xmm6, [ebp+144]
+ cmp DWORD PTR [esp+236], 11
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
+ vaesenc xmm6, xmm6, xmm0
+ vaesenc xmm6, xmm6, [ebp+176]
+ cmp DWORD PTR [esp+236], 13
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
+ vaesenc xmm6, xmm6, xmm0
+ vaesenc xmm6, xmm6, [ebp+208]
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm6, xmm6, xmm0
+ jmp L_AES_GCM_decrypt_avx2_iv_done
+L_AES_GCM_decrypt_avx2_iv_12:
+ ; Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ vmovdqu xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
+ vmovdqu xmm5, OWORD PTR [ebp]
+ vpblendd xmm4, xmm4, [esi], 7
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqu xmm7, OWORD PTR [ebp+16]
+ vpxor xmm6, xmm4, xmm5
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm6, xmm6, xmm7
+ vmovdqu xmm0, OWORD PTR [ebp+32]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+48]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+64]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+80]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+96]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+112]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+128]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+144]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ cmp DWORD PTR [esp+236], 11
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx2_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+176]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ cmp DWORD PTR [esp+236], 13
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx2_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+208]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm6, xmm6, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx2_calc_iv_12_last:
+ vaesenclast xmm5, xmm5, xmm0
+ vaesenclast xmm6, xmm6, xmm0
+ vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask
+L_AES_GCM_decrypt_avx2_iv_done:
+ vmovdqu OWORD PTR [esp+80], xmm6
+ vpxor xmm6, xmm6, xmm6
+ mov esi, DWORD PTR [esp+204]
+ ; Additional authentication data
+ mov edx, DWORD PTR [esp+220]
+ cmp edx, 0
+ je L_AES_GCM_decrypt_avx2_calc_aad_done
+ xor ecx, ecx
+ cmp edx, 16
+ jl L_AES_GCM_decrypt_avx2_calc_aad_lt16
+ and edx, 4294967280
+L_AES_GCM_decrypt_avx2_calc_aad_16_loop:
+ vmovdqu xmm0, OWORD PTR [esi+ecx]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm6, xmm6, xmm0
+ ; ghash_gfmul_avx
+ vpclmulqdq xmm2, xmm5, xmm6, 16
+ vpclmulqdq xmm1, xmm5, xmm6, 1
+ vpclmulqdq xmm0, xmm5, xmm6, 0
+ vpclmulqdq xmm3, xmm5, xmm6, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm7, xmm0, xmm1
+ vpxor xmm6, xmm3, xmm2
+ ; ghash_mid
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm6, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm6, xmm6, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm6, xmm6, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm6, xmm6, xmm1
+ ; ghash_red
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm7, xmm2, 16
+ vpshufd xmm1, xmm7, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm6, xmm6, xmm1
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx2_calc_aad_16_loop
+ mov edx, DWORD PTR [esp+220]
+ cmp ecx, edx
+ je L_AES_GCM_decrypt_avx2_calc_aad_done
+L_AES_GCM_decrypt_avx2_calc_aad_lt16:
+ vpxor xmm0, xmm0, xmm0
+ xor ebx, ebx
+ vmovdqu OWORD PTR [esp], xmm0
+L_AES_GCM_decrypt_avx2_calc_aad_loop:
+ movzx eax, BYTE PTR [esi+ecx]
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx2_calc_aad_loop
+ vmovdqu xmm0, OWORD PTR [esp]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm6, xmm6, xmm0
+ ; ghash_gfmul_avx
+ vpclmulqdq xmm2, xmm5, xmm6, 16
+ vpclmulqdq xmm1, xmm5, xmm6, 1
+ vpclmulqdq xmm0, xmm5, xmm6, 0
+ vpclmulqdq xmm3, xmm5, xmm6, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm7, xmm0, xmm1
+ vpxor xmm6, xmm3, xmm2
+ ; ghash_mid
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm6, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm6, xmm6, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm6, xmm6, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm6, xmm6, xmm1
+ ; ghash_red
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm7, xmm2, 16
+ vpshufd xmm1, xmm7, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm6, xmm6, xmm1
+L_AES_GCM_decrypt_avx2_calc_aad_done:
+ mov esi, DWORD PTR [esp+196]
+ mov edi, DWORD PTR [esp+200]
+ ; Calculate counter and H
+ vpsrlq xmm1, xmm5, 63
+ vpsllq xmm0, xmm5, 1
+ vpslldq xmm1, xmm1, 8
+ vpor xmm0, xmm0, xmm1
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vpxor xmm5, xmm5, xmm0
+ xor ebx, ebx
+ cmp DWORD PTR [esp+216], 64
+ mov eax, DWORD PTR [esp+216]
+ jl L_AES_GCM_decrypt_avx2_done_64
+ and eax, 4294967232
+ vmovdqu OWORD PTR [esp+64], xmm4
+ vmovdqu OWORD PTR [esp+96], xmm6
+ vmovdqu xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128
+ ; H ^ 1
+ vmovdqu OWORD PTR [esp], xmm5
+ vmovdqu xmm2, xmm5
+ ; H ^ 2
+ vpclmulqdq xmm5, xmm2, xmm2, 0
+ vpclmulqdq xmm6, xmm2, xmm2, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm0, xmm6, xmm5
+ vmovdqu OWORD PTR [esp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red
+ vpclmulqdq xmm6, xmm2, xmm0, 16
+ vpclmulqdq xmm5, xmm2, xmm0, 1
+ vpclmulqdq xmm4, xmm2, xmm0, 0
+ vpxor xmm6, xmm6, xmm5
+ vpslldq xmm5, xmm6, 8
+ vpsrldq xmm6, xmm6, 8
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm1, xmm2, xmm0, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [esp+32], xmm1
+ ; H ^ 4
+ vpclmulqdq xmm5, xmm0, xmm0, 0
+ vpclmulqdq xmm6, xmm0, xmm0, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm6, xmm5
+ vmovdqu OWORD PTR [esp+48], xmm2
+ vmovdqu xmm6, OWORD PTR [esp+96]
+ cmp edi, esi
+ jne L_AES_GCM_decrypt_avx2_ghash_64
+L_AES_GCM_decrypt_avx2_ghash_64_inplace:
+ ; aesenc_64_ghash
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; aesenc_64
+ ; aesenc_ctr
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vpshufb xmm0, xmm4, xmm7
+ vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four
+ vpshufb xmm3, xmm3, xmm7
+ ; aesenc_xor
+ vmovdqu xmm7, OWORD PTR [ebp]
+ vmovdqu OWORD PTR [esp+64], xmm4
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+236], 11
+ vmovdqu xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+236], 13
+ vmovdqu xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done:
+ ; aesenc_last
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ecx]
+ vmovdqu xmm4, OWORD PTR [ecx+16]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [esp+112], xmm7
+ vmovdqu OWORD PTR [esp+128], xmm4
+ vmovdqu OWORD PTR [edx], xmm0
+ vmovdqu OWORD PTR [edx+16], xmm1
+ vmovdqu xmm7, OWORD PTR [ecx+32]
+ vmovdqu xmm4, OWORD PTR [ecx+48]
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm4
+ vmovdqu OWORD PTR [esp+144], xmm7
+ vmovdqu OWORD PTR [esp+160], xmm4
+ vmovdqu OWORD PTR [edx+32], xmm2
+ vmovdqu OWORD PTR [edx+48], xmm3
+ ; pclmul_1
+ vmovdqu xmm1, OWORD PTR [esp+112]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vmovdqu xmm2, OWORD PTR [esp+48]
+ vpxor xmm1, xmm1, xmm6
+ vpclmulqdq xmm5, xmm1, xmm2, 16
+ vpclmulqdq xmm3, xmm1, xmm2, 1
+ vpclmulqdq xmm6, xmm1, xmm2, 0
+ vpclmulqdq xmm7, xmm1, xmm2, 17
+ ; pclmul_2
+ vmovdqu xmm1, OWORD PTR [esp+128]
+ vmovdqu xmm0, OWORD PTR [esp+32]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n
+ vmovdqu xmm1, OWORD PTR [esp+144]
+ vmovdqu xmm0, OWORD PTR [esp+16]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n
+ vmovdqu xmm1, OWORD PTR [esp+160]
+ vmovdqu xmm0, OWORD PTR [esp]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; aesenc_pclmul_l
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm5, xmm5, xmm3
+ vpslldq xmm1, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm7, xmm7, xmm5
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm6, xmm6, xmm7
+ ; aesenc_64_ghash - end
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_avx2_ghash_64_inplace
+ jmp L_AES_GCM_decrypt_avx2_ghash_64_done
+L_AES_GCM_decrypt_avx2_ghash_64:
+ ; aesenc_64_ghash
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; aesenc_64
+ ; aesenc_ctr
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vpshufb xmm0, xmm4, xmm7
+ vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four
+ vpshufb xmm3, xmm3, xmm7
+ ; aesenc_xor
+ vmovdqu xmm7, OWORD PTR [ebp]
+ vmovdqu OWORD PTR [esp+64], xmm4
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+236], 11
+ vmovdqu xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+236], 13
+ vmovdqu xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done:
+ ; aesenc_last
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ecx]
+ vmovdqu xmm4, OWORD PTR [ecx+16]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [edx], xmm0
+ vmovdqu OWORD PTR [edx+16], xmm1
+ vmovdqu xmm7, OWORD PTR [ecx+32]
+ vmovdqu xmm4, OWORD PTR [ecx+48]
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm4
+ vmovdqu OWORD PTR [ecx+32], xmm7
+ vmovdqu OWORD PTR [ecx+48], xmm4
+ vmovdqu OWORD PTR [edx+32], xmm2
+ vmovdqu OWORD PTR [edx+48], xmm3
+ ; pclmul_1
+ vmovdqu xmm1, OWORD PTR [ecx]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vmovdqu xmm2, OWORD PTR [esp+48]
+ vpxor xmm1, xmm1, xmm6
+ vpclmulqdq xmm5, xmm1, xmm2, 16
+ vpclmulqdq xmm3, xmm1, xmm2, 1
+ vpclmulqdq xmm6, xmm1, xmm2, 0
+ vpclmulqdq xmm7, xmm1, xmm2, 17
+ ; pclmul_2
+ vmovdqu xmm1, OWORD PTR [ecx+16]
+ vmovdqu xmm0, OWORD PTR [esp+32]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n
+ vmovdqu xmm1, OWORD PTR [ecx+32]
+ vmovdqu xmm0, OWORD PTR [esp+16]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n
+ vmovdqu xmm1, OWORD PTR [ecx+48]
+ vmovdqu xmm0, OWORD PTR [esp]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; aesenc_pclmul_l
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm5, xmm5, xmm3
+ vpslldq xmm1, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm7, xmm7, xmm5
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm6, xmm6, xmm7
+ ; aesenc_64_ghash - end
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_avx2_ghash_64
+L_AES_GCM_decrypt_avx2_ghash_64_done:
+ vmovdqu xmm5, OWORD PTR [esp]
+ vmovdqu xmm4, OWORD PTR [esp+64]
+L_AES_GCM_decrypt_avx2_done_64:
+ cmp ebx, DWORD PTR [esp+216]
+ jge L_AES_GCM_decrypt_avx2_done_dec
+ mov eax, DWORD PTR [esp+216]
+ and eax, 4294967280
+ cmp ebx, eax
+ jge L_AES_GCM_decrypt_avx2_last_block_done
+L_AES_GCM_decrypt_avx2_last_block_start:
+ vmovdqu xmm0, OWORD PTR [esi+ebx]
+ vpshufb xmm7, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vmovdqu OWORD PTR [esp+64], xmm4
+ vpxor xmm4, xmm0, xmm6
+ ; aesenc_gfmul_sb
+ vpclmulqdq xmm2, xmm4, xmm5, 1
+ vpclmulqdq xmm3, xmm4, xmm5, 16
+ vpclmulqdq xmm1, xmm4, xmm5, 0
+ vpclmulqdq xmm4, xmm4, xmm5, 17
+ vpxor xmm7, xmm7, [ebp]
+ vaesenc xmm7, xmm7, [ebp+16]
+ vpxor xmm3, xmm3, xmm2
+ vpslldq xmm2, xmm3, 8
+ vpsrldq xmm3, xmm3, 8
+ vaesenc xmm7, xmm7, [ebp+32]
+ vpxor xmm2, xmm2, xmm1
+ vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vaesenc xmm7, xmm7, [ebp+48]
+ vaesenc xmm7, xmm7, [ebp+64]
+ vaesenc xmm7, xmm7, [ebp+80]
+ vpshufd xmm2, xmm2, 78
+ vpxor xmm2, xmm2, xmm1
+ vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vaesenc xmm7, xmm7, [ebp+96]
+ vaesenc xmm7, xmm7, [ebp+112]
+ vaesenc xmm7, xmm7, [ebp+128]
+ vpshufd xmm2, xmm2, 78
+ vaesenc xmm7, xmm7, [ebp+144]
+ vpxor xmm4, xmm4, xmm3
+ vpxor xmm2, xmm2, xmm4
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ cmp DWORD PTR [esp+236], 11
+ jl L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
+ vaesenc xmm7, xmm7, xmm0
+ vaesenc xmm7, xmm7, [ebp+176]
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ cmp DWORD PTR [esp+236], 13
+ jl L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
+ vaesenc xmm7, xmm7, xmm0
+ vaesenc xmm7, xmm7, [ebp+208]
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last:
+ vaesenclast xmm7, xmm7, xmm0
+ vmovdqu xmm3, OWORD PTR [esi+ebx]
+ vpxor xmm6, xmm2, xmm1
+ vpxor xmm7, xmm7, xmm3
+ vmovdqu OWORD PTR [edi+ebx], xmm7
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ add ebx, 16
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_avx2_last_block_start
+L_AES_GCM_decrypt_avx2_last_block_done:
+ mov ecx, DWORD PTR [esp+216]
+ mov edx, DWORD PTR [esp+216]
+ and ecx, 15
+ jz L_AES_GCM_decrypt_avx2_done_dec
+ ; aesenc_last15_dec
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpxor xmm4, xmm4, [ebp]
+ vaesenc xmm4, xmm4, [ebp+16]
+ vaesenc xmm4, xmm4, [ebp+32]
+ vaesenc xmm4, xmm4, [ebp+48]
+ vaesenc xmm4, xmm4, [ebp+64]
+ vaesenc xmm4, xmm4, [ebp+80]
+ vaesenc xmm4, xmm4, [ebp+96]
+ vaesenc xmm4, xmm4, [ebp+112]
+ vaesenc xmm4, xmm4, [ebp+128]
+ vaesenc xmm4, xmm4, [ebp+144]
+ cmp DWORD PTR [esp+236], 11
+ vmovdqu xmm1, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm1
+ vaesenc xmm4, xmm4, [ebp+176]
+ cmp DWORD PTR [esp+236], 13
+ vmovdqu xmm1, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm1
+ vaesenc xmm4, xmm4, [ebp+208]
+ vmovdqu xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last:
+ vaesenclast xmm4, xmm4, xmm1
+ xor ecx, ecx
+ vpxor xmm0, xmm0, xmm0
+ vmovdqu OWORD PTR [esp], xmm4
+ vmovdqu OWORD PTR [esp+16], xmm0
+L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop:
+ movzx eax, BYTE PTR [esi+ebx]
+ mov BYTE PTR [esp+ecx+16], al
+ xor al, BYTE PTR [esp+ecx]
+ mov BYTE PTR [edi+ebx], al
+ inc ebx
+ inc ecx
+ cmp ebx, edx
+ jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop
+ vmovdqu xmm4, OWORD PTR [esp+16]
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm6, xmm6, xmm4
+ ; ghash_gfmul_red
+ vpclmulqdq xmm2, xmm6, xmm5, 16
+ vpclmulqdq xmm1, xmm6, xmm5, 1
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm6, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm6, xmm6, xmm0
+L_AES_GCM_decrypt_avx2_done_dec:
+ vmovdqu xmm7, OWORD PTR [esp+80]
+ ; calc_tag
+ mov ecx, DWORD PTR [esp+216]
+ shl ecx, 3
+ vpinsrd xmm0, xmm0, ecx, 0
+ mov ecx, DWORD PTR [esp+220]
+ shl ecx, 3
+ vpinsrd xmm0, xmm0, ecx, 2
+ mov ecx, DWORD PTR [esp+216]
+ shr ecx, 29
+ vpinsrd xmm0, xmm0, ecx, 1
+ mov ecx, DWORD PTR [esp+220]
+ shr ecx, 29
+ vpinsrd xmm0, xmm0, ecx, 3
+ vpxor xmm0, xmm0, xmm6
+ ; ghash_gfmul_red
+ vpclmulqdq xmm4, xmm0, xmm5, 16
+ vpclmulqdq xmm3, xmm0, xmm5, 1
+ vpclmulqdq xmm2, xmm0, xmm5, 0
+ vpxor xmm4, xmm4, xmm3
+ vpslldq xmm3, xmm4, 8
+ vpsrldq xmm4, xmm4, 8
+ vpxor xmm3, xmm3, xmm2
+ vpclmulqdq xmm0, xmm0, xmm5, 17
+ vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm3, xmm3, 78
+ vpxor xmm3, xmm3, xmm2
+ vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm3, xmm3, 78
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm0, xmm0, xmm3
+ vpxor xmm0, xmm0, xmm2
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm0, xmm0, xmm7
+ mov edi, DWORD PTR [esp+212]
+ mov ebx, DWORD PTR [esp+228]
+ mov ebp, DWORD PTR [esp+240]
+ ; cmp_tag
+ cmp ebx, 16
+ je L_AES_GCM_decrypt_avx2_cmp_tag_16
+ xor edx, edx
+ xor ecx, ecx
+ vmovdqu OWORD PTR [esp], xmm0
+L_AES_GCM_decrypt_avx2_cmp_tag_loop:
+ movzx eax, BYTE PTR [esp+edx]
+ xor al, BYTE PTR [edi+edx]
+ or cl, al
+ inc edx
+ cmp edx, ebx
+ jne L_AES_GCM_decrypt_avx2_cmp_tag_loop
+ cmp cl, 0
+ sete cl
+ jmp L_AES_GCM_decrypt_avx2_cmp_tag_done
+L_AES_GCM_decrypt_avx2_cmp_tag_16:
+ vmovdqu xmm1, OWORD PTR [edi]
+ vpcmpeqb xmm0, xmm0, xmm1
+ vpmovmskb edx, xmm0
+ ; If edx == 0xFFFF (all 16 compared bytes equal) the result is 1, otherwise 0
+ xor ecx, ecx
+ cmp edx, 65535
+ sete cl
+L_AES_GCM_decrypt_avx2_cmp_tag_done:
+ mov DWORD PTR [ebp], ecx
+ add esp, 176
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_decrypt_avx2 ENDP
+_TEXT ENDS
+IFDEF WOLFSSL_AESGCM_STREAM
+_TEXT SEGMENT READONLY PARA
+AES_GCM_init_avx2 PROC ; From round keys + IV: writes the hash key H, the next counter value and the encrypted initial counter block
+ push ebx ; ebx, esi, edi and ebp are all overwritten below, so save them
+ push esi
+ push edi
+ push ebp
+ sub esp, 32 ; local scratch; [esp..esp+15] stages a zero-padded partial IV block
+ mov ebp, DWORD PTR [esp+52] ; arg 1: round-key base (16 saved + 32 local + 4 return address = 52)
+ mov esi, DWORD PTR [esp+60] ; arg 3: IV pointer
+ mov edi, DWORD PTR [esp+76] ; arg 7: destination for the encrypted initial counter block
+ vpxor xmm4, xmm4, xmm4 ; xmm4 = 0, GHASH accumulator used when the IV is not 12 bytes
+ mov edx, DWORD PTR [esp+64] ; arg 4: IV length in bytes
+ cmp edx, 12
+ je L_AES_GCM_init_avx2_iv_12 ; a 12-byte IV needs no GHASH pass
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0): encrypt an all-zero block, accumulated in xmm5
+ vmovdqu xmm5, OWORD PTR [ebp] ; zero block xor round key 0 is round key 0 itself
+ vaesenc xmm5, xmm5, [ebp+16]
+ vaesenc xmm5, xmm5, [ebp+32]
+ vaesenc xmm5, xmm5, [ebp+48]
+ vaesenc xmm5, xmm5, [ebp+64]
+ vaesenc xmm5, xmm5, [ebp+80]
+ vaesenc xmm5, xmm5, [ebp+96]
+ vaesenc xmm5, xmm5, [ebp+112]
+ vaesenc xmm5, xmm5, [ebp+128]
+ vaesenc xmm5, xmm5, [ebp+144]
+ cmp DWORD PTR [esp+56], 11 ; arg 2: round count; below 11 means [ebp+160] is the last round key
+ vmovdqu xmm0, OWORD PTR [ebp+160] ; vmovdqu does not touch EFLAGS, so jl still tests the cmp above
+ jl L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm5, xmm5, [ebp+176]
+ cmp DWORD PTR [esp+56], 13 ; below 13 means [ebp+192] is the last round key
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ jl L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm5, xmm5, [ebp+208]
+ vmovdqu xmm0, OWORD PTR [ebp+224] ; otherwise [ebp+224] is the last round key
+L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm5, xmm5, xmm0 ; xmm0 holds whichever last round key was selected above
+ vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask ; reorder H with the bswap mask (constant defined elsewhere in the file)
+ ; Calc counter: GHASH the IV into xmm4 using H in xmm5
+ ; Initialization vector
+ cmp edx, 0
+ mov ecx, 0 ; ecx = byte offset into the IV; mov leaves the flags from cmp intact
+ je L_AES_GCM_init_avx2_calc_iv_done ; empty IV: only the length block is hashed
+ cmp edx, 16
+ jl L_AES_GCM_init_avx2_calc_iv_lt16
+ and edx, 4294967280 ; 0xFFFFFFF0: round the length down to whole 16-byte blocks
+L_AES_GCM_init_avx2_calc_iv_16_loop:
+ vmovdqu xmm0, OWORD PTR [esi+ecx] ; next full IV block
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm4, xmm4, xmm0 ; accumulator ^= block
+ ; ghash_gfmul_avx: 128x128 carry-less multiply of xmm5 by xmm4, product in xmm4 (high) : xmm6 (low)
+ vpclmulqdq xmm2, xmm5, xmm4, 16 ; imm 16 and 1 are the two cross products
+ vpclmulqdq xmm1, xmm5, xmm4, 1
+ vpclmulqdq xmm0, xmm5, xmm4, 0 ; low qword * low qword
+ vpclmulqdq xmm3, xmm5, xmm4, 17 ; high qword * high qword
+ vpxor xmm2, xmm2, xmm1 ; combined middle term
+ vpslldq xmm1, xmm2, 8 ; lower half of the middle term goes into the low 128 bits
+ vpsrldq xmm2, xmm2, 8 ; upper half of the middle term goes into the high 128 bits
+ vpxor xmm6, xmm0, xmm1
+ vpxor xmm4, xmm3, xmm2
+ ; ghash_mid: shift the 256-bit product xmm4:xmm6 left by one bit
+ vpsrld xmm0, xmm6, 31 ; top bit of every dword, i.e. the bits shifted out by the vpslld below
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12 ; carry leaving the low 128 bits, positioned for the high half
+ vpslldq xmm0, xmm0, 4 ; move each carry up to the next dword
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; ghash_red: fold the low half xmm6 into xmm4 in two steps using the mod2_128 constant
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm6, xmm2, 16
+ vpshufd xmm1, xmm6, 78 ; 78 = 01001110b swaps the two qwords
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1 ; xmm4 = updated 128-bit GHASH state
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_init_avx2_calc_iv_16_loop
+ mov edx, DWORD PTR [esp+64] ; reload the unrounded IV length
+ cmp ecx, edx
+ je L_AES_GCM_init_avx2_calc_iv_done ; no trailing partial block
+L_AES_GCM_init_avx2_calc_iv_lt16:
+ vpxor xmm0, xmm0, xmm0
+ xor ebx, ebx ; ebx = index into the staging buffer
+ vmovdqu OWORD PTR [esp], xmm0 ; zero the staging buffer so the partial block is zero-padded
+L_AES_GCM_init_avx2_calc_iv_loop:
+ movzx eax, BYTE PTR [esi+ecx] ; copy the remaining IV bytes one at a time
+ mov BYTE PTR [esp+ebx], al
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_init_avx2_calc_iv_loop
+ vmovdqu xmm0, OWORD PTR [esp] ; padded partial block
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx: same multiply as in the 16-byte loop, product in xmm4:xmm6
+ vpclmulqdq xmm2, xmm5, xmm4, 16
+ vpclmulqdq xmm1, xmm5, xmm4, 1
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm6, xmm0, xmm1
+ vpxor xmm4, xmm3, xmm2
+ ; ghash_mid: shift xmm4:xmm6 left by one bit
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; ghash_red: fold xmm6 into xmm4 using the mod2_128 constant
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm6, xmm2, 16
+ vpshufd xmm1, xmm6, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+L_AES_GCM_init_avx2_calc_iv_done:
+ ; Fold the IV bit length into the GHASH state; the result is the counter block encrypted below to give T
+ vpxor xmm0, xmm0, xmm0
+ shl edx, 3 ; IV length in bytes * 8 = length in bits
+ vpinsrd xmm0, xmm0, edx, 0 ; length block: bit count in dword 0, all other dwords zero
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx: multiply by H once more, product in xmm4:xmm6
+ vpclmulqdq xmm2, xmm5, xmm4, 16
+ vpclmulqdq xmm1, xmm5, xmm4, 1
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm6, xmm0, xmm1
+ vpxor xmm4, xmm3, xmm2
+ ; ghash_mid: shift xmm4:xmm6 left by one bit
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; ghash_red: fold xmm6 into xmm4 using the mod2_128 constant
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm6, xmm2, 16
+ vpshufd xmm1, xmm6, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_mask ; xmm4 = counter block in the byte order fed to AES
+ ; Encrypt counter: xmm7 = AES encryption of the counter block in xmm4
+ vmovdqu xmm7, OWORD PTR [ebp]
+ vpxor xmm7, xmm7, xmm4 ; initial AddRoundKey with round key 0
+ vaesenc xmm7, xmm7, [ebp+16]
+ vaesenc xmm7, xmm7, [ebp+32]
+ vaesenc xmm7, xmm7, [ebp+48]
+ vaesenc xmm7, xmm7, [ebp+64]
+ vaesenc xmm7, xmm7, [ebp+80]
+ vaesenc xmm7, xmm7, [ebp+96]
+ vaesenc xmm7, xmm7, [ebp+112]
+ vaesenc xmm7, xmm7, [ebp+128]
+ vaesenc xmm7, xmm7, [ebp+144]
+ cmp DWORD PTR [esp+56], 11 ; same round-count dispatch as for H above
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ jl L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm0
+ vaesenc xmm7, xmm7, [ebp+176]
+ cmp DWORD PTR [esp+56], 13
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ jl L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm0
+ vaesenc xmm7, xmm7, [ebp+208]
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm7, xmm7, xmm0
+ jmp L_AES_GCM_init_avx2_iv_done
+L_AES_GCM_init_avx2_iv_12:
+ ; Calculate values when IV is 12 bytes
+ ; Set counter based on IV: the 12 IV bytes followed by the top dword of the bswap_one constant
+ vmovdqu xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
+ vmovdqu xmm5, OWORD PTR [ebp] ; zero block xor round key 0
+ vpblendd xmm4, xmm4, [esi], 7 ; mask 7 takes dwords 0-2 from the IV and keeps dword 3 of the constant
+ ; H = Encrypt X(=0) and T = Encrypt counter, both blocks stepped through the rounds together
+ vmovdqu xmm6, OWORD PTR [ebp+16]
+ vpxor xmm7, xmm4, xmm5 ; counter block xor round key 0 (xmm5 still holds round key 0 here)
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm7, xmm7, xmm6
+ vmovdqu xmm0, OWORD PTR [ebp+32]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+48]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+64]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+80]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+96]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+112]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+128]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+144]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ cmp DWORD PTR [esp+56], 11 ; round-count dispatch, as in the non-12-byte path
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ jl L_AES_GCM_init_avx2_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+176]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ cmp DWORD PTR [esp+56], 13
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ jl L_AES_GCM_init_avx2_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+208]
+ vaesenc xmm5, xmm5, xmm0
+ vaesenc xmm7, xmm7, xmm0
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_init_avx2_calc_iv_12_last:
+ vaesenclast xmm5, xmm5, xmm0
+ vaesenclast xmm7, xmm7, xmm0
+ vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask ; same reordering of H as in the other path
+L_AES_GCM_init_avx2_iv_done:
+ vmovdqu OWORD PTR [edi], xmm7 ; arg 7 receives the encrypted initial counter block
+ mov ebp, DWORD PTR [esp+68] ; arg 5: destination for H
+ mov edi, DWORD PTR [esp+72] ; arg 6: destination for the counter
+ vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64 ; convert the counter block to the layout vpaddd increments
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one ; step to the next counter value
+ vmovdqu OWORD PTR [ebp], xmm5
+ vmovdqu OWORD PTR [edi], xmm4
+ add esp, 32 ; drop the scratch area
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_init_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_aad_update_avx2 PROC ; Folds consecutive 16-byte input blocks into a 128-bit GHASH state held in memory
+ push esi
+ push edi
+ mov esi, DWORD PTR [esp+12] ; arg 1: input pointer (8 saved + 4 return address = 12)
+ mov edx, DWORD PTR [esp+16] ; arg 2: input length in bytes
+ mov edi, DWORD PTR [esp+20] ; arg 3: 16-byte GHASH state, loaded here and stored back before returning
+ mov eax, DWORD PTR [esp+24] ; arg 4: 16-byte hash key
+ vmovdqu xmm4, OWORD PTR [edi] ; xmm4 = running state
+ vmovdqu xmm5, OWORD PTR [eax] ; xmm5 = hash key
+ xor ecx, ecx ; ecx = byte offset into the input
+L_AES_GCM_aad_update_avx2_16_loop: ; body runs before the bound check. NOTE(review): a zero length would still read one block; confirm callers never pass 0
+ vmovdqu xmm0, OWORD PTR [esi+ecx]
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm4, xmm4, xmm0 ; state ^= block
+ ; ghash_gfmul_avx: 128x128 carry-less multiply of xmm5 by xmm4, product in xmm4 (high) : xmm6 (low)
+ vpclmulqdq xmm2, xmm5, xmm4, 16 ; imm 16 and 1 are the two cross products
+ vpclmulqdq xmm1, xmm5, xmm4, 1
+ vpclmulqdq xmm0, xmm5, xmm4, 0 ; low qword * low qword
+ vpclmulqdq xmm3, xmm5, xmm4, 17 ; high qword * high qword
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm6, xmm0, xmm1
+ vpxor xmm4, xmm3, xmm2
+ ; ghash_mid: shift the 256-bit product xmm4:xmm6 left by one bit
+ vpsrld xmm0, xmm6, 31 ; per-dword bits shifted out by the vpslld below
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12 ; carry leaving the low 128 bits
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; ghash_red: fold the low half xmm6 into xmm4 in two steps using the mod2_128 constant
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm6, xmm2, 16
+ vpshufd xmm1, xmm6, 78 ; 78 = 01001110b swaps the two qwords
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_aad_update_avx2_16_loop ; signed compare of offset against length
+ vmovdqu OWORD PTR [edi], xmm4 ; write the updated state back
+ pop edi
+ pop esi
+ ret
+AES_GCM_aad_update_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_block_avx2 PROC ; Encrypts one counter block, XORs it with 16 input bytes into the output, and stores the incremented counter back.
+ push esi ; esi/edi are saved and restored; after the two pushes the first argument sits at [esp+12]
+ push edi
+ mov ecx, DWORD PTR [esp+12] ; arg 1: base of the AES round keys (16 bytes per round)
+ mov eax, DWORD PTR [esp+16] ; arg 2: number of rounds (compared against 11 and 13 below)
+ mov edi, DWORD PTR [esp+20] ; arg 3: output pointer (16 bytes written)
+ mov esi, DWORD PTR [esp+24] ; arg 4: input pointer (16 bytes read)
+ mov edx, DWORD PTR [esp+28] ; arg 5: pointer to the 16-byte counter block (read and updated)
+ vmovdqu xmm3, OWORD PTR [edx]
+ ; aesenc_block: xmm0 = AES encryption of the shuffled counter, xmm1 = counter for the next block
+ vmovdqu xmm1, xmm3
+ vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_epi64 ; reorder counter bytes for encryption (constant defined outside this view)
+ vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_one ; advance the counter (constant defined outside this view, presumably an increment of 1)
+ vpxor xmm0, xmm0, [ecx] ; round 0: XOR with the first round key
+ vaesenc xmm0, xmm0, [ecx+16] ; rounds 1..9 are common to every key size
+ vaesenc xmm0, xmm0, [ecx+32]
+ vaesenc xmm0, xmm0, [ecx+48]
+ vaesenc xmm0, xmm0, [ecx+64]
+ vaesenc xmm0, xmm0, [ecx+80]
+ vaesenc xmm0, xmm0, [ecx+96]
+ vaesenc xmm0, xmm0, [ecx+112]
+ vaesenc xmm0, xmm0, [ecx+128]
+ vaesenc xmm0, xmm0, [ecx+144]
+ cmp eax, 11
+ vmovdqu xmm2, OWORD PTR [ecx+160] ; vmovdqu leaves the flags from cmp untouched
+ jl L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last ; fewer than 11 rounds: the key at +160 is the final round key
+ vaesenc xmm0, xmm0, xmm2
+ vaesenc xmm0, xmm0, [ecx+176]
+ cmp eax, 13
+ vmovdqu xmm2, OWORD PTR [ecx+192]
+ jl L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last ; fewer than 13 rounds: the key at +192 is the final round key
+ vaesenc xmm0, xmm0, xmm2
+ vaesenc xmm0, xmm0, [ecx+208]
+ vmovdqu xmm2, OWORD PTR [ecx+224] ; otherwise the key at +224 is the final round key
+L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last:
+ vaesenclast xmm0, xmm0, xmm2 ; final round with whichever key was selected above
+ vmovdqu xmm3, xmm1 ; keep the advanced counter
+ vmovdqu xmm1, OWORD PTR [esi] ; load 16 input bytes
+ vpxor xmm0, xmm0, xmm1 ; output = input XOR keystream
+ vmovdqu OWORD PTR [edi], xmm0
+ vmovdqu OWORD PTR [edx], xmm3 ; write the advanced counter back through arg 5
+ pop edi
+ pop esi
+ ret
+AES_GCM_encrypt_block_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_ghash_block_avx2 PROC ; Folds exactly one 16-byte block into a 16-byte running GHASH state; no registers are pushed, so arguments start at [esp+4].
+ mov edx, DWORD PTR [esp+4] ; arg 1: pointer to the 16-byte data block
+ mov eax, DWORD PTR [esp+8] ; arg 2: pointer to the 16-byte running state (read here, written back at the end)
+ mov ecx, DWORD PTR [esp+12] ; arg 3: pointer to the 16-byte multiplier (presumably the hash key H - confirm against caller)
+ vmovdqu xmm4, OWORD PTR [eax] ; xmm4 = running state
+ vmovdqu xmm5, OWORD PTR [ecx] ; xmm5 = multiplier
+ vmovdqu xmm0, OWORD PTR [edx] ; xmm0 = data block
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask ; byte-shuffle with a constant defined outside this view
+ vpxor xmm4, xmm4, xmm0 ; state ^= block
+ ; ghash_gfmul_avx: 128x128-bit carry-less multiply of xmm5 by xmm4 -> 256-bit product in xmm4 (high) : xmm6 (low)
+ vpclmulqdq xmm2, xmm5, xmm4, 16 ; cross product: low qword of xmm5 x high qword of xmm4
+ vpclmulqdq xmm1, xmm5, xmm4, 1 ; cross product: high qword of xmm5 x low qword of xmm4
+ vpclmulqdq xmm0, xmm5, xmm4, 0 ; low x low
+ vpclmulqdq xmm3, xmm5, xmm4, 17 ; high x high
+ vpxor xmm2, xmm2, xmm1 ; combined middle term
+ vpslldq xmm1, xmm2, 8 ; lower 64 bits of the middle term, moved into the upper half of the low result
+ vpsrldq xmm2, xmm2, 8 ; upper 64 bits of the middle term, moved into the lower half of the high result
+ vpxor xmm6, xmm0, xmm1 ; xmm6 = low 128 bits of the product
+ vpxor xmm4, xmm3, xmm2 ; xmm4 = high 128 bits of the product
+ ; ghash_mid: shift the 256-bit product xmm4:xmm6 left by one bit
+ vpsrld xmm0, xmm6, 31 ; top bit of every dword of the low half (the bits that carry out)
+ vpsrld xmm1, xmm4, 31 ; same for the high half
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12 ; carry out of the low half's top dword -> bit 0 of the high half
+ vpslldq xmm0, xmm0, 4 ; move each remaining carry up to the next dword
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; ghash_red: fold the low half (xmm6) into the high half (xmm4) in two steps using the L_aes_gcm_avx2_mod2_128 constant
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm6, xmm2, 16
+ vpshufd xmm1, xmm6, 78 ; 78 = swap the two 64-bit halves
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm4, xmm4, xmm1 ; xmm4 = reduced 128-bit state
+ vmovdqu OWORD PTR [eax], xmm4 ; write the updated state back through arg 2
+ ret
+AES_GCM_ghash_block_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_update_avx2 PROC ; Counter-mode encrypts every whole 16-byte block of the input and folds the produced ciphertext into the GHASH state; updates the state and the counter in place.
+ push ebx ; four saved registers (16 bytes) + 96 bytes of locals: return address at [esp+112], first argument at [esp+116]
+ push esi
+ push edi
+ push ebp
+ sub esp, 96 ; locals: [esp+0..63] = H^1..H^4, [esp+64] = counter, [esp+80] = GHASH state
+ mov esi, DWORD PTR [esp+144] ; arg 8: pointer to the 16-byte counter block (read here, written back at exit)
+ vmovdqu xmm4, OWORD PTR [esi]
+ vmovdqu OWORD PTR [esp+64], xmm4
+ mov esi, DWORD PTR [esp+136] ; arg 6: pointer to the 16-byte GHASH state (read here, written back at exit)
+ mov ebp, DWORD PTR [esp+140] ; arg 7: pointer to the 16-byte hash key H
+ vmovdqu xmm6, OWORD PTR [esi]
+ vmovdqu xmm5, OWORD PTR [ebp]
+ vmovdqu OWORD PTR [esp+80], xmm6
+ mov ebp, DWORD PTR [esp+116] ; arg 1: base of the AES round keys (16 bytes per round)
+ mov edi, DWORD PTR [esp+124] ; arg 3: output pointer
+ mov esi, DWORD PTR [esp+128] ; arg 4: input pointer
+ ; Calculate H: xmm5 = (H << 1 as a 128-bit value), XORed with the mod2_128 constant when the bit shifted out was set
+ vpsrlq xmm1, xmm5, 63
+ vpsllq xmm0, xmm5, 1
+ vpslldq xmm1, xmm1, 8 ; carry from the low qword into the high qword
+ vpor xmm0, xmm0, xmm1
+ vpshufd xmm5, xmm5, 255 ; broadcast the top dword of H
+ vpsrad xmm5, xmm5, 31 ; all ones if the top bit of H was set, otherwise zero
+ vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpxor xmm5, xmm5, xmm0
+ xor ebx, ebx ; ebx = number of bytes processed so far
+ cmp DWORD PTR [esp+132], 64 ; arg 5: byte count
+ mov eax, DWORD PTR [esp+132]
+ jl L_AES_GCM_encrypt_update_avx2_done_64 ; signed compare: fewer than 64 bytes, skip the 4-block path
+ and eax, 4294967232 ; 0xFFFFFFC0: eax = byte count rounded down to a multiple of 64
+ vmovdqu OWORD PTR [esp+64], xmm4 ; xmm4/xmm6 are unchanged since the stores above, so these two stores rewrite the same values
+ vmovdqu OWORD PTR [esp+80], xmm6
+ vmovdqu xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128
+ ; H ^ 1: stored at [esp]
+ vmovdqu OWORD PTR [esp], xmm5
+ vmovdqu xmm2, xmm5
+ ; H ^ 2: square of H (only the low x low and high x high products are needed), reduced and stored at [esp+16]
+ vpclmulqdq xmm5, xmm2, xmm2, 0
+ vpclmulqdq xmm6, xmm2, xmm2, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm0, xmm6, xmm5
+ vmovdqu OWORD PTR [esp+16], xmm0
+ ; H ^ 3: H (xmm2) times H^2 (xmm0), reduced and stored at [esp+32]
+ ; ghash_gfmul_red
+ vpclmulqdq xmm6, xmm2, xmm0, 16
+ vpclmulqdq xmm5, xmm2, xmm0, 1
+ vpclmulqdq xmm4, xmm2, xmm0, 0
+ vpxor xmm6, xmm6, xmm5
+ vpslldq xmm5, xmm6, 8
+ vpsrldq xmm6, xmm6, 8
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm1, xmm2, xmm0, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [esp+32], xmm1
+ ; H ^ 4: square of H^2 (xmm0), reduced and stored at [esp+48]
+ vpclmulqdq xmm5, xmm0, xmm0, 0
+ vpclmulqdq xmm6, xmm0, xmm0, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm6, xmm5
+ vmovdqu OWORD PTR [esp+48], xmm2
+ vmovdqu xmm6, OWORD PTR [esp+80] ; xmm6 was used as scratch above: reload the GHASH state
+ ; First 64 bytes of input: encrypted without any GHASH work because no ciphertext exists yet
+ ; aesenc_64
+ ; aesenc_ctr: xmm0..xmm3 = shuffled counter+0..+3, xmm4 = counter+4 for the next group
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vpshufb xmm0, xmm4, xmm7
+ vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four
+ vpshufb xmm3, xmm3, xmm7
+ ; aesenc_xor: round 0 (XOR with the first round key), then rounds 1..9 on all four blocks
+ vmovdqu xmm7, OWORD PTR [ebp]
+ vmovdqu OWORD PTR [esp+64], xmm4 ; save the advanced counter
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+120], 11 ; arg 2: number of rounds
+ vmovdqu xmm7, OWORD PTR [ebp+160] ; vmovdqu leaves the flags from cmp untouched
+ jl L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done ; fewer than 11 rounds: the key at +160 is the final round key
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+120], 13
+ vmovdqu xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done ; fewer than 13 rounds: the key at +192 is the final round key
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+224] ; otherwise the key at +224 is the final round key
+L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done:
+ ; aesenc_last: final AES round, then XOR the four keystream blocks with 64 input bytes and store them
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [esi]
+ vmovdqu xmm4, OWORD PTR [esi+16]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [edi], xmm0
+ vmovdqu OWORD PTR [edi+16], xmm1
+ vmovdqu xmm7, OWORD PTR [esi+32]
+ vmovdqu xmm4, OWORD PTR [esi+48]
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm4
+ vmovdqu OWORD PTR [edi+32], xmm2
+ vmovdqu OWORD PTR [edi+48], xmm3
+ cmp eax, 64
+ mov ebx, 64 ; 64 bytes done (mov does not change the flags set by cmp)
+ mov ecx, esi ; ecx/edx = input/output position of the group just written
+ mov edx, edi
+ jle L_AES_GCM_encrypt_update_avx2_end_64 ; only one 64-byte group: go hash it
+ ; More 64 bytes of input: each pass encrypts the next group and hashes the previous group's ciphertext
+L_AES_GCM_encrypt_update_avx2_ghash_64:
+ ; aesenc_64_ghash
+ lea ecx, DWORD PTR [esi+ebx] ; ecx = input position of this group
+ lea edx, DWORD PTR [edi+ebx] ; edx = output position of this group
+ ; aesenc_64
+ ; aesenc_ctr: xmm0..xmm3 = shuffled counter+0..+3, xmm4 = counter+4 for the next group
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vpshufb xmm0, xmm4, xmm7
+ vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four
+ vpshufb xmm3, xmm3, xmm7
+ ; aesenc_xor: round 0 (XOR with the first round key), then rounds 1..9 on all four blocks
+ vmovdqu xmm7, OWORD PTR [ebp]
+ vmovdqu OWORD PTR [esp+64], xmm4 ; save the advanced counter
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+120], 11 ; arg 2: number of rounds
+ vmovdqu xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done ; fewer than 11 rounds: the key at +160 is the final round key
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+120], 13
+ vmovdqu xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done ; fewer than 13 rounds: the key at +192 is the final round key
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+224] ; otherwise the key at +224 is the final round key
+L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done:
+ ; aesenc_last: final AES round, then XOR the four keystream blocks with 64 input bytes and store them
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ecx]
+ vmovdqu xmm4, OWORD PTR [ecx+16]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [edx], xmm0
+ vmovdqu OWORD PTR [edx+16], xmm1
+ vmovdqu xmm7, OWORD PTR [ecx+32]
+ vmovdqu xmm4, OWORD PTR [ecx+48]
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm4
+ vmovdqu OWORD PTR [edx+32], xmm2
+ vmovdqu OWORD PTR [edx+48], xmm3
+ ; pclmul_1: (previous group's first ciphertext block ^ GHASH state) times H^4; partial products kept in xmm5 (middle), xmm6 (low), xmm7 (high)
+ vmovdqu xmm1, OWORD PTR [edx+-64]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vmovdqu xmm2, OWORD PTR [esp+48]
+ vpxor xmm1, xmm1, xmm6
+ vpclmulqdq xmm5, xmm1, xmm2, 16
+ vpclmulqdq xmm3, xmm1, xmm2, 1
+ vpclmulqdq xmm6, xmm1, xmm2, 0
+ vpclmulqdq xmm7, xmm1, xmm2, 17
+ ; pclmul_2: previous group's second ciphertext block times H^3
+ vmovdqu xmm1, OWORD PTR [edx+-48]
+ vmovdqu xmm0, OWORD PTR [esp+32]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n: previous group's third ciphertext block times H^2
+ vmovdqu xmm1, OWORD PTR [edx+-32]
+ vmovdqu xmm0, OWORD PTR [esp+16]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n: previous group's fourth ciphertext block times H^1
+ vmovdqu xmm1, OWORD PTR [edx+-16]
+ vmovdqu xmm0, OWORD PTR [esp]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; aesenc_pclmul_l: combine the accumulated partial products and reduce them to the new 128-bit GHASH state in xmm6
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm5, xmm5, xmm3
+ vpslldq xmm1, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm7, xmm7, xmm5
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm6, xmm6, xmm7
+ ; aesenc_64_ghash - end
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_update_avx2_ghash_64 ; signed compare against the 64-byte-aligned length
+L_AES_GCM_encrypt_update_avx2_end_64:
+ vmovdqu OWORD PTR [esp+80], xmm6 ; park the GHASH state: xmm6 becomes the high-product accumulator below. edx still addresses the last 64-byte ciphertext group, which has not been hashed yet
+ vmovdqu xmm3, OWORD PTR [edx+48] ; fourth block of that group times H^1
+ vmovdqu xmm7, OWORD PTR [esp]
+ vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpclmulqdq xmm5, xmm7, xmm3, 16
+ vpclmulqdq xmm1, xmm7, xmm3, 1
+ vpclmulqdq xmm4, xmm7, xmm3, 0
+ vpclmulqdq xmm6, xmm7, xmm3, 17
+ vpxor xmm5, xmm5, xmm1
+ vmovdqu xmm3, OWORD PTR [edx+32] ; third block times H^2
+ vmovdqu xmm7, OWORD PTR [esp+16]
+ vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpclmulqdq xmm2, xmm7, xmm3, 16
+ vpclmulqdq xmm1, xmm7, xmm3, 1
+ vpclmulqdq xmm0, xmm7, xmm3, 0
+ vpclmulqdq xmm3, xmm7, xmm3, 17
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm4, xmm4, xmm0
+ vmovdqu xmm3, OWORD PTR [edx+16] ; second block times H^3
+ vmovdqu xmm7, OWORD PTR [esp+32]
+ vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpclmulqdq xmm2, xmm7, xmm3, 16
+ vpclmulqdq xmm1, xmm7, xmm3, 1
+ vpclmulqdq xmm0, xmm7, xmm3, 0
+ vpclmulqdq xmm3, xmm7, xmm3, 17
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm4, xmm4, xmm0
+ vmovdqu xmm0, OWORD PTR [esp+80] ; parked GHASH state
+ vmovdqu xmm3, OWORD PTR [edx] ; (first block ^ GHASH state) times H^4
+ vmovdqu xmm7, OWORD PTR [esp+48]
+ vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm3, xmm3, xmm0
+ vpclmulqdq xmm2, xmm7, xmm3, 16
+ vpclmulqdq xmm1, xmm7, xmm3, 1
+ vpclmulqdq xmm0, xmm7, xmm3, 0
+ vpclmulqdq xmm3, xmm7, xmm3, 17
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm4, xmm4, xmm0
+ vpslldq xmm7, xmm5, 8 ; split the middle term across the low (xmm4) and high (xmm6) halves
+ vpsrldq xmm5, xmm5, 8
+ vpxor xmm4, xmm4, xmm7
+ vpxor xmm6, xmm6, xmm5
+ ; ghash_red: fold the low half (xmm4) into the high half (xmm6) using the L_aes_gcm_avx2_mod2_128 constant
+ vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm0, xmm4, xmm2, 16
+ vpshufd xmm1, xmm4, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, xmm2, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm6, xmm6, xmm1 ; xmm6 = GHASH state after all 64-byte groups
+ vmovdqu xmm5, OWORD PTR [esp] ; restore xmm5 = H^1 and xmm4 = counter for the single-block path
+ vmovdqu xmm4, OWORD PTR [esp+64]
+L_AES_GCM_encrypt_update_avx2_done_64:
+ cmp ebx, DWORD PTR [esp+132] ; everything consumed already?
+ je L_AES_GCM_encrypt_update_avx2_done_enc
+ mov eax, DWORD PTR [esp+132]
+ and eax, 4294967280 ; 0xFFFFFFF0: eax = byte count rounded down to a multiple of 16
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_update_avx2_last_block_done ; no whole 16-byte block left; a trailing partial block is not handled in this routine
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; aesenc_block: encrypt one counter block (xmm0) and advance the counter (xmm1)
+ vmovdqu xmm1, xmm4
+ vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_one
+ vpxor xmm0, xmm0, [ebp]
+ vaesenc xmm0, xmm0, [ebp+16]
+ vaesenc xmm0, xmm0, [ebp+32]
+ vaesenc xmm0, xmm0, [ebp+48]
+ vaesenc xmm0, xmm0, [ebp+64]
+ vaesenc xmm0, xmm0, [ebp+80]
+ vaesenc xmm0, xmm0, [ebp+96]
+ vaesenc xmm0, xmm0, [ebp+112]
+ vaesenc xmm0, xmm0, [ebp+128]
+ vaesenc xmm0, xmm0, [ebp+144]
+ cmp DWORD PTR [esp+120], 11
+ vmovdqu xmm2, OWORD PTR [ebp+160]
+ jl L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm2
+ vaesenc xmm0, xmm0, [ebp+176]
+ cmp DWORD PTR [esp+120], 13
+ vmovdqu xmm2, OWORD PTR [ebp+192]
+ jl L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last
+ vaesenc xmm0, xmm0, xmm2
+ vaesenc xmm0, xmm0, [ebp+208]
+ vmovdqu xmm2, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last:
+ vaesenclast xmm0, xmm0, xmm2
+ vmovdqu xmm4, xmm1 ; xmm4 = advanced counter
+ vmovdqu xmm1, OWORD PTR [ecx]
+ vpxor xmm0, xmm0, xmm1 ; ciphertext = input XOR keystream
+ vmovdqu OWORD PTR [edx], xmm0
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm6, xmm6, xmm0 ; XOR the ciphertext into the GHASH state; the multiply by H is deferred
+ add ebx, 16
+ cmp ebx, eax
+ jge L_AES_GCM_encrypt_update_avx2_last_block_ghash ; that was the last whole block: only the deferred multiply remains
+L_AES_GCM_encrypt_update_avx2_last_block_start:
+ vpshufb xmm7, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64 ; xmm7 = next counter block to encrypt
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vmovdqu OWORD PTR [esp+64], xmm4 ; spill the advanced counter: xmm4 is used as scratch below
+ ; aesenc_gfmul_sb: AES rounds for this block interleaved with the deferred multiply of the GHASH state (xmm6) by H (xmm5)
+ vpclmulqdq xmm2, xmm6, xmm5, 1
+ vpclmulqdq xmm3, xmm6, xmm5, 16
+ vpclmulqdq xmm1, xmm6, xmm5, 0
+ vpclmulqdq xmm4, xmm6, xmm5, 17
+ vpxor xmm7, xmm7, [ebp]
+ vaesenc xmm7, xmm7, [ebp+16]
+ vpxor xmm3, xmm3, xmm2
+ vpslldq xmm2, xmm3, 8
+ vpsrldq xmm3, xmm3, 8
+ vaesenc xmm7, xmm7, [ebp+32]
+ vpxor xmm2, xmm2, xmm1
+ vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vaesenc xmm7, xmm7, [ebp+48]
+ vaesenc xmm7, xmm7, [ebp+64]
+ vaesenc xmm7, xmm7, [ebp+80]
+ vpshufd xmm2, xmm2, 78
+ vpxor xmm2, xmm2, xmm1
+ vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vaesenc xmm7, xmm7, [ebp+96]
+ vaesenc xmm7, xmm7, [ebp+112]
+ vaesenc xmm7, xmm7, [ebp+128]
+ vpshufd xmm2, xmm2, 78
+ vaesenc xmm7, xmm7, [ebp+144]
+ vpxor xmm4, xmm4, xmm3
+ vpxor xmm2, xmm2, xmm4
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ cmp DWORD PTR [esp+120], 11
+ jl L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
+ vaesenc xmm7, xmm7, xmm0
+ vaesenc xmm7, xmm7, [ebp+176]
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ cmp DWORD PTR [esp+120], 13
+ jl L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
+ vaesenc xmm7, xmm7, xmm0
+ vaesenc xmm7, xmm7, [ebp+208]
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last:
+ vaesenclast xmm7, xmm7, xmm0
+ vmovdqu xmm3, OWORD PTR [esi+ebx]
+ vpxor xmm6, xmm2, xmm1 ; xmm6 = reduced result of the deferred multiply
+ vpxor xmm7, xmm7, xmm3 ; ciphertext = input XOR keystream
+ vmovdqu OWORD PTR [edi+ebx], xmm7
+ vpshufb xmm7, xmm7, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm6, xmm6, xmm7 ; XOR this ciphertext into the state; its multiply is again deferred
+ vmovdqu xmm4, OWORD PTR [esp+64] ; reload the counter spilled at the top of the loop
+ add ebx, 16
+ cmp ebx, eax
+ jl L_AES_GCM_encrypt_update_avx2_last_block_start
+L_AES_GCM_encrypt_update_avx2_last_block_ghash:
+ ; ghash_gfmul_red: perform the final deferred multiply of the GHASH state (xmm6) by H (xmm5) and reduce
+ vpclmulqdq xmm2, xmm6, xmm5, 16
+ vpclmulqdq xmm1, xmm6, xmm5, 1
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm2, xmm2, xmm1
+ vpslldq xmm1, xmm2, 8
+ vpsrldq xmm2, xmm2, 8
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm6, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm1, xmm1, xmm0
+ vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm1, xmm1, 78
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm6, xmm6, xmm0
+L_AES_GCM_encrypt_update_avx2_last_block_done:
+L_AES_GCM_encrypt_update_avx2_done_enc:
+ mov esi, DWORD PTR [esp+136] ; arg 6: GHASH state pointer
+ mov edi, DWORD PTR [esp+144] ; arg 8: counter pointer
+ vmovdqu OWORD PTR [esi], xmm6 ; write back the updated GHASH state
+ vmovdqu OWORD PTR [edi], xmm4 ; write back the advanced counter
+ add esp, 96
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_encrypt_update_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_final_avx2 PROC ; Finishes GHASH by absorbing a block built from two byte counts, XORs in a 16-byte mask block, and writes the requested number of tag bytes.
+ push esi ; three saved registers (12 bytes) + 16 bytes of locals: return address at [esp+28], first argument at [esp+32]
+ push edi
+ push ebp
+ sub esp, 16 ; 16-byte scratch buffer, used only for the byte-wise tag copy
+ mov ebp, DWORD PTR [esp+32] ; arg 1: pointer to the 16-byte GHASH state
+ mov esi, DWORD PTR [esp+52] ; arg 6: pointer to the 16-byte hash key H
+ mov edi, DWORD PTR [esp+56] ; arg 7: pointer to the 16-byte block XORed into the tag (presumably the encrypted initial counter - confirm against caller)
+ vmovdqu xmm4, OWORD PTR [ebp]
+ vmovdqu xmm5, OWORD PTR [esi]
+ vmovdqu xmm6, OWORD PTR [edi]
+ vpsrlq xmm1, xmm5, 63 ; xmm5 = (H << 1 as a 128-bit value), XORed with the mod2_128 constant when the bit shifted out was set
+ vpsllq xmm0, xmm5, 1
+ vpslldq xmm1, xmm1, 8
+ vpor xmm0, xmm0, xmm1
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpxor xmm5, xmm5, xmm0
+ ; calc_tag: build the length block in xmm0 - each byte count becomes a 64-bit bit count (all four dwords are overwritten)
+ mov ecx, DWORD PTR [esp+44] ; arg 4: first byte count
+ shl ecx, 3 ; low 32 bits of count * 8
+ vpinsrd xmm0, xmm0, ecx, 0
+ mov ecx, DWORD PTR [esp+48] ; arg 5: second byte count
+ shl ecx, 3
+ vpinsrd xmm0, xmm0, ecx, 2
+ mov ecx, DWORD PTR [esp+44]
+ shr ecx, 29 ; high 32 bits of count * 8
+ vpinsrd xmm0, xmm0, ecx, 1
+ mov ecx, DWORD PTR [esp+48]
+ shr ecx, 29
+ vpinsrd xmm0, xmm0, ecx, 3
+ vpxor xmm0, xmm0, xmm4 ; length block ^ GHASH state
+ ; ghash_gfmul_red: multiply xmm0 by H (xmm5) and reduce with the L_aes_gcm_avx2_mod2_128 constant
+ vpclmulqdq xmm7, xmm0, xmm5, 16
+ vpclmulqdq xmm3, xmm0, xmm5, 1
+ vpclmulqdq xmm2, xmm0, xmm5, 0
+ vpxor xmm7, xmm7, xmm3
+ vpslldq xmm3, xmm7, 8
+ vpsrldq xmm7, xmm7, 8
+ vpxor xmm3, xmm3, xmm2
+ vpclmulqdq xmm0, xmm0, xmm5, 17
+ vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm3, xmm3, 78
+ vpxor xmm3, xmm3, xmm2
+ vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vpshufd xmm3, xmm3, 78
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm0, xmm0, xmm3
+ vpxor xmm0, xmm0, xmm2
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask ; byte-shuffle with a constant defined outside this view
+ vpxor xmm0, xmm0, xmm6 ; xmm0 = final 16-byte tag value
+ mov edi, DWORD PTR [esp+36] ; arg 2: destination buffer for the tag
+ ; store_tag: a 16-byte tag is stored directly; any other length is copied one byte at a time from the stack
+ cmp DWORD PTR [esp+40], 16 ; arg 3: number of tag bytes to write
+ je L_AES_GCM_encrypt_final_avx2_store_tag_16
+ xor ecx, ecx
+ vmovdqu OWORD PTR [esp], xmm0 ; spill the tag so individual bytes can be addressed
+L_AES_GCM_encrypt_final_avx2_store_tag_loop:
+ movzx eax, BYTE PTR [esp+ecx]
+ mov BYTE PTR [edi+ecx], al
+ inc ecx
+ cmp ecx, DWORD PTR [esp+40]
+ jne L_AES_GCM_encrypt_final_avx2_store_tag_loop ; exits only on equality, tested after the copy. NOTE(review): assumes a tag length of 1..15 here - confirm with caller
+ jmp L_AES_GCM_encrypt_final_avx2_store_tag_done
+L_AES_GCM_encrypt_final_avx2_store_tag_16:
+ vmovdqu OWORD PTR [edi], xmm0
+L_AES_GCM_encrypt_final_avx2_store_tag_done:
+ add esp, 16
+ pop ebp
+ pop edi
+ pop esi
+ ret
+AES_GCM_encrypt_final_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_update_avx2 PROC
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 160
+ mov esi, DWORD PTR [esp+208]
+ vmovdqu xmm4, OWORD PTR [esi]
+ mov esi, DWORD PTR [esp+200]
+ mov ebp, DWORD PTR [esp+204]
+ vmovdqu xmm6, OWORD PTR [esi]
+ vmovdqu xmm5, OWORD PTR [ebp]
+ mov ebp, DWORD PTR [esp+180]
+ mov edi, DWORD PTR [esp+188]
+ mov esi, DWORD PTR [esp+192]
+ ; Calculate H
+ vpsrlq xmm1, xmm5, 63
+ vpsllq xmm0, xmm5, 1
+ vpslldq xmm1, xmm1, 8
+ vpor xmm0, xmm0, xmm1
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpxor xmm5, xmm5, xmm0
+ xor ebx, ebx
+ cmp DWORD PTR [esp+196], 64
+ mov eax, DWORD PTR [esp+196]
+ jl L_AES_GCM_decrypt_update_avx2_done_64
+ and eax, 4294967232
+ vmovdqu OWORD PTR [esp+64], xmm4
+ vmovdqu OWORD PTR [esp+80], xmm6
+ vmovdqu xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128
+ ; H ^ 1
+ vmovdqu OWORD PTR [esp], xmm5
+ vmovdqu xmm2, xmm5
+ ; H ^ 2
+ vpclmulqdq xmm5, xmm2, xmm2, 0
+ vpclmulqdq xmm6, xmm2, xmm2, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm0, xmm6, xmm5
+ vmovdqu OWORD PTR [esp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red
+ vpclmulqdq xmm6, xmm2, xmm0, 16
+ vpclmulqdq xmm5, xmm2, xmm0, 1
+ vpclmulqdq xmm4, xmm2, xmm0, 0
+ vpxor xmm6, xmm6, xmm5
+ vpslldq xmm5, xmm6, 8
+ vpsrldq xmm6, xmm6, 8
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm1, xmm2, xmm0, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [esp+32], xmm1
+ ; H ^ 4
+ vpclmulqdq xmm5, xmm0, xmm0, 0
+ vpclmulqdq xmm6, xmm0, xmm0, 17
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpclmulqdq xmm4, xmm5, xmm3, 16
+ vpshufd xmm5, xmm5, 78
+ vpxor xmm5, xmm5, xmm4
+ vpxor xmm2, xmm6, xmm5
+ vmovdqu OWORD PTR [esp+48], xmm2
+ vmovdqu xmm6, OWORD PTR [esp+80]
+ cmp edi, esi
+ jne L_AES_GCM_decrypt_update_avx2_ghash_64
+L_AES_GCM_decrypt_update_avx2_ghash_64_inplace:
+ ; aesenc_64_ghash
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; aesenc_64
+ ; aesenc_ctr
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vpshufb xmm0, xmm4, xmm7
+ vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four
+ vpshufb xmm3, xmm3, xmm7
+ ; aesenc_xor
+ vmovdqu xmm7, OWORD PTR [ebp]
+ vmovdqu OWORD PTR [esp+64], xmm4
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+184], 11
+ vmovdqu xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+184], 13
+ vmovdqu xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done:
+ ; aesenc_last
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ecx]
+ vmovdqu xmm4, OWORD PTR [ecx+16]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [esp+96], xmm7
+ vmovdqu OWORD PTR [esp+112], xmm4
+ vmovdqu OWORD PTR [edx], xmm0
+ vmovdqu OWORD PTR [edx+16], xmm1
+ vmovdqu xmm7, OWORD PTR [ecx+32]
+ vmovdqu xmm4, OWORD PTR [ecx+48]
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm4
+ vmovdqu OWORD PTR [esp+128], xmm7
+ vmovdqu OWORD PTR [esp+144], xmm4
+ vmovdqu OWORD PTR [edx+32], xmm2
+ vmovdqu OWORD PTR [edx+48], xmm3
+ ; pclmul_1
+ vmovdqu xmm1, OWORD PTR [esp+96]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vmovdqu xmm2, OWORD PTR [esp+48]
+ vpxor xmm1, xmm1, xmm6
+ vpclmulqdq xmm5, xmm1, xmm2, 16
+ vpclmulqdq xmm3, xmm1, xmm2, 1
+ vpclmulqdq xmm6, xmm1, xmm2, 0
+ vpclmulqdq xmm7, xmm1, xmm2, 17
+ ; pclmul_2
+ vmovdqu xmm1, OWORD PTR [esp+112]
+ vmovdqu xmm0, OWORD PTR [esp+32]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n
+ vmovdqu xmm1, OWORD PTR [esp+128]
+ vmovdqu xmm0, OWORD PTR [esp+16]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n
+ vmovdqu xmm1, OWORD PTR [esp+144]
+ vmovdqu xmm0, OWORD PTR [esp]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; aesenc_pclmul_l
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm5, xmm5, xmm3
+ vpslldq xmm1, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm7, xmm7, xmm5
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm6, xmm6, xmm7
+ ; aesenc_64_ghash - end
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_update_avx2_ghash_64_inplace
+ jmp L_AES_GCM_decrypt_update_avx2_ghash_64_done
+L_AES_GCM_decrypt_update_avx2_ghash_64:
+ ; aesenc_64_ghash
+ lea ecx, DWORD PTR [esi+ebx]
+ lea edx, DWORD PTR [edi+ebx]
+ ; aesenc_64
+ ; aesenc_ctr
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vpshufb xmm0, xmm4, xmm7
+ vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two
+ vpshufb xmm1, xmm1, xmm7
+ vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three
+ vpshufb xmm2, xmm2, xmm7
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four
+ vpshufb xmm3, xmm3, xmm7
+ ; aesenc_xor
+ vmovdqu xmm7, OWORD PTR [ebp]
+ vmovdqu OWORD PTR [esp+64], xmm4
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm7
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+16]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+32]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+48]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+64]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+80]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+96]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+112]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+128]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+144]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+184], 11
+ vmovdqu xmm7, OWORD PTR [ebp+160]
+ jl L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+176]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ cmp DWORD PTR [esp+184], 13
+ vmovdqu xmm7, OWORD PTR [ebp+192]
+ jl L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+208]
+ vaesenc xmm0, xmm0, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vaesenc xmm2, xmm2, xmm7
+ vaesenc xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done:
+ ; aesenc_last
+ vaesenclast xmm0, xmm0, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vaesenclast xmm2, xmm2, xmm7
+ vaesenclast xmm3, xmm3, xmm7
+ vmovdqu xmm7, OWORD PTR [ecx]
+ vmovdqu xmm4, OWORD PTR [ecx+16]
+ vpxor xmm0, xmm0, xmm7
+ vpxor xmm1, xmm1, xmm4
+ vmovdqu OWORD PTR [edx], xmm0
+ vmovdqu OWORD PTR [edx+16], xmm1
+ vmovdqu xmm7, OWORD PTR [ecx+32]
+ vmovdqu xmm4, OWORD PTR [ecx+48]
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm3, xmm3, xmm4
+ vmovdqu OWORD PTR [ecx+32], xmm7
+ vmovdqu OWORD PTR [ecx+48], xmm4
+ vmovdqu OWORD PTR [edx+32], xmm2
+ vmovdqu OWORD PTR [edx+48], xmm3
+ ; pclmul_1
+ vmovdqu xmm1, OWORD PTR [ecx]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vmovdqu xmm2, OWORD PTR [esp+48]
+ vpxor xmm1, xmm1, xmm6
+ vpclmulqdq xmm5, xmm1, xmm2, 16
+ vpclmulqdq xmm3, xmm1, xmm2, 1
+ vpclmulqdq xmm6, xmm1, xmm2, 0
+ vpclmulqdq xmm7, xmm1, xmm2, 17
+ ; pclmul_2
+ vmovdqu xmm1, OWORD PTR [ecx+16]
+ vmovdqu xmm0, OWORD PTR [esp+32]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n
+ vmovdqu xmm1, OWORD PTR [ecx+32]
+ vmovdqu xmm0, OWORD PTR [esp+16]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; pclmul_n
+ vmovdqu xmm1, OWORD PTR [ecx+48]
+ vmovdqu xmm0, OWORD PTR [esp]
+ vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpxor xmm5, xmm5, xmm2
+ vpclmulqdq xmm2, xmm1, xmm0, 16
+ vpxor xmm5, xmm5, xmm3
+ vpclmulqdq xmm3, xmm1, xmm0, 1
+ vpxor xmm6, xmm6, xmm4
+ vpclmulqdq xmm4, xmm1, xmm0, 0
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vpxor xmm7, xmm7, xmm1
+ ; aesenc_pclmul_l
+ vpxor xmm5, xmm5, xmm2
+ vpxor xmm6, xmm6, xmm4
+ vpxor xmm5, xmm5, xmm3
+ vpslldq xmm1, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128
+ vpxor xmm6, xmm6, xmm1
+ vpxor xmm7, xmm7, xmm5
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpclmulqdq xmm3, xmm6, xmm0, 16
+ vpshufd xmm6, xmm6, 78
+ vpxor xmm6, xmm6, xmm3
+ vpxor xmm6, xmm6, xmm7
+ ; aesenc_64_ghash - end
+ add ebx, 64
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_update_avx2_ghash_64
+L_AES_GCM_decrypt_update_avx2_ghash_64_done:
+ vmovdqu xmm5, OWORD PTR [esp]
+ vmovdqu xmm4, OWORD PTR [esp+64]
+L_AES_GCM_decrypt_update_avx2_done_64:
+ cmp ebx, DWORD PTR [esp+196]
+ jge L_AES_GCM_decrypt_update_avx2_done_dec
+ mov eax, DWORD PTR [esp+196]
+ and eax, 4294967280
+ cmp ebx, eax
+ jge L_AES_GCM_decrypt_update_avx2_last_block_done
+L_AES_GCM_decrypt_update_avx2_last_block_start:
+ vmovdqu xmm0, OWORD PTR [esi+ebx]
+ vpshufb xmm7, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+ vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one
+ vmovdqu OWORD PTR [esp+64], xmm4
+ vpxor xmm4, xmm0, xmm6
+ ; aesenc_gfmul_sb
+ vpclmulqdq xmm2, xmm4, xmm5, 1
+ vpclmulqdq xmm3, xmm4, xmm5, 16
+ vpclmulqdq xmm1, xmm4, xmm5, 0
+ vpclmulqdq xmm4, xmm4, xmm5, 17
+ vpxor xmm7, xmm7, [ebp]
+ vaesenc xmm7, xmm7, [ebp+16]
+ vpxor xmm3, xmm3, xmm2
+ vpslldq xmm2, xmm3, 8
+ vpsrldq xmm3, xmm3, 8
+ vaesenc xmm7, xmm7, [ebp+32]
+ vpxor xmm2, xmm2, xmm1
+ vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vaesenc xmm7, xmm7, [ebp+48]
+ vaesenc xmm7, xmm7, [ebp+64]
+ vaesenc xmm7, xmm7, [ebp+80]
+ vpshufd xmm2, xmm2, 78
+ vpxor xmm2, xmm2, xmm1
+ vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+ vaesenc xmm7, xmm7, [ebp+96]
+ vaesenc xmm7, xmm7, [ebp+112]
+ vaesenc xmm7, xmm7, [ebp+128]
+ vpshufd xmm2, xmm2, 78
+ vaesenc xmm7, xmm7, [ebp+144]
+ vpxor xmm4, xmm4, xmm3
+ vpxor xmm2, xmm2, xmm4
+ vmovdqu xmm0, OWORD PTR [ebp+160]
+ cmp DWORD PTR [esp+184], 11
+ jl L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
+ vaesenc xmm7, xmm7, xmm0
+ vaesenc xmm7, xmm7, [ebp+176]
+ vmovdqu xmm0, OWORD PTR [ebp+192]
+ cmp DWORD PTR [esp+184], 13
+ jl L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
+ vaesenc xmm7, xmm7, xmm0
+ vaesenc xmm7, xmm7, [ebp+208]
+ vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last:
+ vaesenclast xmm7, xmm7, xmm0
+ vmovdqu xmm3, OWORD PTR [esi+ebx]
+ vpxor xmm6, xmm2, xmm1
+ vpxor xmm7, xmm7, xmm3
+ vmovdqu OWORD PTR [edi+ebx], xmm7
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ add ebx, 16
+ cmp ebx, eax
+ jl L_AES_GCM_decrypt_update_avx2_last_block_start
+L_AES_GCM_decrypt_update_avx2_last_block_done:
+L_AES_GCM_decrypt_update_avx2_done_dec:
+ mov esi, DWORD PTR [esp+200]
+ mov edi, DWORD PTR [esp+208]
+ vmovdqu xmm4, OWORD PTR [esp+64]
+ vmovdqu OWORD PTR [esi], xmm6
+ vmovdqu OWORD PTR [edi], xmm4
+ add esp, 160
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_decrypt_update_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_final_avx2 PROC ; Fold the length block into the hash state, derive the 16-byte tag, compare it with the caller's tag and store 1 (match) / 0 (mismatch) through the 8th stack argument.
+ push ebx ; save the four registers restored in the epilogue
+ push esi
+ push edi
+ push ebp
+ sub esp, 16 ; 16-byte scratch slot at [esp]; used only by the byte-wise compare path
+ mov ebp, DWORD PTR [esp+36] ; 4 pushes + 16 scratch + return address = 36, so this is stack argument 1
+ mov esi, DWORD PTR [esp+56] ; stack argument 6
+ mov edi, DWORD PTR [esp+60] ; stack argument 7
+ vmovdqu xmm4, OWORD PTR [ebp] ; 16 bytes from arg 1 - presumably the running GHASH state; confirm against the C prototype
+ vmovdqu xmm5, OWORD PTR [esi] ; 16 bytes from arg 6 - presumably the hash key H; confirm against the C prototype
+ vmovdqu xmm6, OWORD PTR [edi] ; 16 bytes from arg 7 - presumably the encrypted initial counter block; confirm against the C prototype
+ vpsrlq xmm1, xmm5, 63 ; top bit of each qword of xmm5
+ vpsllq xmm0, xmm5, 1 ; each qword of xmm5 shifted left by one bit
+ vpslldq xmm1, xmm1, 8 ; move the low qword's carried-out bit into the high qword
+ vpor xmm0, xmm0, xmm1 ; xmm0 = xmm5 shifted left by one bit across all 128 bits
+ vpshufd xmm5, xmm5, 255 ; broadcast the top dword of xmm5 to every lane
+ vpsrad xmm5, xmm5, 31 ; all-ones mask when bit 127 of the original xmm5 was set, else zero
+ vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128 ; keep the reduction constant only when that bit was set
+ vpxor xmm5, xmm5, xmm0 ; xmm5 = (xmm5 << 1), conditionally XORed with the reduction constant
+ ; calc_tag: build the length block in xmm0 - two byte counts (args 4 and 5) converted to 64-bit bit counts
+ mov ecx, DWORD PTR [esp+48] ; stack argument 4 (a byte count)
+ shl ecx, 3 ; * 8: low 32 bits of the bit count
+ vpinsrd xmm0, xmm0, ecx, 0
+ mov ecx, DWORD PTR [esp+52] ; stack argument 5 (a byte count)
+ shl ecx, 3 ; * 8: low 32 bits of the bit count
+ vpinsrd xmm0, xmm0, ecx, 2
+ mov ecx, DWORD PTR [esp+48]
+ shr ecx, 29 ; the bits shifted out by the shl 3 above: high 32 bits of the bit count
+ vpinsrd xmm0, xmm0, ecx, 1
+ mov ecx, DWORD PTR [esp+52]
+ shr ecx, 29 ; high 32 bits of the second bit count
+ vpinsrd xmm0, xmm0, ecx, 3 ; all four dwords of xmm0 are now overwritten
+ vpxor xmm0, xmm0, xmm4 ; XOR the length block into the state loaded from arg 1
+ ; ghash_gfmul_red: carry-less multiply xmm0 by xmm5, then reduce with L_aes_gcm_avx2_mod2_128
+ vpclmulqdq xmm7, xmm0, xmm5, 16 ; cross product: low qword of xmm0 * high qword of xmm5
+ vpclmulqdq xmm3, xmm0, xmm5, 1 ; cross product: high qword of xmm0 * low qword of xmm5
+ vpclmulqdq xmm2, xmm0, xmm5, 0 ; low * low
+ vpxor xmm7, xmm7, xmm3 ; combined middle term
+ vpslldq xmm3, xmm7, 8 ; low half of the middle term, moved up to the high qword
+ vpsrldq xmm7, xmm7, 8 ; high half of the middle term, moved down to the low qword
+ vpxor xmm3, xmm3, xmm2 ; xmm3 = low 128 bits of the 256-bit product
+ vpclmulqdq xmm0, xmm0, xmm5, 17 ; high * high
+ vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 ; first reduction step on the low half
+ vpshufd xmm3, xmm3, 78 ; swap the two qwords
+ vpxor xmm3, xmm3, xmm2
+ vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 ; second reduction step
+ vpshufd xmm3, xmm3, 78 ; swap the two qwords
+ vpxor xmm0, xmm0, xmm7 ; xmm0 = high 128 bits of the product
+ vpxor xmm0, xmm0, xmm3 ; fold in the reduced low half
+ vpxor xmm0, xmm0, xmm2
+ vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask ; byte shuffle with the bswap mask constant
+ vpxor xmm0, xmm0, xmm6 ; XOR with the block loaded from arg 7: xmm0 is the computed tag
+ mov esi, DWORD PTR [esp+40] ; stack argument 2: the tag bytes to compare against
+ mov edi, DWORD PTR [esp+64] ; stack argument 8: pointer that receives the DWORD result
+ ; cmp_tag: arg 3 is the number of tag bytes to compare; exactly 16 takes the vector path
+ cmp DWORD PTR [esp+44], 16
+ je L_AES_GCM_decrypt_final_avx2_cmp_tag_16
+ xor ecx, ecx ; byte index
+ xor edx, edx ; dl accumulates the OR of all byte differences
+ vmovdqu OWORD PTR [esp], xmm0 ; spill the computed tag so it can be read byte by byte
+L_AES_GCM_decrypt_final_avx2_cmp_tag_loop:
+ movzx eax, BYTE PTR [esp+ecx] ; computed tag byte
+ xor al, BYTE PTR [esi+ecx] ; non-zero when it differs from the caller's byte
+ or dl, al ; no early exit: every byte is visited whether or not a mismatch was seen
+ inc ecx
+ cmp ecx, DWORD PTR [esp+44] ; NOTE(review): the index is incremented before this test, so a count of 0 would not stop here - presumably callers never pass 0; confirm
+ jne L_AES_GCM_decrypt_final_avx2_cmp_tag_loop
+ cmp dl, 0
+ sete dl ; edx = 1 when no byte differed, else 0 (upper bits were cleared above)
+ jmp L_AES_GCM_decrypt_final_avx2_cmp_tag_done
+L_AES_GCM_decrypt_final_avx2_cmp_tag_16:
+ vmovdqu xmm1, OWORD PTR [esi] ; caller's 16-byte tag
+ vpcmpeqb xmm0, xmm0, xmm1 ; 0xFF in every byte lane that matches
+ vpmovmskb ecx, xmm0 ; one mask bit per byte lane
+ ; ecx == 0xFFFF (all 16 bytes equal) => result 1, else => result 0
+ xor edx, edx
+ cmp ecx, 65535
+ sete dl
+L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
+ mov DWORD PTR [edi], edx ; store the 1/0 result through the pointer from arg 8
+ add esp, 16 ; release the scratch slot
+ pop ebp ; restore saved registers in reverse push order
+ pop edi
+ pop esi
+ pop ebx
+ ret
+AES_GCM_decrypt_final_avx2 ENDP
+_TEXT ENDS
+ENDIF
+ENDIF
+END
diff --git a/wolfcrypt/src/aes_x86_64_asm.S b/wolfcrypt/src/aes_x86_64_asm.S
index 9eb85b49c73..e2c9c318e04 100644
--- a/wolfcrypt/src/aes_x86_64_asm.S
+++ b/wolfcrypt/src/aes_x86_64_asm.S
@@ -1141,7 +1141,7 @@ AES_CTR_encrypt_AESNI:
.p2align 4
_AES_CTR_encrypt_AESNI:
#endif /* __APPLE__ */
- pushq %rbx
+ pushq %r12
movdqu L_aes_ctr_aesni_bswap(%rip), %xmm8
movdqu L_aes_ctr_aesni_one(%rip), %xmm9
pxor %xmm10, %xmm10
@@ -1156,7 +1156,7 @@ L_AES_CTR_encrypt_AESNI_enc_64:
# 64 bytes of input
# aes_ctr_enc_64
leaq (%rdi,%rax,1), %r11
- leaq (%rsi,%rax,1), %rbx
+ leaq (%rsi,%rax,1), %r12
movdqa %xmm7, %xmm0
pshufb %xmm8, %xmm0
paddq %xmm9, %xmm7
@@ -1278,10 +1278,10 @@ L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last:
pxor %xmm4, %xmm2
movdqu 48(%r11), %xmm4
pxor %xmm4, %xmm3
- movdqu %xmm0, (%rbx)
- movdqu %xmm1, 16(%rbx)
- movdqu %xmm2, 32(%rbx)
- movdqu %xmm3, 48(%rbx)
+ movdqu %xmm0, (%r12)
+ movdqu %xmm1, 16(%r12)
+ movdqu %xmm2, 32(%r12)
+ movdqu %xmm3, 48(%r12)
addl $0x40, %eax
cmpl %r10d, %eax
jl L_AES_CTR_encrypt_AESNI_enc_64
@@ -1346,7 +1346,7 @@ L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last:
L_AES_CTR_encrypt_AESNI_done_enc:
pshufb %xmm8, %xmm7
movdqu %xmm7, (%r9)
- popq %rbx
+ popq %r12
repz retq
#ifndef __APPLE__
.size AES_CTR_encrypt_AESNI,.-AES_CTR_encrypt_AESNI
@@ -1972,7 +1972,7 @@ AES_CTR_encrypt_avx1:
.p2align 4
_AES_CTR_encrypt_avx1:
#endif /* __APPLE__ */
- pushq %rbx
+ pushq %r12
vmovdqu L_aes_ctr_avx1_bswap(%rip), %xmm8
vmovdqu L_aes_ctr_avx1_one(%rip), %xmm9
vpxor %xmm10, %xmm10, %xmm10
@@ -1987,7 +1987,7 @@ L_AES_CTR_encrypt_avx1_enc_64:
# 64 bytes of input
# aes_ctr_enc_64
leaq (%rdi,%rax,1), %r11
- leaq (%rsi,%rax,1), %rbx
+ leaq (%rsi,%rax,1), %r12
vpshufb %xmm8, %xmm7, %xmm0
vpaddq %xmm9, %xmm7, %xmm7
vpcmpeqq %xmm10, %xmm7, %xmm11
@@ -2097,10 +2097,10 @@ L_AES_CTR_encrypt_avx1_64_aes_enc_block_last:
vpxor 16(%r11), %xmm1, %xmm1
vpxor 32(%r11), %xmm2, %xmm2
vpxor 48(%r11), %xmm3, %xmm3
- vmovdqu %xmm0, (%rbx)
- vmovdqu %xmm1, 16(%rbx)
- vmovdqu %xmm2, 32(%rbx)
- vmovdqu %xmm3, 48(%rbx)
+ vmovdqu %xmm0, (%r12)
+ vmovdqu %xmm1, 16(%r12)
+ vmovdqu %xmm2, 32(%r12)
+ vmovdqu %xmm3, 48(%r12)
addl $0x40, %eax
cmpl %r10d, %eax
jl L_AES_CTR_encrypt_avx1_enc_64
@@ -2162,7 +2162,7 @@ L_AES_CTR_encrypt_avx1_16_aes_enc_block_last:
L_AES_CTR_encrypt_avx1_done_enc:
vpshufb %xmm8, %xmm7, %xmm7
vmovdqu %xmm7, (%r9)
- popq %rbx
+ popq %r12
repz retq
#ifndef __APPLE__
.size AES_CTR_encrypt_avx1,.-AES_CTR_encrypt_avx1
@@ -2965,7 +2965,7 @@ AES_CTR_encrypt_vaes:
.p2align 4
_AES_CTR_encrypt_vaes:
#endif /* __APPLE__ */
- pushq %rbx
+ pushq %r12
vbroadcasti128 L_aes_ctr_bswap_vaes(%rip), %ymm8
vbroadcasti128 (%r9), %ymm7
vpshufb %ymm8, %ymm7, %ymm7
@@ -3016,7 +3016,7 @@ _AES_CTR_encrypt_vaes:
L_AES_CTR_encrypt_vaes_enc_128:
# 128 bytes of input
leaq (%rdi,%rax,1), %r11
- leaq (%rsi,%rax,1), %rbx
+ leaq (%rsi,%rax,1), %r12
vpshufb %ymm8, %ymm4, %ymm0
vpshufb %ymm8, %ymm5, %ymm1
vpshufb %ymm8, %ymm6, %ymm2
@@ -3142,10 +3142,10 @@ L_AES_CTR_encrypt_vaes_128_aes_enc_block_last:
vpxor 32(%r11), %ymm1, %ymm1
vpxor 64(%r11), %ymm2, %ymm2
vpxor 96(%r11), %ymm3, %ymm3
- vmovdqu %ymm0, (%rbx)
- vmovdqu %ymm1, 32(%rbx)
- vmovdqu %ymm2, 64(%rbx)
- vmovdqu %ymm3, 96(%rbx)
+ vmovdqu %ymm0, (%r12)
+ vmovdqu %ymm1, 32(%r12)
+ vmovdqu %ymm2, 64(%r12)
+ vmovdqu %ymm3, 96(%r12)
addl $0x80, %eax
cmpl %r10d, %eax
jl L_AES_CTR_encrypt_vaes_enc_128
@@ -3159,7 +3159,7 @@ L_AES_CTR_encrypt_vaes_enc_32:
# 32 bytes of input
# aes_ctr_enc_32
leaq (%rdi,%rax,1), %r11
- leaq (%rsi,%rax,1), %rbx
+ leaq (%rsi,%rax,1), %r12
vpaddq 0+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm0
vmovdqa %ymm7, %ymm9
vpand 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14
@@ -3216,7 +3216,7 @@ L_AES_CTR_encrypt_vaes_enc_32:
L_AES_CTR_encrypt_vaes_32_aes_enc_block_last:
vaesenclast %ymm13, %ymm0, %ymm0
vpxor (%r11), %ymm0, %ymm0
- vmovdqu %ymm0, (%rbx)
+ vmovdqu %ymm0, (%r12)
addl $32, %eax
cmpl %r10d, %eax
jl L_AES_CTR_encrypt_vaes_enc_32
@@ -3282,7 +3282,7 @@ L_AES_CTR_encrypt_vaes_16_aes_enc_block_last:
L_AES_CTR_encrypt_vaes_done_enc:
vpshufb %xmm8, %xmm7, %xmm0
vmovdqu %xmm0, (%r9)
- popq %rbx
+ popq %r12
repz retq
#ifndef __APPLE__
.size AES_CTR_encrypt_vaes,.-AES_CTR_encrypt_vaes
@@ -4080,7 +4080,7 @@ AES_CTR_encrypt_avx512:
.p2align 4
_AES_CTR_encrypt_avx512:
#endif /* __APPLE__ */
- pushq %rbx
+ pushq %r12
vbroadcasti32x4 L_aes_ctr_bswap_avx512(%rip), %zmm8
vbroadcasti32x4 (%r9), %zmm7
vpshufb %zmm8, %zmm7, %zmm7
@@ -4141,7 +4141,7 @@ L_AES_CTR_encrypt_avx512_key_cached:
L_AES_CTR_encrypt_avx512_enc_256:
# 256 bytes of input
leaq (%rdi,%rax,1), %r11
- leaq (%rsi,%rax,1), %rbx
+ leaq (%rsi,%rax,1), %r12
vpshufb %zmm8, %zmm4, %zmm0
vpshufb %zmm8, %zmm5, %zmm1
vpshufb %zmm8, %zmm6, %zmm2
@@ -4243,10 +4243,10 @@ L_AES_CTR_encrypt_avx512_256_aes_enc_block_last:
vpxorq 64(%r11), %zmm1, %zmm1
vpxorq 128(%r11), %zmm2, %zmm2
vpxorq 192(%r11), %zmm3, %zmm3
- vmovdqu64 %zmm0, (%rbx)
- vmovdqu64 %zmm1, 64(%rbx)
- vmovdqu64 %zmm2, 128(%rbx)
- vmovdqu64 %zmm3, 192(%rbx)
+ vmovdqu64 %zmm0, (%r12)
+ vmovdqu64 %zmm1, 64(%r12)
+ vmovdqu64 %zmm2, 128(%r12)
+ vmovdqu64 %zmm3, 192(%r12)
addl $0x100, %eax
cmpl %r10d, %eax
jl L_AES_CTR_encrypt_avx512_enc_256
@@ -4260,7 +4260,7 @@ L_AES_CTR_encrypt_avx512_enc_64:
# 64 bytes of input
# aes_ctr_enc_64
leaq (%rdi,%rax,1), %r11
- leaq (%rsi,%rax,1), %rbx
+ leaq (%rsi,%rax,1), %r12
vpaddq 0+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm0
vmovdqa64 %zmm7, %zmm9
vpternlogq $0xb2, 0+L_aes_ctr_inc_avx512(%rip), %zmm0, %zmm9
@@ -4299,7 +4299,7 @@ L_AES_CTR_encrypt_avx512_enc_64:
L_AES_CTR_encrypt_avx512_64_aes_enc_block_last:
vaesenclast %zmm13, %zmm0, %zmm0
vpxorq (%r11), %zmm0, %zmm0
- vmovdqu64 %zmm0, (%rbx)
+ vmovdqu64 %zmm0, (%r12)
addl $0x40, %eax
cmpl %r10d, %eax
jl L_AES_CTR_encrypt_avx512_enc_64
@@ -4362,7 +4362,7 @@ L_AES_CTR_encrypt_avx512_16_aes_enc_block_last:
L_AES_CTR_encrypt_avx512_done_enc:
vpshufb %xmm8, %xmm7, %xmm0
vmovdqu %xmm0, (%r9)
- popq %rbx
+ popq %r12
repz retq
#ifndef __APPLE__
.size AES_CTR_encrypt_avx512,.-AES_CTR_encrypt_avx512
diff --git a/wolfcrypt/src/aes_x86_64_asm.asm b/wolfcrypt/src/aes_x86_64_asm.asm
index 26ccbb5ee8e..aacbd440da3 100644
--- a/wolfcrypt/src/aes_x86_64_asm.asm
+++ b/wolfcrypt/src/aes_x86_64_asm.asm
@@ -470,23 +470,25 @@ AES_256_Key_Expansion_AESNI ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_ECB_encrypt_AESNI PROC
- mov eax, DWORD PTR [rsp+40]
+ push r12
+ push r13
+ mov eax, DWORD PTR [rsp+56]
sub rsp, 16
movdqu OWORD PTR [rsp], xmm6
- xor eax, eax
+ xor r10d, r10d
cmp r8d, 64
- mov r9d, r8d
+ mov r11d, r8d
jl L_AES_ECB_encrypt_AESNI_done_64
- and r9d, 4294967232
+ and r11d, 4294967232
L_AES_ECB_encrypt_AESNI_enc_64:
; 64 bytes of input
; aes_ecb_enc_64
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- movdqu xmm0, OWORD PTR [r10]
- movdqu xmm1, OWORD PTR [r10+16]
- movdqu xmm2, OWORD PTR [r10+32]
- movdqu xmm3, OWORD PTR [r10+48]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ movdqu xmm0, OWORD PTR [r12]
+ movdqu xmm1, OWORD PTR [r12+16]
+ movdqu xmm2, OWORD PTR [r12+32]
+ movdqu xmm3, OWORD PTR [r12+48]
; aes_enc_block
movdqu xmm4, OWORD PTR [r9]
pxor xmm0, xmm4
@@ -568,22 +570,22 @@ L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last:
aesenclast xmm1, xmm4
aesenclast xmm2, xmm4
aesenclast xmm3, xmm4
- movdqu OWORD PTR [r11], xmm0
- movdqu OWORD PTR [r11+16], xmm1
- movdqu OWORD PTR [r11+32], xmm2
- movdqu OWORD PTR [r11+48], xmm3
- add eax, 64
- cmp eax, r9d
+ movdqu OWORD PTR [r13], xmm0
+ movdqu OWORD PTR [r13+16], xmm1
+ movdqu OWORD PTR [r13+32], xmm2
+ movdqu OWORD PTR [r13+48], xmm3
+ add r10d, 64
+ cmp r10d, r11d
jl L_AES_ECB_encrypt_AESNI_enc_64
L_AES_ECB_encrypt_AESNI_done_64:
- cmp eax, r8d
- mov r9d, r8d
+ cmp r10d, r8d
+ mov r11d, r8d
je L_AES_ECB_encrypt_AESNI_done_enc
- and r9d, 4294967280
+ and r11d, 4294967280
L_AES_ECB_encrypt_AESNI_enc_16:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- movdqu xmm0, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ movdqu xmm0, OWORD PTR [r12]
; aes_enc_block
pxor xmm0, [r9]
movdqu xmm5, OWORD PTR [r9+16]
@@ -619,36 +621,40 @@ L_AES_ECB_encrypt_AESNI_enc_16:
movdqu xmm5, OWORD PTR [r9+224]
L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last:
aesenclast xmm0, xmm5
- lea r10, QWORD PTR [rdx+rax]
- movdqu OWORD PTR [r10], xmm0
- add eax, 16
- cmp eax, r9d
+ lea r12, QWORD PTR [rdx+r10]
+ movdqu OWORD PTR [r12], xmm0
+ add r10d, 16
+ cmp r10d, r11d
jl L_AES_ECB_encrypt_AESNI_enc_16
L_AES_ECB_encrypt_AESNI_done_enc:
movdqu xmm6, OWORD PTR [rsp]
add rsp, 16
+ pop r13
+ pop r12
ret
AES_ECB_encrypt_AESNI ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_ECB_decrypt_AESNI PROC
- mov eax, DWORD PTR [rsp+40]
+ push r12
+ push r13
+ mov eax, DWORD PTR [rsp+56]
sub rsp, 16
movdqu OWORD PTR [rsp], xmm6
- xor eax, eax
+ xor r10d, r10d
cmp r8d, 64
- mov r9d, r8d
+ mov r11d, r8d
jl L_AES_ECB_decrypt_AESNI_done_64
- and r9d, 4294967232
+ and r11d, 4294967232
L_AES_ECB_decrypt_AESNI_dec_64:
; 64 bytes of input
; aes_ecb_dec_64
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- movdqu xmm0, OWORD PTR [r10]
- movdqu xmm1, OWORD PTR [r10+16]
- movdqu xmm2, OWORD PTR [r10+32]
- movdqu xmm3, OWORD PTR [r10+48]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ movdqu xmm0, OWORD PTR [r12]
+ movdqu xmm1, OWORD PTR [r12+16]
+ movdqu xmm2, OWORD PTR [r12+32]
+ movdqu xmm3, OWORD PTR [r12+48]
; aes_dec_block
movdqu xmm4, OWORD PTR [r9]
pxor xmm0, xmm4
@@ -730,22 +736,22 @@ L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last:
aesdeclast xmm1, xmm4
aesdeclast xmm2, xmm4
aesdeclast xmm3, xmm4
- movdqu OWORD PTR [r11], xmm0
- movdqu OWORD PTR [r11+16], xmm1
- movdqu OWORD PTR [r11+32], xmm2
- movdqu OWORD PTR [r11+48], xmm3
- add eax, 64
- cmp eax, r9d
+ movdqu OWORD PTR [r13], xmm0
+ movdqu OWORD PTR [r13+16], xmm1
+ movdqu OWORD PTR [r13+32], xmm2
+ movdqu OWORD PTR [r13+48], xmm3
+ add r10d, 64
+ cmp r10d, r11d
jl L_AES_ECB_decrypt_AESNI_dec_64
L_AES_ECB_decrypt_AESNI_done_64:
- cmp eax, r8d
- mov r9d, r8d
+ cmp r10d, r8d
+ mov r11d, r8d
je L_AES_ECB_decrypt_AESNI_done_dec
- and r9d, 4294967280
+ and r11d, 4294967280
L_AES_ECB_decrypt_AESNI_dec_16:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- movdqu xmm0, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ movdqu xmm0, OWORD PTR [r12]
; aes_dec_block
pxor xmm0, [r9]
movdqu xmm5, OWORD PTR [r9+16]
@@ -781,29 +787,33 @@ L_AES_ECB_decrypt_AESNI_dec_16:
movdqu xmm5, OWORD PTR [r9+224]
L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last:
aesdeclast xmm0, xmm5
- lea r10, QWORD PTR [rdx+rax]
- movdqu OWORD PTR [r10], xmm0
- add eax, 16
- cmp eax, r9d
+ lea r12, QWORD PTR [rdx+r10]
+ movdqu OWORD PTR [r12], xmm0
+ add r10d, 16
+ cmp r10d, r11d
jl L_AES_ECB_decrypt_AESNI_dec_16
L_AES_ECB_decrypt_AESNI_done_dec:
movdqu xmm6, OWORD PTR [rsp]
add rsp, 16
+ pop r13
+ pop r12
ret
AES_ECB_decrypt_AESNI ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_CBC_encrypt_AESNI PROC
- mov rax, QWORD PTR [rsp+40]
- mov r10d, DWORD PTR [rsp+48]
+ push r12
+ push r13
+ mov rax, QWORD PTR [rsp+56]
+ mov r10d, DWORD PTR [rsp+64]
movdqu xmm0, OWORD PTR [r8]
- xor eax, eax
- cmp eax, r9d
+ xor r11d, r11d
+ cmp r11d, r9d
je L_AES_CBC_encrypt_AESNI_done
L_AES_CBC_encrypt_AESNI_loop:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- movdqu xmm1, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r11]
+ movdqu xmm1, OWORD PTR [r12]
pxor xmm1, xmm0
; aes_enc_block
pxor xmm1, [rax]
@@ -840,41 +850,45 @@ L_AES_CBC_encrypt_AESNI_loop:
movdqu xmm3, OWORD PTR [rax+224]
L_AES_CBC_encrypt_AESNI_aes_enc_block_last:
aesenclast xmm1, xmm3
- lea r11, QWORD PTR [rdx+rax]
- movdqu OWORD PTR [r11], xmm1
+ lea r13, QWORD PTR [rdx+r11]
+ movdqu OWORD PTR [r13], xmm1
movdqa xmm0, xmm1
- add eax, 16
- cmp eax, r9d
+ add r11d, 16
+ cmp r11d, r9d
jl L_AES_CBC_encrypt_AESNI_loop
L_AES_CBC_encrypt_AESNI_done:
movdqu OWORD PTR [r8], xmm0
+ pop r13
+ pop r12
ret
AES_CBC_encrypt_AESNI ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_CBC_decrypt_AESNI PROC
push r12
- mov rax, QWORD PTR [rsp+48]
- mov r10d, DWORD PTR [rsp+56]
+ push r13
+ push r14
+ mov rax, QWORD PTR [rsp+64]
+ mov r10d, DWORD PTR [rsp+72]
sub rsp, 48
movdqu OWORD PTR [rsp], xmm6
movdqu OWORD PTR [rsp+16], xmm7
movdqu OWORD PTR [rsp+32], xmm8
movdqu xmm4, OWORD PTR [r8]
- xor eax, eax
+ xor r11d, r11d
cmp r9d, 64
- mov r10d, r9d
+ mov r12d, r9d
jl L_AES_CBC_decrypt_AESNI_done_64
- and r10d, 4294967232
+ and r12d, 4294967232
L_AES_CBC_decrypt_AESNI_dec_64:
; 64 bytes of input
; aes_cbc_dec_64
- lea r11, QWORD PTR [rcx+rax]
- lea r12, QWORD PTR [rdx+rax]
- movdqu xmm0, OWORD PTR [r11]
- movdqu xmm1, OWORD PTR [r11+16]
- movdqu xmm2, OWORD PTR [r11+32]
- movdqu xmm3, OWORD PTR [r11+48]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
+ movdqu xmm0, OWORD PTR [r13]
+ movdqu xmm1, OWORD PTR [r13+16]
+ movdqu xmm2, OWORD PTR [r13+32]
+ movdqu xmm3, OWORD PTR [r13+48]
; aes_dec_block
movdqu xmm5, OWORD PTR [rax]
pxor xmm0, xmm5
@@ -957,29 +971,29 @@ L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last:
aesdeclast xmm2, xmm5
aesdeclast xmm3, xmm5
pxor xmm0, xmm4
- movdqu xmm5, OWORD PTR [r11]
+ movdqu xmm5, OWORD PTR [r13]
pxor xmm1, xmm5
- movdqu xmm5, OWORD PTR [r11+16]
+ movdqu xmm5, OWORD PTR [r13+16]
pxor xmm2, xmm5
- movdqu xmm5, OWORD PTR [r11+32]
+ movdqu xmm5, OWORD PTR [r13+32]
pxor xmm3, xmm5
- movdqu xmm4, OWORD PTR [r11+48]
- movdqu OWORD PTR [r12], xmm0
- movdqu OWORD PTR [r12+16], xmm1
- movdqu OWORD PTR [r12+32], xmm2
- movdqu OWORD PTR [r12+48], xmm3
- add eax, 64
- cmp eax, r10d
+ movdqu xmm4, OWORD PTR [r13+48]
+ movdqu OWORD PTR [r14], xmm0
+ movdqu OWORD PTR [r14+16], xmm1
+ movdqu OWORD PTR [r14+32], xmm2
+ movdqu OWORD PTR [r14+48], xmm3
+ add r11d, 64
+ cmp r11d, r12d
jl L_AES_CBC_decrypt_AESNI_dec_64
L_AES_CBC_decrypt_AESNI_done_64:
- cmp eax, r9d
- mov r10d, r9d
+ cmp r11d, r9d
+ mov r12d, r9d
je L_AES_CBC_decrypt_AESNI_done_dec
- and r10d, 4294967280
+ and r12d, 4294967280
L_AES_CBC_decrypt_AESNI_dec_16:
; 16 bytes of input
- lea r11, QWORD PTR [rcx+rax]
- movdqu xmm0, OWORD PTR [r11]
+ lea r13, QWORD PTR [rcx+r11]
+ movdqu xmm0, OWORD PTR [r13]
movdqa xmm8, xmm0
; aes_dec_block
pxor xmm0, [rax]
@@ -1018,10 +1032,10 @@ L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last:
aesdeclast xmm0, xmm6
pxor xmm0, xmm4
movdqa xmm4, xmm8
- lea r11, QWORD PTR [rdx+rax]
- movdqu OWORD PTR [r11], xmm0
- add eax, 16
- cmp eax, r10d
+ lea r13, QWORD PTR [rdx+r11]
+ movdqu OWORD PTR [r13], xmm0
+ add r11d, 16
+ cmp r11d, r12d
jl L_AES_CBC_decrypt_AESNI_dec_16
L_AES_CBC_decrypt_AESNI_done_dec:
movdqu OWORD PTR [r8], xmm4
@@ -1029,27 +1043,29 @@ L_AES_CBC_decrypt_AESNI_done_dec:
movdqu xmm7, OWORD PTR [rsp+16]
movdqu xmm8, OWORD PTR [rsp+32]
add rsp, 48
+ pop r14
+ pop r13
pop r12
ret
AES_CBC_decrypt_AESNI ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_ctr_aesni_bswap QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_aes_ctr_aesni_bswap QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_aes_ctr_aesni_bswap QWORD L_aes_ctr_aesni_bswap
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_ctr_aesni_one QWORD \
- 0000000000000001h, 0000000000000000h
+L_aes_ctr_aesni_one QWORD 0000000000000001h, 0000000000000000h
ptr_L_aes_ctr_aesni_one QWORD L_aes_ctr_aesni_one
_DATA ENDS
_TEXT SEGMENT READONLY PARA
AES_CTR_encrypt_AESNI PROC
- push rbx
- mov eax, DWORD PTR [rsp+48]
- mov r10, QWORD PTR [rsp+56]
+ push r12
+ push r13
+ push r14
+ mov eax, DWORD PTR [rsp+64]
+ mov r10, QWORD PTR [rsp+72]
sub rsp, 96
movdqu OWORD PTR [rsp], xmm6
movdqu OWORD PTR [rsp+16], xmm7
@@ -1062,16 +1078,16 @@ AES_CTR_encrypt_AESNI PROC
pxor xmm10, xmm10
movdqu xmm7, OWORD PTR [r10]
pshufb xmm7, xmm8
- xor eax, eax
+ xor r11d, r11d
cmp r8d, 64
- mov r10d, r8d
+ mov r12d, r8d
jl L_AES_CTR_encrypt_AESNI_done_64
- and r10d, 4294967232
+ and r12d, 4294967232
L_AES_CTR_encrypt_AESNI_enc_64:
; 64 bytes of input
; aes_ctr_enc_64
- lea r11, QWORD PTR [rcx+rax]
- lea rbx, QWORD PTR [rdx+rax]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
movdqa xmm0, xmm7
pshufb xmm0, xmm8
paddq xmm7, xmm9
@@ -1185,26 +1201,26 @@ L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last:
aesenclast xmm1, xmm4
aesenclast xmm2, xmm4
aesenclast xmm3, xmm4
- movdqu xmm4, OWORD PTR [r11]
+ movdqu xmm4, OWORD PTR [r13]
pxor xmm0, xmm4
- movdqu xmm4, OWORD PTR [r11+16]
+ movdqu xmm4, OWORD PTR [r13+16]
pxor xmm1, xmm4
- movdqu xmm4, OWORD PTR [r11+32]
+ movdqu xmm4, OWORD PTR [r13+32]
pxor xmm2, xmm4
- movdqu xmm4, OWORD PTR [r11+48]
+ movdqu xmm4, OWORD PTR [r13+48]
pxor xmm3, xmm4
- movdqu OWORD PTR [rbx], xmm0
- movdqu OWORD PTR [rbx+16], xmm1
- movdqu OWORD PTR [rbx+32], xmm2
- movdqu OWORD PTR [rbx+48], xmm3
- add eax, 64
- cmp eax, r10d
+ movdqu OWORD PTR [r14], xmm0
+ movdqu OWORD PTR [r14+16], xmm1
+ movdqu OWORD PTR [r14+32], xmm2
+ movdqu OWORD PTR [r14+48], xmm3
+ add r11d, 64
+ cmp r11d, r12d
jl L_AES_CTR_encrypt_AESNI_enc_64
L_AES_CTR_encrypt_AESNI_done_64:
- cmp eax, r8d
- mov r10d, r8d
+ cmp r11d, r8d
+ mov r12d, r8d
je L_AES_CTR_encrypt_AESNI_done_enc
- and r10d, 4294967280
+ and r12d, 4294967280
L_AES_CTR_encrypt_AESNI_enc_16:
; 16 bytes of input
movdqa xmm0, xmm7
@@ -1250,13 +1266,13 @@ L_AES_CTR_encrypt_AESNI_enc_16:
movdqu xmm5, OWORD PTR [r9+224]
L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last:
aesenclast xmm0, xmm5
- lea r11, QWORD PTR [rcx+rax]
- movdqu xmm4, OWORD PTR [r11]
+ lea r13, QWORD PTR [rcx+r11]
+ movdqu xmm4, OWORD PTR [r13]
pxor xmm0, xmm4
- lea r11, QWORD PTR [rdx+rax]
- movdqu OWORD PTR [r11], xmm0
- add eax, 16
- cmp eax, r10d
+ lea r13, QWORD PTR [rdx+r11]
+ movdqu OWORD PTR [r13], xmm0
+ add r11d, 16
+ cmp r11d, r12d
jl L_AES_CTR_encrypt_AESNI_enc_16
L_AES_CTR_encrypt_AESNI_done_enc:
pshufb xmm7, xmm8
@@ -1268,30 +1284,34 @@ L_AES_CTR_encrypt_AESNI_done_enc:
movdqu xmm10, OWORD PTR [rsp+64]
movdqu xmm11, OWORD PTR [rsp+80]
add rsp, 96
- pop rbx
+ pop r14
+ pop r13
+ pop r12
ret
AES_CTR_encrypt_AESNI ENDP
_TEXT ENDS
IFDEF HAVE_INTEL_AVX1
_TEXT SEGMENT READONLY PARA
AES_ECB_encrypt_avx1 PROC
- mov eax, DWORD PTR [rsp+40]
+ push r12
+ push r13
+ mov eax, DWORD PTR [rsp+56]
sub rsp, 16
vmovdqu OWORD PTR [rsp], xmm6
- xor eax, eax
+ xor r10d, r10d
cmp r8d, 64
- mov r9d, r8d
+ mov r11d, r8d
jl L_AES_ECB_encrypt_avx1_done_64
- and r9d, 4294967232
+ and r11d, 4294967232
L_AES_ECB_encrypt_avx1_enc_64:
; 64 bytes of input
; aes_ecb_enc_64
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu xmm0, OWORD PTR [r10]
- vmovdqu xmm1, OWORD PTR [r10+16]
- vmovdqu xmm2, OWORD PTR [r10+32]
- vmovdqu xmm3, OWORD PTR [r10+48]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ vmovdqu xmm0, OWORD PTR [r12]
+ vmovdqu xmm1, OWORD PTR [r12+16]
+ vmovdqu xmm2, OWORD PTR [r12+32]
+ vmovdqu xmm3, OWORD PTR [r12+48]
; aes_enc_block
vmovdqu xmm4, OWORD PTR [r9]
vpxor xmm0, xmm0, xmm4
@@ -1373,22 +1393,22 @@ L_AES_ECB_encrypt_avx1_64_aes_enc_block_last:
vaesenclast xmm1, xmm1, xmm4
vaesenclast xmm2, xmm2, xmm4
vaesenclast xmm3, xmm3, xmm4
- vmovdqu OWORD PTR [r11], xmm0
- vmovdqu OWORD PTR [r11+16], xmm1
- vmovdqu OWORD PTR [r11+32], xmm2
- vmovdqu OWORD PTR [r11+48], xmm3
- add eax, 64
- cmp eax, r9d
+ vmovdqu OWORD PTR [r13], xmm0
+ vmovdqu OWORD PTR [r13+16], xmm1
+ vmovdqu OWORD PTR [r13+32], xmm2
+ vmovdqu OWORD PTR [r13+48], xmm3
+ add r10d, 64
+ cmp r10d, r11d
jl L_AES_ECB_encrypt_avx1_enc_64
L_AES_ECB_encrypt_avx1_done_64:
- cmp eax, r8d
- mov r9d, r8d
+ cmp r10d, r8d
+ mov r11d, r8d
je L_AES_ECB_encrypt_avx1_done_enc
- and r9d, 4294967280
+ and r11d, 4294967280
L_AES_ECB_encrypt_avx1_enc_16:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- vmovdqu xmm0, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ vmovdqu xmm0, OWORD PTR [r12]
; aes_enc_block
vpxor xmm0, xmm0, [r9]
vmovdqu xmm5, OWORD PTR [r9+16]
@@ -1424,36 +1444,40 @@ L_AES_ECB_encrypt_avx1_enc_16:
vmovdqu xmm5, OWORD PTR [r9+224]
L_AES_ECB_encrypt_avx1_16_aes_enc_block_last:
vaesenclast xmm0, xmm0, xmm5
- lea r10, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r10], xmm0
- add eax, 16
- cmp eax, r9d
+ lea r12, QWORD PTR [rdx+r10]
+ vmovdqu OWORD PTR [r12], xmm0
+ add r10d, 16
+ cmp r10d, r11d
jl L_AES_ECB_encrypt_avx1_enc_16
L_AES_ECB_encrypt_avx1_done_enc:
vmovdqu xmm6, OWORD PTR [rsp]
add rsp, 16
+ pop r13
+ pop r12
ret
AES_ECB_encrypt_avx1 ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_ECB_decrypt_avx1 PROC
- mov eax, DWORD PTR [rsp+40]
+ push r12
+ push r13
+ mov eax, DWORD PTR [rsp+56]
sub rsp, 16
vmovdqu OWORD PTR [rsp], xmm6
- xor eax, eax
+ xor r10d, r10d
cmp r8d, 64
- mov r9d, r8d
+ mov r11d, r8d
jl L_AES_ECB_decrypt_avx1_done_64
- and r9d, 4294967232
+ and r11d, 4294967232
L_AES_ECB_decrypt_avx1_dec_64:
; 64 bytes of input
; aes_ecb_dec_64
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu xmm0, OWORD PTR [r10]
- vmovdqu xmm1, OWORD PTR [r10+16]
- vmovdqu xmm2, OWORD PTR [r10+32]
- vmovdqu xmm3, OWORD PTR [r10+48]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ vmovdqu xmm0, OWORD PTR [r12]
+ vmovdqu xmm1, OWORD PTR [r12+16]
+ vmovdqu xmm2, OWORD PTR [r12+32]
+ vmovdqu xmm3, OWORD PTR [r12+48]
; aes_dec_block
vmovdqu xmm4, OWORD PTR [r9]
vpxor xmm0, xmm0, xmm4
@@ -1535,22 +1559,22 @@ L_AES_ECB_decrypt_avx1_64_aes_dec_block_last:
vaesdeclast xmm1, xmm1, xmm4
vaesdeclast xmm2, xmm2, xmm4
vaesdeclast xmm3, xmm3, xmm4
- vmovdqu OWORD PTR [r11], xmm0
- vmovdqu OWORD PTR [r11+16], xmm1
- vmovdqu OWORD PTR [r11+32], xmm2
- vmovdqu OWORD PTR [r11+48], xmm3
- add eax, 64
- cmp eax, r9d
+ vmovdqu OWORD PTR [r13], xmm0
+ vmovdqu OWORD PTR [r13+16], xmm1
+ vmovdqu OWORD PTR [r13+32], xmm2
+ vmovdqu OWORD PTR [r13+48], xmm3
+ add r10d, 64
+ cmp r10d, r11d
jl L_AES_ECB_decrypt_avx1_dec_64
L_AES_ECB_decrypt_avx1_done_64:
- cmp eax, r8d
- mov r9d, r8d
+ cmp r10d, r8d
+ mov r11d, r8d
je L_AES_ECB_decrypt_avx1_done_dec
- and r9d, 4294967280
+ and r11d, 4294967280
L_AES_ECB_decrypt_avx1_dec_16:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- vmovdqu xmm0, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ vmovdqu xmm0, OWORD PTR [r12]
; aes_dec_block
vpxor xmm0, xmm0, [r9]
vmovdqu xmm5, OWORD PTR [r9+16]
@@ -1586,29 +1610,33 @@ L_AES_ECB_decrypt_avx1_dec_16:
vmovdqu xmm5, OWORD PTR [r9+224]
L_AES_ECB_decrypt_avx1_16_aes_dec_block_last:
vaesdeclast xmm0, xmm0, xmm5
- lea r10, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r10], xmm0
- add eax, 16
- cmp eax, r9d
+ lea r12, QWORD PTR [rdx+r10]
+ vmovdqu OWORD PTR [r12], xmm0
+ add r10d, 16
+ cmp r10d, r11d
jl L_AES_ECB_decrypt_avx1_dec_16
L_AES_ECB_decrypt_avx1_done_dec:
vmovdqu xmm6, OWORD PTR [rsp]
add rsp, 16
+ pop r13
+ pop r12
ret
AES_ECB_decrypt_avx1 ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_CBC_encrypt_avx1 PROC
- mov rax, QWORD PTR [rsp+40]
- mov r10d, DWORD PTR [rsp+48]
+ push r12
+ push r13
+ mov rax, QWORD PTR [rsp+56]
+ mov r10d, DWORD PTR [rsp+64]
vmovdqu xmm0, OWORD PTR [r8]
- xor eax, eax
- cmp eax, r9d
+ xor r11d, r11d
+ cmp r11d, r9d
je L_AES_CBC_encrypt_avx1_done
L_AES_CBC_encrypt_avx1_loop:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- vmovdqu xmm1, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r11]
+ vmovdqu xmm1, OWORD PTR [r12]
vpxor xmm1, xmm1, xmm0
; aes_enc_block
vpxor xmm1, xmm1, [rax]
@@ -1645,41 +1673,45 @@ L_AES_CBC_encrypt_avx1_loop:
vmovdqu xmm3, OWORD PTR [rax+224]
L_AES_CBC_encrypt_avx1_aes_enc_block_last:
vaesenclast xmm1, xmm1, xmm3
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r11], xmm1
+ lea r13, QWORD PTR [rdx+r11]
+ vmovdqu OWORD PTR [r13], xmm1
vmovdqa xmm0, xmm1
- add eax, 16
- cmp eax, r9d
+ add r11d, 16
+ cmp r11d, r9d
jl L_AES_CBC_encrypt_avx1_loop
L_AES_CBC_encrypt_avx1_done:
vmovdqu OWORD PTR [r8], xmm0
+ pop r13
+ pop r12
ret
AES_CBC_encrypt_avx1 ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_CBC_decrypt_avx1 PROC
push r12
- mov rax, QWORD PTR [rsp+48]
- mov r10d, DWORD PTR [rsp+56]
+ push r13
+ push r14
+ mov rax, QWORD PTR [rsp+64]
+ mov r10d, DWORD PTR [rsp+72]
sub rsp, 48
vmovdqu OWORD PTR [rsp], xmm6
vmovdqu OWORD PTR [rsp+16], xmm7
vmovdqu OWORD PTR [rsp+32], xmm8
vmovdqu xmm4, OWORD PTR [r8]
- xor eax, eax
+ xor r11d, r11d
cmp r9d, 64
- mov r10d, r9d
+ mov r12d, r9d
jl L_AES_CBC_decrypt_avx1_done_64
- and r10d, 4294967232
+ and r12d, 4294967232
L_AES_CBC_decrypt_avx1_dec_64:
; 64 bytes of input
; aes_cbc_dec_64
- lea r11, QWORD PTR [rcx+rax]
- lea r12, QWORD PTR [rdx+rax]
- vmovdqu xmm0, OWORD PTR [r11]
- vmovdqu xmm1, OWORD PTR [r11+16]
- vmovdqu xmm2, OWORD PTR [r11+32]
- vmovdqu xmm3, OWORD PTR [r11+48]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
+ vmovdqu xmm0, OWORD PTR [r13]
+ vmovdqu xmm1, OWORD PTR [r13+16]
+ vmovdqu xmm2, OWORD PTR [r13+32]
+ vmovdqu xmm3, OWORD PTR [r13+48]
; aes_dec_block
vmovdqu xmm5, OWORD PTR [rax]
vpxor xmm0, xmm0, xmm5
@@ -1762,26 +1794,26 @@ L_AES_CBC_decrypt_avx1_64_aes_dec_block_last:
vaesdeclast xmm2, xmm2, xmm5
vaesdeclast xmm3, xmm3, xmm5
vpxor xmm0, xmm0, xmm4
- vpxor xmm1, xmm1, [r11]
- vpxor xmm2, xmm2, [r11+16]
- vpxor xmm3, xmm3, [r11+32]
- vmovdqu xmm4, OWORD PTR [r11+48]
- vmovdqu OWORD PTR [r12], xmm0
- vmovdqu OWORD PTR [r12+16], xmm1
- vmovdqu OWORD PTR [r12+32], xmm2
- vmovdqu OWORD PTR [r12+48], xmm3
- add eax, 64
- cmp eax, r10d
+ vpxor xmm1, xmm1, [r13]
+ vpxor xmm2, xmm2, [r13+16]
+ vpxor xmm3, xmm3, [r13+32]
+ vmovdqu xmm4, OWORD PTR [r13+48]
+ vmovdqu OWORD PTR [r14], xmm0
+ vmovdqu OWORD PTR [r14+16], xmm1
+ vmovdqu OWORD PTR [r14+32], xmm2
+ vmovdqu OWORD PTR [r14+48], xmm3
+ add r11d, 64
+ cmp r11d, r12d
jl L_AES_CBC_decrypt_avx1_dec_64
L_AES_CBC_decrypt_avx1_done_64:
- cmp eax, r9d
- mov r10d, r9d
+ cmp r11d, r9d
+ mov r12d, r9d
je L_AES_CBC_decrypt_avx1_done_dec
- and r10d, 4294967280
+ and r12d, 4294967280
L_AES_CBC_decrypt_avx1_dec_16:
; 16 bytes of input
- lea r11, QWORD PTR [rcx+rax]
- vmovdqu xmm0, OWORD PTR [r11]
+ lea r13, QWORD PTR [rcx+r11]
+ vmovdqu xmm0, OWORD PTR [r13]
vmovdqa xmm8, xmm0
; aes_dec_block
vpxor xmm0, xmm0, [rax]
@@ -1820,10 +1852,10 @@ L_AES_CBC_decrypt_avx1_16_aes_dec_block_last:
vaesdeclast xmm0, xmm0, xmm6
vpxor xmm0, xmm0, xmm4
vmovdqa xmm4, xmm8
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r11], xmm0
- add eax, 16
- cmp eax, r10d
+ lea r13, QWORD PTR [rdx+r11]
+ vmovdqu OWORD PTR [r13], xmm0
+ add r11d, 16
+ cmp r11d, r12d
jl L_AES_CBC_decrypt_avx1_dec_16
L_AES_CBC_decrypt_avx1_done_dec:
vmovdqu OWORD PTR [r8], xmm4
@@ -1831,27 +1863,29 @@ L_AES_CBC_decrypt_avx1_done_dec:
vmovdqu xmm7, OWORD PTR [rsp+16]
vmovdqu xmm8, OWORD PTR [rsp+32]
add rsp, 48
+ pop r14
+ pop r13
pop r12
ret
AES_CBC_decrypt_avx1 ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_ctr_avx1_bswap QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_aes_ctr_avx1_bswap QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_aes_ctr_avx1_bswap QWORD L_aes_ctr_avx1_bswap
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_ctr_avx1_one QWORD \
- 0000000000000001h, 0000000000000000h
+L_aes_ctr_avx1_one QWORD 0000000000000001h, 0000000000000000h
ptr_L_aes_ctr_avx1_one QWORD L_aes_ctr_avx1_one
_DATA ENDS
_TEXT SEGMENT READONLY PARA
AES_CTR_encrypt_avx1 PROC
- push rbx
- mov eax, DWORD PTR [rsp+48]
- mov r10, QWORD PTR [rsp+56]
+ push r12
+ push r13
+ push r14
+ mov eax, DWORD PTR [rsp+64]
+ mov r10, QWORD PTR [rsp+72]
sub rsp, 96
vmovdqu OWORD PTR [rsp], xmm6
vmovdqu OWORD PTR [rsp+16], xmm7
@@ -1864,16 +1898,16 @@ AES_CTR_encrypt_avx1 PROC
vpxor xmm10, xmm10, xmm10
vmovdqu xmm7, OWORD PTR [r10]
vpshufb xmm7, xmm7, xmm8
- xor eax, eax
+ xor r11d, r11d
cmp r8d, 64
- mov r10d, r8d
+ mov r12d, r8d
jl L_AES_CTR_encrypt_avx1_done_64
- and r10d, 4294967232
+ and r12d, 4294967232
L_AES_CTR_encrypt_avx1_enc_64:
; 64 bytes of input
; aes_ctr_enc_64
- lea r11, QWORD PTR [rcx+rax]
- lea rbx, QWORD PTR [rdx+rax]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
vpshufb xmm0, xmm7, xmm8
vpaddq xmm7, xmm7, xmm9
vpcmpeqq xmm11, xmm7, xmm10
@@ -1979,22 +2013,22 @@ L_AES_CTR_encrypt_avx1_64_aes_enc_block_last:
vaesenclast xmm1, xmm1, xmm4
vaesenclast xmm2, xmm2, xmm4
vaesenclast xmm3, xmm3, xmm4
- vpxor xmm0, xmm0, [r11]
- vpxor xmm1, xmm1, [r11+16]
- vpxor xmm2, xmm2, [r11+32]
- vpxor xmm3, xmm3, [r11+48]
- vmovdqu OWORD PTR [rbx], xmm0
- vmovdqu OWORD PTR [rbx+16], xmm1
- vmovdqu OWORD PTR [rbx+32], xmm2
- vmovdqu OWORD PTR [rbx+48], xmm3
- add eax, 64
- cmp eax, r10d
+ vpxor xmm0, xmm0, [r13]
+ vpxor xmm1, xmm1, [r13+16]
+ vpxor xmm2, xmm2, [r13+32]
+ vpxor xmm3, xmm3, [r13+48]
+ vmovdqu OWORD PTR [r14], xmm0
+ vmovdqu OWORD PTR [r14+16], xmm1
+ vmovdqu OWORD PTR [r14+32], xmm2
+ vmovdqu OWORD PTR [r14+48], xmm3
+ add r11d, 64
+ cmp r11d, r12d
jl L_AES_CTR_encrypt_avx1_enc_64
L_AES_CTR_encrypt_avx1_done_64:
- cmp eax, r8d
- mov r10d, r8d
+ cmp r11d, r8d
+ mov r12d, r8d
je L_AES_CTR_encrypt_avx1_done_enc
- and r10d, 4294967280
+ and r12d, 4294967280
L_AES_CTR_encrypt_avx1_enc_16:
; 16 bytes of input
vpshufb xmm0, xmm7, xmm8
@@ -2038,12 +2072,12 @@ L_AES_CTR_encrypt_avx1_enc_16:
vmovdqu xmm5, OWORD PTR [r9+224]
L_AES_CTR_encrypt_avx1_16_aes_enc_block_last:
vaesenclast xmm0, xmm0, xmm5
- lea r11, QWORD PTR [rcx+rax]
- vpxor xmm0, xmm0, [r11]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r11], xmm0
- add eax, 16
- cmp eax, r10d
+ lea r13, QWORD PTR [rcx+r11]
+ vpxor xmm0, xmm0, [r13]
+ lea r13, QWORD PTR [rdx+r11]
+ vmovdqu OWORD PTR [r13], xmm0
+ add r11d, 16
+ cmp r11d, r12d
jl L_AES_CTR_encrypt_avx1_enc_16
L_AES_CTR_encrypt_avx1_done_enc:
vpshufb xmm7, xmm7, xmm8
@@ -2055,7 +2089,9 @@ L_AES_CTR_encrypt_avx1_done_enc:
vmovdqu xmm10, OWORD PTR [rsp+64]
vmovdqu xmm11, OWORD PTR [rsp+80]
add rsp, 96
- pop rbx
+ pop r14
+ pop r13
+ pop r12
ret
AES_CTR_encrypt_avx1 ENDP
_TEXT ENDS
@@ -2063,172 +2099,174 @@ ENDIF
IFDEF HAVE_INTEL_VAES
_TEXT SEGMENT READONLY PARA
AES_ECB_encrypt_vaes PROC
- mov eax, DWORD PTR [rsp+40]
+ push r12
+ push r13
+ mov eax, DWORD PTR [rsp+56]
sub rsp, 32
vmovdqu OWORD PTR [rsp], xmm6
vmovdqu OWORD PTR [rsp+16], xmm7
- xor eax, eax
+ xor r10d, r10d
cmp r8d, 128
- mov r9d, r8d
+ mov r11d, r8d
jl L_AES_ECB_encrypt_vaes_done_128
- and r9d, 4294967168
+ and r11d, 4294967168
L_AES_ECB_encrypt_vaes_enc_128:
; 128 bytes of input
; aes_ecb_enc_128
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu ymm0, YMMWORD PTR [r10]
- vmovdqu ymm1, YMMWORD PTR [r10+32]
- vmovdqu ymm2, YMMWORD PTR [r10+64]
- vmovdqu ymm3, YMMWORD PTR [r10+96]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ vmovdqu ymm0, YMMWORD PTR [r12]
+ vmovdqu ymm1, YMMWORD PTR [r12+32]
+ vmovdqu ymm2, YMMWORD PTR [r12+64]
+ vmovdqu ymm3, YMMWORD PTR [r12+96]
; aes_enc_block
- vbroadcasti128 ymm7, [r9]
+ vbroadcasti128 ymm7, OWORD PTR [r9]
vpxor ymm0, ymm0, ymm7
vpxor ymm1, ymm1, ymm7
vpxor ymm2, ymm2, ymm7
vpxor ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+16]
+ vbroadcasti128 ymm7, OWORD PTR [r9+16]
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+32]
+ vbroadcasti128 ymm7, OWORD PTR [r9+32]
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+48]
+ vbroadcasti128 ymm7, OWORD PTR [r9+48]
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+64]
+ vbroadcasti128 ymm7, OWORD PTR [r9+64]
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+80]
+ vbroadcasti128 ymm7, OWORD PTR [r9+80]
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+96]
+ vbroadcasti128 ymm7, OWORD PTR [r9+96]
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+112]
+ vbroadcasti128 ymm7, OWORD PTR [r9+112]
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+128]
+ vbroadcasti128 ymm7, OWORD PTR [r9+128]
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+144]
+ vbroadcasti128 ymm7, OWORD PTR [r9+144]
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
cmp eax, 11
- vbroadcasti128 ymm7, [r9+160]
+ vbroadcasti128 ymm7, OWORD PTR [r9+160]
jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+176]
+ vbroadcasti128 ymm7, OWORD PTR [r9+176]
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
cmp eax, 13
- vbroadcasti128 ymm7, [r9+192]
+ vbroadcasti128 ymm7, OWORD PTR [r9+192]
jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+208]
+ vbroadcasti128 ymm7, OWORD PTR [r9+208]
vaesenc ymm0, ymm0, ymm7
vaesenc ymm1, ymm1, ymm7
vaesenc ymm2, ymm2, ymm7
vaesenc ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+224]
+ vbroadcasti128 ymm7, OWORD PTR [r9+224]
L_AES_ECB_encrypt_vaes_128_aes_enc_block_last:
vaesenclast ymm0, ymm0, ymm7
vaesenclast ymm1, ymm1, ymm7
vaesenclast ymm2, ymm2, ymm7
vaesenclast ymm3, ymm3, ymm7
- vmovdqu YMMWORD PTR [r11], ymm0
- vmovdqu YMMWORD PTR [r11+32], ymm1
- vmovdqu YMMWORD PTR [r11+64], ymm2
- vmovdqu YMMWORD PTR [r11+96], ymm3
- add eax, 128
- cmp eax, r9d
+ vmovdqu YMMWORD PTR [r13], ymm0
+ vmovdqu YMMWORD PTR [r13+32], ymm1
+ vmovdqu YMMWORD PTR [r13+64], ymm2
+ vmovdqu YMMWORD PTR [r13+96], ymm3
+ add r10d, 128
+ cmp r10d, r11d
jl L_AES_ECB_encrypt_vaes_enc_128
L_AES_ECB_encrypt_vaes_done_128:
- mov r9d, r8d
- and r9d, 4294967264
- cmp eax, r9d
+ mov r11d, r8d
+ and r11d, 4294967264
+ cmp r10d, r11d
je L_AES_ECB_encrypt_vaes_done_32
L_AES_ECB_encrypt_vaes_enc_32:
; 32 bytes of input
; aes_ecb_enc_32
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu ymm0, YMMWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ vmovdqu ymm0, YMMWORD PTR [r12]
; aes_enc_block
- vbroadcasti128 ymm7, [r9]
+ vbroadcasti128 ymm7, OWORD PTR [r9]
vpxor ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+16]
+ vbroadcasti128 ymm7, OWORD PTR [r9+16]
vaesenc ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+32]
+ vbroadcasti128 ymm7, OWORD PTR [r9+32]
vaesenc ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+48]
+ vbroadcasti128 ymm7, OWORD PTR [r9+48]
vaesenc ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+64]
+ vbroadcasti128 ymm7, OWORD PTR [r9+64]
vaesenc ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+80]
+ vbroadcasti128 ymm7, OWORD PTR [r9+80]
vaesenc ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+96]
+ vbroadcasti128 ymm7, OWORD PTR [r9+96]
vaesenc ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+112]
+ vbroadcasti128 ymm7, OWORD PTR [r9+112]
vaesenc ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+128]
+ vbroadcasti128 ymm7, OWORD PTR [r9+128]
vaesenc ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+144]
+ vbroadcasti128 ymm7, OWORD PTR [r9+144]
vaesenc ymm0, ymm0, ymm7
cmp eax, 11
- vbroadcasti128 ymm7, [r9+160]
+ vbroadcasti128 ymm7, OWORD PTR [r9+160]
jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last
vaesenc ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+176]
+ vbroadcasti128 ymm7, OWORD PTR [r9+176]
vaesenc ymm0, ymm0, ymm7
cmp eax, 13
- vbroadcasti128 ymm7, [r9+192]
+ vbroadcasti128 ymm7, OWORD PTR [r9+192]
jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last
vaesenc ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+208]
+ vbroadcasti128 ymm7, OWORD PTR [r9+208]
vaesenc ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+224]
+ vbroadcasti128 ymm7, OWORD PTR [r9+224]
L_AES_ECB_encrypt_vaes_32_aes_enc_block_last:
vaesenclast ymm0, ymm0, ymm7
- vmovdqu YMMWORD PTR [r11], ymm0
- add eax, 32
- cmp eax, r9d
+ vmovdqu YMMWORD PTR [r13], ymm0
+ add r10d, 32
+ cmp r10d, r11d
jl L_AES_ECB_encrypt_vaes_enc_32
L_AES_ECB_encrypt_vaes_done_32:
- cmp eax, r8d
- mov r9d, r8d
+ cmp r10d, r8d
+ mov r11d, r8d
je L_AES_ECB_encrypt_vaes_done_enc
- and r9d, 4294967280
+ and r11d, 4294967280
L_AES_ECB_encrypt_vaes_enc_16:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- vmovdqu xmm0, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ vmovdqu xmm0, OWORD PTR [r12]
; aes_enc_block
vpxor xmm0, xmm0, [r9]
vmovdqu xmm5, OWORD PTR [r9+16]
@@ -2264,186 +2302,190 @@ L_AES_ECB_encrypt_vaes_enc_16:
vmovdqu xmm5, OWORD PTR [r9+224]
L_AES_ECB_encrypt_vaes_16_aes_enc_block_last:
vaesenclast xmm0, xmm0, xmm5
- lea r10, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r10], xmm0
- add eax, 16
- cmp eax, r9d
+ lea r12, QWORD PTR [rdx+r10]
+ vmovdqu OWORD PTR [r12], xmm0
+ add r10d, 16
+ cmp r10d, r11d
jl L_AES_ECB_encrypt_vaes_enc_16
L_AES_ECB_encrypt_vaes_done_enc:
vmovdqu xmm6, OWORD PTR [rsp]
vmovdqu xmm7, OWORD PTR [rsp+16]
add rsp, 32
+ pop r13
+ pop r12
ret
AES_ECB_encrypt_vaes ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_ECB_decrypt_vaes PROC
- mov eax, DWORD PTR [rsp+40]
+ push r12
+ push r13
+ mov eax, DWORD PTR [rsp+56]
sub rsp, 32
vmovdqu OWORD PTR [rsp], xmm6
vmovdqu OWORD PTR [rsp+16], xmm7
- xor eax, eax
+ xor r10d, r10d
cmp r8d, 128
- mov r9d, r8d
+ mov r11d, r8d
jl L_AES_ECB_decrypt_vaes_done_128
- and r9d, 4294967168
+ and r11d, 4294967168
L_AES_ECB_decrypt_vaes_dec_128:
; 128 bytes of input
; aes_ecb_dec_128
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu ymm0, YMMWORD PTR [r10]
- vmovdqu ymm1, YMMWORD PTR [r10+32]
- vmovdqu ymm2, YMMWORD PTR [r10+64]
- vmovdqu ymm3, YMMWORD PTR [r10+96]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ vmovdqu ymm0, YMMWORD PTR [r12]
+ vmovdqu ymm1, YMMWORD PTR [r12+32]
+ vmovdqu ymm2, YMMWORD PTR [r12+64]
+ vmovdqu ymm3, YMMWORD PTR [r12+96]
; aes_dec_block
- vbroadcasti128 ymm7, [r9]
+ vbroadcasti128 ymm7, OWORD PTR [r9]
vpxor ymm0, ymm0, ymm7
vpxor ymm1, ymm1, ymm7
vpxor ymm2, ymm2, ymm7
vpxor ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+16]
+ vbroadcasti128 ymm7, OWORD PTR [r9+16]
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+32]
+ vbroadcasti128 ymm7, OWORD PTR [r9+32]
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+48]
+ vbroadcasti128 ymm7, OWORD PTR [r9+48]
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+64]
+ vbroadcasti128 ymm7, OWORD PTR [r9+64]
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+80]
+ vbroadcasti128 ymm7, OWORD PTR [r9+80]
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+96]
+ vbroadcasti128 ymm7, OWORD PTR [r9+96]
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+112]
+ vbroadcasti128 ymm7, OWORD PTR [r9+112]
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+128]
+ vbroadcasti128 ymm7, OWORD PTR [r9+128]
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+144]
+ vbroadcasti128 ymm7, OWORD PTR [r9+144]
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
cmp eax, 11
- vbroadcasti128 ymm7, [r9+160]
+ vbroadcasti128 ymm7, OWORD PTR [r9+160]
jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+176]
+ vbroadcasti128 ymm7, OWORD PTR [r9+176]
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
cmp eax, 13
- vbroadcasti128 ymm7, [r9+192]
+ vbroadcasti128 ymm7, OWORD PTR [r9+192]
jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+208]
+ vbroadcasti128 ymm7, OWORD PTR [r9+208]
vaesdec ymm0, ymm0, ymm7
vaesdec ymm1, ymm1, ymm7
vaesdec ymm2, ymm2, ymm7
vaesdec ymm3, ymm3, ymm7
- vbroadcasti128 ymm7, [r9+224]
+ vbroadcasti128 ymm7, OWORD PTR [r9+224]
L_AES_ECB_decrypt_vaes_128_aes_dec_block_last:
vaesdeclast ymm0, ymm0, ymm7
vaesdeclast ymm1, ymm1, ymm7
vaesdeclast ymm2, ymm2, ymm7
vaesdeclast ymm3, ymm3, ymm7
- vmovdqu YMMWORD PTR [r11], ymm0
- vmovdqu YMMWORD PTR [r11+32], ymm1
- vmovdqu YMMWORD PTR [r11+64], ymm2
- vmovdqu YMMWORD PTR [r11+96], ymm3
- add eax, 128
- cmp eax, r9d
+ vmovdqu YMMWORD PTR [r13], ymm0
+ vmovdqu YMMWORD PTR [r13+32], ymm1
+ vmovdqu YMMWORD PTR [r13+64], ymm2
+ vmovdqu YMMWORD PTR [r13+96], ymm3
+ add r10d, 128
+ cmp r10d, r11d
jl L_AES_ECB_decrypt_vaes_dec_128
L_AES_ECB_decrypt_vaes_done_128:
- mov r9d, r8d
- and r9d, 4294967264
- cmp eax, r9d
+ mov r11d, r8d
+ and r11d, 4294967264
+ cmp r10d, r11d
je L_AES_ECB_decrypt_vaes_done_32
L_AES_ECB_decrypt_vaes_dec_32:
; 32 bytes of input
; aes_ecb_dec_32
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu ymm0, YMMWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ vmovdqu ymm0, YMMWORD PTR [r12]
; aes_dec_block
- vbroadcasti128 ymm7, [r9]
+ vbroadcasti128 ymm7, OWORD PTR [r9]
vpxor ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+16]
+ vbroadcasti128 ymm7, OWORD PTR [r9+16]
vaesdec ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+32]
+ vbroadcasti128 ymm7, OWORD PTR [r9+32]
vaesdec ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+48]
+ vbroadcasti128 ymm7, OWORD PTR [r9+48]
vaesdec ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+64]
+ vbroadcasti128 ymm7, OWORD PTR [r9+64]
vaesdec ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+80]
+ vbroadcasti128 ymm7, OWORD PTR [r9+80]
vaesdec ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+96]
+ vbroadcasti128 ymm7, OWORD PTR [r9+96]
vaesdec ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+112]
+ vbroadcasti128 ymm7, OWORD PTR [r9+112]
vaesdec ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+128]
+ vbroadcasti128 ymm7, OWORD PTR [r9+128]
vaesdec ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+144]
+ vbroadcasti128 ymm7, OWORD PTR [r9+144]
vaesdec ymm0, ymm0, ymm7
cmp eax, 11
- vbroadcasti128 ymm7, [r9+160]
+ vbroadcasti128 ymm7, OWORD PTR [r9+160]
jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last
vaesdec ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+176]
+ vbroadcasti128 ymm7, OWORD PTR [r9+176]
vaesdec ymm0, ymm0, ymm7
cmp eax, 13
- vbroadcasti128 ymm7, [r9+192]
+ vbroadcasti128 ymm7, OWORD PTR [r9+192]
jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last
vaesdec ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+208]
+ vbroadcasti128 ymm7, OWORD PTR [r9+208]
vaesdec ymm0, ymm0, ymm7
- vbroadcasti128 ymm7, [r9+224]
+ vbroadcasti128 ymm7, OWORD PTR [r9+224]
L_AES_ECB_decrypt_vaes_32_aes_dec_block_last:
vaesdeclast ymm0, ymm0, ymm7
- vmovdqu YMMWORD PTR [r11], ymm0
- add eax, 32
- cmp eax, r9d
+ vmovdqu YMMWORD PTR [r13], ymm0
+ add r10d, 32
+ cmp r10d, r11d
jl L_AES_ECB_decrypt_vaes_dec_32
L_AES_ECB_decrypt_vaes_done_32:
- cmp eax, r8d
- mov r9d, r8d
+ cmp r10d, r8d
+ mov r11d, r8d
je L_AES_ECB_decrypt_vaes_done_dec
- and r9d, 4294967280
+ and r11d, 4294967280
L_AES_ECB_decrypt_vaes_dec_16:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- vmovdqu xmm0, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ vmovdqu xmm0, OWORD PTR [r12]
; aes_dec_block
vpxor xmm0, xmm0, [r9]
vmovdqu xmm5, OWORD PTR [r9+16]
@@ -2479,30 +2521,34 @@ L_AES_ECB_decrypt_vaes_dec_16:
vmovdqu xmm5, OWORD PTR [r9+224]
L_AES_ECB_decrypt_vaes_16_aes_dec_block_last:
vaesdeclast xmm0, xmm0, xmm5
- lea r10, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r10], xmm0
- add eax, 16
- cmp eax, r9d
+ lea r12, QWORD PTR [rdx+r10]
+ vmovdqu OWORD PTR [r12], xmm0
+ add r10d, 16
+ cmp r10d, r11d
jl L_AES_ECB_decrypt_vaes_dec_16
L_AES_ECB_decrypt_vaes_done_dec:
vmovdqu xmm6, OWORD PTR [rsp]
vmovdqu xmm7, OWORD PTR [rsp+16]
add rsp, 32
+ pop r13
+ pop r12
ret
AES_ECB_decrypt_vaes ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_CBC_encrypt_vaes PROC
- mov rax, QWORD PTR [rsp+40]
- mov r10d, DWORD PTR [rsp+48]
+ push r12
+ push r13
+ mov rax, QWORD PTR [rsp+56]
+ mov r10d, DWORD PTR [rsp+64]
vmovdqu xmm0, OWORD PTR [r8]
- xor eax, eax
- cmp eax, r9d
+ xor r11d, r11d
+ cmp r11d, r9d
je L_AES_CBC_encrypt_vaes_done
L_AES_CBC_encrypt_vaes_loop:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- vmovdqu xmm1, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r11]
+ vmovdqu xmm1, OWORD PTR [r12]
vpxor xmm1, xmm1, xmm0
; aes_enc_block
vpxor xmm1, xmm1, [rax]
@@ -2539,22 +2585,26 @@ L_AES_CBC_encrypt_vaes_loop:
vmovdqu xmm3, OWORD PTR [rax+224]
L_AES_CBC_encrypt_vaes_aes_enc_block_last:
vaesenclast xmm1, xmm1, xmm3
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r11], xmm1
+ lea r13, QWORD PTR [rdx+r11]
+ vmovdqu OWORD PTR [r13], xmm1
vmovdqa xmm0, xmm1
- add eax, 16
- cmp eax, r9d
+ add r11d, 16
+ cmp r11d, r9d
jl L_AES_CBC_encrypt_vaes_loop
L_AES_CBC_encrypt_vaes_done:
vmovdqu OWORD PTR [r8], xmm0
+ pop r13
+ pop r12
ret
AES_CBC_encrypt_vaes ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_CBC_decrypt_vaes PROC
push r12
- mov rax, QWORD PTR [rsp+48]
- mov r10d, DWORD PTR [rsp+56]
+ push r13
+ push r14
+ mov rax, QWORD PTR [rsp+64]
+ mov r10d, DWORD PTR [rsp+72]
sub rsp, 128
vmovdqu OWORD PTR [rsp], xmm6
vmovdqu OWORD PTR [rsp+16], xmm7
@@ -2565,101 +2615,101 @@ AES_CBC_decrypt_vaes PROC
vmovdqu OWORD PTR [rsp+96], xmm12
vmovdqu OWORD PTR [rsp+112], xmm13
vmovdqu xmm8, OWORD PTR [r8]
- xor eax, eax
+ xor r11d, r11d
cmp r9d, 128
- mov r10d, r9d
+ mov r12d, r9d
jl L_AES_CBC_decrypt_vaes_done_128
- and r10d, 4294967168
+ and r12d, 4294967168
L_AES_CBC_decrypt_vaes_dec_128:
; 128 bytes of input
; aes_cbc_dec_128
- lea r11, QWORD PTR [rcx+rax]
- lea r12, QWORD PTR [rdx+rax]
- vmovdqu ymm0, YMMWORD PTR [r11]
- vmovdqu ymm1, YMMWORD PTR [r11+32]
- vmovdqu ymm2, YMMWORD PTR [r11+64]
- vmovdqu ymm3, YMMWORD PTR [r11+96]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
+ vmovdqu ymm0, YMMWORD PTR [r13]
+ vmovdqu ymm1, YMMWORD PTR [r13+32]
+ vmovdqu ymm2, YMMWORD PTR [r13+64]
+ vmovdqu ymm3, YMMWORD PTR [r13+96]
vinserti128 ymm10, ymm8, xmm0, 1
- vmovdqu ymm11, YMMWORD PTR [r11+16]
- vmovdqu ymm12, YMMWORD PTR [r11+48]
- vmovdqu ymm13, YMMWORD PTR [r11+80]
+ vmovdqu ymm11, YMMWORD PTR [r13+16]
+ vmovdqu ymm12, YMMWORD PTR [r13+48]
+ vmovdqu ymm13, YMMWORD PTR [r13+80]
vextracti128 xmm8, ymm3, 1
; aes_dec_block
- vbroadcasti128 ymm9, [rax]
+ vbroadcasti128 ymm9, OWORD PTR [rax]
vpxor ymm0, ymm0, ymm9
vpxor ymm1, ymm1, ymm9
vpxor ymm2, ymm2, ymm9
vpxor ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+16]
+ vbroadcasti128 ymm9, OWORD PTR [rax+16]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+32]
+ vbroadcasti128 ymm9, OWORD PTR [rax+32]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+48]
+ vbroadcasti128 ymm9, OWORD PTR [rax+48]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+64]
+ vbroadcasti128 ymm9, OWORD PTR [rax+64]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+80]
+ vbroadcasti128 ymm9, OWORD PTR [rax+80]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+96]
+ vbroadcasti128 ymm9, OWORD PTR [rax+96]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+112]
+ vbroadcasti128 ymm9, OWORD PTR [rax+112]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+128]
+ vbroadcasti128 ymm9, OWORD PTR [rax+128]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+144]
+ vbroadcasti128 ymm9, OWORD PTR [rax+144]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
cmp r10d, 11
- vbroadcasti128 ymm9, [rax+160]
+ vbroadcasti128 ymm9, OWORD PTR [rax+160]
jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+176]
+ vbroadcasti128 ymm9, OWORD PTR [rax+176]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
cmp r10d, 13
- vbroadcasti128 ymm9, [rax+192]
+ vbroadcasti128 ymm9, OWORD PTR [rax+192]
jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+208]
+ vbroadcasti128 ymm9, OWORD PTR [rax+208]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [rax+224]
+ vbroadcasti128 ymm9, OWORD PTR [rax+224]
L_AES_CBC_decrypt_vaes_128_aes_dec_block_last:
vaesdeclast ymm0, ymm0, ymm9
vaesdeclast ymm1, ymm1, ymm9
@@ -2669,76 +2719,76 @@ L_AES_CBC_decrypt_vaes_128_aes_dec_block_last:
vpxor ymm1, ymm1, ymm11
vpxor ymm2, ymm2, ymm12
vpxor ymm3, ymm3, ymm13
- vmovdqu YMMWORD PTR [r12], ymm0
- vmovdqu YMMWORD PTR [r12+32], ymm1
- vmovdqu YMMWORD PTR [r12+64], ymm2
- vmovdqu YMMWORD PTR [r12+96], ymm3
- add eax, 128
- cmp eax, r10d
+ vmovdqu YMMWORD PTR [r14], ymm0
+ vmovdqu YMMWORD PTR [r14+32], ymm1
+ vmovdqu YMMWORD PTR [r14+64], ymm2
+ vmovdqu YMMWORD PTR [r14+96], ymm3
+ add r11d, 128
+ cmp r11d, r12d
jl L_AES_CBC_decrypt_vaes_dec_128
L_AES_CBC_decrypt_vaes_done_128:
- mov r10d, r9d
- and r10d, 4294967264
- cmp eax, r10d
+ mov r12d, r9d
+ and r12d, 4294967264
+ cmp r11d, r12d
je L_AES_CBC_decrypt_vaes_done_32
L_AES_CBC_decrypt_vaes_dec_32:
; 32 bytes of input
; aes_cbc_dec_32
- lea r11, QWORD PTR [rcx+rax]
- lea r12, QWORD PTR [rdx+rax]
- vmovdqu ymm0, YMMWORD PTR [r11]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
+ vmovdqu ymm0, YMMWORD PTR [r13]
vinserti128 ymm10, ymm8, xmm0, 1
vextracti128 xmm8, ymm0, 1
; aes_dec_block
- vbroadcasti128 ymm9, [rax]
+ vbroadcasti128 ymm9, OWORD PTR [rax]
vpxor ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+16]
+ vbroadcasti128 ymm9, OWORD PTR [rax+16]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+32]
+ vbroadcasti128 ymm9, OWORD PTR [rax+32]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+48]
+ vbroadcasti128 ymm9, OWORD PTR [rax+48]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+64]
+ vbroadcasti128 ymm9, OWORD PTR [rax+64]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+80]
+ vbroadcasti128 ymm9, OWORD PTR [rax+80]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+96]
+ vbroadcasti128 ymm9, OWORD PTR [rax+96]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+112]
+ vbroadcasti128 ymm9, OWORD PTR [rax+112]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+128]
+ vbroadcasti128 ymm9, OWORD PTR [rax+128]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+144]
+ vbroadcasti128 ymm9, OWORD PTR [rax+144]
vaesdec ymm0, ymm0, ymm9
cmp r10d, 11
- vbroadcasti128 ymm9, [rax+160]
+ vbroadcasti128 ymm9, OWORD PTR [rax+160]
jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+176]
+ vbroadcasti128 ymm9, OWORD PTR [rax+176]
vaesdec ymm0, ymm0, ymm9
cmp r10d, 13
- vbroadcasti128 ymm9, [rax+192]
+ vbroadcasti128 ymm9, OWORD PTR [rax+192]
jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+208]
+ vbroadcasti128 ymm9, OWORD PTR [rax+208]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [rax+224]
+ vbroadcasti128 ymm9, OWORD PTR [rax+224]
L_AES_CBC_decrypt_vaes_32_aes_dec_block_last:
vaesdeclast ymm0, ymm0, ymm9
vpxor ymm0, ymm0, ymm10
- vmovdqu YMMWORD PTR [r12], ymm0
- add eax, 32
- cmp eax, r10d
+ vmovdqu YMMWORD PTR [r14], ymm0
+ add r11d, 32
+ cmp r11d, r12d
jl L_AES_CBC_decrypt_vaes_dec_32
L_AES_CBC_decrypt_vaes_done_32:
- cmp eax, r9d
- mov r10d, r9d
+ cmp r11d, r9d
+ mov r12d, r9d
je L_AES_CBC_decrypt_vaes_done_dec
- and r10d, 4294967280
+ and r12d, 4294967280
L_AES_CBC_decrypt_vaes_dec_16:
; 16 bytes of input
- lea r11, QWORD PTR [rcx+rax]
- vmovdqu xmm0, OWORD PTR [r11]
+ lea r13, QWORD PTR [rcx+r11]
+ vmovdqu xmm0, OWORD PTR [r13]
vmovdqa xmm7, xmm0
; aes_dec_block
vpxor xmm0, xmm0, [rax]
@@ -2777,10 +2827,10 @@ L_AES_CBC_decrypt_vaes_16_aes_dec_block_last:
vaesdeclast xmm0, xmm0, xmm5
vpxor xmm0, xmm0, xmm8
vmovdqa xmm8, xmm7
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r11], xmm0
- add eax, 16
- cmp eax, r10d
+ lea r13, QWORD PTR [rdx+r11]
+ vmovdqu OWORD PTR [r13], xmm0
+ add r11d, 16
+ cmp r11d, r12d
jl L_AES_CBC_decrypt_vaes_dec_16
L_AES_CBC_decrypt_vaes_done_dec:
vmovdqu OWORD PTR [r8], xmm8
@@ -2793,43 +2843,45 @@ L_AES_CBC_decrypt_vaes_done_dec:
vmovdqu xmm12, OWORD PTR [rsp+96]
vmovdqu xmm13, OWORD PTR [rsp+112]
add rsp, 128
+ pop r14
+ pop r13
pop r12
ret
AES_CBC_decrypt_vaes ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_ctr_bswap_vaes QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_aes_ctr_bswap_vaes QWORD 08090a0b0c0d0e0fh, 0001020304050607h
ptr_L_aes_ctr_bswap_vaes QWORD L_aes_ctr_bswap_vaes
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_ctr_inc_vaes QWORD \
- 0000000000000000h, 0000000000000000h,
- 0000000000000001h, 0000000000000000h,
- 0000000000000002h, 0000000000000000h,
- 0000000000000003h, 0000000000000000h,
- 0000000000000004h, 0000000000000000h,
- 0000000000000005h, 0000000000000000h,
- 0000000000000006h, 0000000000000000h,
- 0000000000000007h, 0000000000000000h,
- 0000000000000008h, 0000000000000000h,
- 0000000000000009h, 0000000000000000h,
- 000000000000000ah, 0000000000000000h,
- 000000000000000bh, 0000000000000000h,
- 000000000000000ch, 0000000000000000h,
- 000000000000000dh, 0000000000000000h,
- 000000000000000eh, 0000000000000000h,
- 000000000000000fh, 0000000000000000h,
- 0000000000000010h, 0000000000000000h
+L_aes_ctr_inc_vaes QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000000000001h, 0000000000000000h
+ QWORD 0000000000000002h, 0000000000000000h
+ QWORD 0000000000000003h, 0000000000000000h
+ QWORD 0000000000000004h, 0000000000000000h
+ QWORD 0000000000000005h, 0000000000000000h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000000000008h, 0000000000000000h
+ QWORD 0000000000000009h, 0000000000000000h
+ QWORD 000000000000000ah, 0000000000000000h
+ QWORD 000000000000000bh, 0000000000000000h
+ QWORD 000000000000000ch, 0000000000000000h
+ QWORD 000000000000000dh, 0000000000000000h
+ QWORD 000000000000000eh, 0000000000000000h
+ QWORD 000000000000000fh, 0000000000000000h
+ QWORD 0000000000000010h, 0000000000000000h
ptr_L_aes_ctr_inc_vaes QWORD L_aes_ctr_inc_vaes
_DATA ENDS
_TEXT SEGMENT READONLY PARA
AES_CTR_encrypt_vaes PROC
- push rbx
- mov eax, DWORD PTR [rsp+48]
- mov r10, QWORD PTR [rsp+56]
+ push r12
+ push r13
+ push r14
+ mov eax, DWORD PTR [rsp+64]
+ mov r10, QWORD PTR [rsp+72]
sub rsp, 144
vmovdqu OWORD PTR [rsp], xmm6
vmovdqu OWORD PTR [rsp+16], xmm7
@@ -2841,16 +2893,16 @@ AES_CTR_encrypt_vaes PROC
vmovdqu OWORD PTR [rsp+112], xmm13
vmovdqu OWORD PTR [rsp+128], xmm14
vbroadcasti128 ymm8, ptr_L_aes_ctr_bswap_vaes
- vbroadcasti128 ymm7, [r10]
+ vbroadcasti128 ymm7, OWORD PTR [r10]
vpshufb ymm7, ymm7, ymm8
- vbroadcasti128 ymm10, [ptr_L_aes_ctr_inc_vaes+128]
- vbroadcasti128 ymm11, [ptr_L_aes_ctr_inc_vaes+32]
- vbroadcasti128 ymm12, [ptr_L_aes_ctr_inc_vaes+16]
- xor eax, eax
+ vbroadcasti128 ymm10, OWORD PTR [ptr_L_aes_ctr_inc_vaes+128]
+ vbroadcasti128 ymm11, OWORD PTR [ptr_L_aes_ctr_inc_vaes+32]
+ vbroadcasti128 ymm12, OWORD PTR [ptr_L_aes_ctr_inc_vaes+16]
+ xor r11d, r11d
cmp r8d, 128
- mov r10d, r8d
+ mov r12d, r8d
jl L_AES_CTR_encrypt_vaes_done_128
- and r10d, 4294967168
+ and r12d, 4294967168
vmovdqa ymm9, ymm7
vpaddq ymm4, ymm7, [ptr_L_aes_ctr_inc_vaes]
vpand ymm14, ymm9, [ptr_L_aes_ctr_inc_vaes]
@@ -2889,8 +2941,8 @@ AES_CTR_encrypt_vaes PROC
vpaddq ymm7, ymm7, ymm9
L_AES_CTR_encrypt_vaes_enc_128:
; 128 bytes of input
- lea r11, QWORD PTR [rcx+rax]
- lea rbx, QWORD PTR [rdx+rax]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
vpshufb ymm0, ymm4, ymm8
vpshufb ymm1, ymm5, ymm8
vpshufb ymm2, ymm6, ymm8
@@ -2932,108 +2984,108 @@ L_AES_CTR_encrypt_vaes_enc_128:
vpslldq ymm9, ymm9, 8
vpaddq ymm7, ymm7, ymm9
; aes_enc_block
- vbroadcasti128 ymm13, [r9]
+ vbroadcasti128 ymm13, OWORD PTR [r9]
vpxor ymm0, ymm0, ymm13
vpxor ymm1, ymm1, ymm13
vpxor ymm2, ymm2, ymm13
vpxor ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+16]
+ vbroadcasti128 ymm13, OWORD PTR [r9+16]
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+32]
+ vbroadcasti128 ymm13, OWORD PTR [r9+32]
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+48]
+ vbroadcasti128 ymm13, OWORD PTR [r9+48]
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+64]
+ vbroadcasti128 ymm13, OWORD PTR [r9+64]
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+80]
+ vbroadcasti128 ymm13, OWORD PTR [r9+80]
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+96]
+ vbroadcasti128 ymm13, OWORD PTR [r9+96]
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+112]
+ vbroadcasti128 ymm13, OWORD PTR [r9+112]
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+128]
+ vbroadcasti128 ymm13, OWORD PTR [r9+128]
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+144]
+ vbroadcasti128 ymm13, OWORD PTR [r9+144]
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
cmp eax, 11
- vbroadcasti128 ymm13, [r9+160]
+ vbroadcasti128 ymm13, OWORD PTR [r9+160]
jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+176]
+ vbroadcasti128 ymm13, OWORD PTR [r9+176]
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
cmp eax, 13
- vbroadcasti128 ymm13, [r9+192]
+ vbroadcasti128 ymm13, OWORD PTR [r9+192]
jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+208]
+ vbroadcasti128 ymm13, OWORD PTR [r9+208]
vaesenc ymm0, ymm0, ymm13
vaesenc ymm1, ymm1, ymm13
vaesenc ymm2, ymm2, ymm13
vaesenc ymm3, ymm3, ymm13
- vbroadcasti128 ymm13, [r9+224]
+ vbroadcasti128 ymm13, OWORD PTR [r9+224]
L_AES_CTR_encrypt_vaes_128_aes_enc_block_last:
vaesenclast ymm0, ymm0, ymm13
vaesenclast ymm1, ymm1, ymm13
vaesenclast ymm2, ymm2, ymm13
vaesenclast ymm3, ymm3, ymm13
- vpxor ymm0, ymm0, [r11]
- vpxor ymm1, ymm1, [r11+32]
- vpxor ymm2, ymm2, [r11+64]
- vpxor ymm3, ymm3, [r11+96]
- vmovdqu YMMWORD PTR [rbx], ymm0
- vmovdqu YMMWORD PTR [rbx+32], ymm1
- vmovdqu YMMWORD PTR [rbx+64], ymm2
- vmovdqu YMMWORD PTR [rbx+96], ymm3
- add eax, 128
- cmp eax, r10d
+ vpxor ymm0, ymm0, [r13]
+ vpxor ymm1, ymm1, [r13+32]
+ vpxor ymm2, ymm2, [r13+64]
+ vpxor ymm3, ymm3, [r13+96]
+ vmovdqu YMMWORD PTR [r14], ymm0
+ vmovdqu YMMWORD PTR [r14+32], ymm1
+ vmovdqu YMMWORD PTR [r14+64], ymm2
+ vmovdqu YMMWORD PTR [r14+96], ymm3
+ add r11d, 128
+ cmp r11d, r12d
jl L_AES_CTR_encrypt_vaes_enc_128
vperm2i128 ymm7, ymm4, ymm4, 0
L_AES_CTR_encrypt_vaes_done_128:
- mov r10d, r8d
- and r10d, 4294967264
- cmp eax, r10d
+ mov r12d, r8d
+ and r12d, 4294967264
+ cmp r11d, r12d
je L_AES_CTR_encrypt_vaes_done_32
L_AES_CTR_encrypt_vaes_enc_32:
; 32 bytes of input
; aes_ctr_enc_32
- lea r11, QWORD PTR [rcx+rax]
- lea rbx, QWORD PTR [rdx+rax]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
vpaddq ymm0, ymm7, [ptr_L_aes_ctr_inc_vaes]
vmovdqa ymm9, ymm7
vpand ymm14, ymm9, [ptr_L_aes_ctr_inc_vaes]
@@ -3054,51 +3106,51 @@ L_AES_CTR_encrypt_vaes_enc_32:
vpslldq ymm9, ymm9, 8
vpaddq ymm7, ymm7, ymm9
; aes_enc_block
- vbroadcasti128 ymm13, [r9]
+ vbroadcasti128 ymm13, OWORD PTR [r9]
vpxor ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+16]
+ vbroadcasti128 ymm13, OWORD PTR [r9+16]
vaesenc ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+32]
+ vbroadcasti128 ymm13, OWORD PTR [r9+32]
vaesenc ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+48]
+ vbroadcasti128 ymm13, OWORD PTR [r9+48]
vaesenc ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+64]
+ vbroadcasti128 ymm13, OWORD PTR [r9+64]
vaesenc ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+80]
+ vbroadcasti128 ymm13, OWORD PTR [r9+80]
vaesenc ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+96]
+ vbroadcasti128 ymm13, OWORD PTR [r9+96]
vaesenc ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+112]
+ vbroadcasti128 ymm13, OWORD PTR [r9+112]
vaesenc ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+128]
+ vbroadcasti128 ymm13, OWORD PTR [r9+128]
vaesenc ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+144]
+ vbroadcasti128 ymm13, OWORD PTR [r9+144]
vaesenc ymm0, ymm0, ymm13
cmp eax, 11
- vbroadcasti128 ymm13, [r9+160]
+ vbroadcasti128 ymm13, OWORD PTR [r9+160]
jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last
vaesenc ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+176]
+ vbroadcasti128 ymm13, OWORD PTR [r9+176]
vaesenc ymm0, ymm0, ymm13
cmp eax, 13
- vbroadcasti128 ymm13, [r9+192]
+ vbroadcasti128 ymm13, OWORD PTR [r9+192]
jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last
vaesenc ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+208]
+ vbroadcasti128 ymm13, OWORD PTR [r9+208]
vaesenc ymm0, ymm0, ymm13
- vbroadcasti128 ymm13, [r9+224]
+ vbroadcasti128 ymm13, OWORD PTR [r9+224]
L_AES_CTR_encrypt_vaes_32_aes_enc_block_last:
vaesenclast ymm0, ymm0, ymm13
- vpxor ymm0, ymm0, [r11]
- vmovdqu YMMWORD PTR [rbx], ymm0
- add eax, 32
- cmp eax, r10d
+ vpxor ymm0, ymm0, [r13]
+ vmovdqu YMMWORD PTR [r14], ymm0
+ add r11d, 32
+ cmp r11d, r12d
jl L_AES_CTR_encrypt_vaes_enc_32
L_AES_CTR_encrypt_vaes_done_32:
- cmp eax, r8d
- mov r10d, r8d
+ cmp r11d, r8d
+ mov r12d, r8d
je L_AES_CTR_encrypt_vaes_done_enc
- and r10d, 4294967280
+ and r12d, 4294967280
L_AES_CTR_encrypt_vaes_enc_16:
; 16 bytes of input
vpshufb xmm0, xmm7, xmm8
@@ -3146,12 +3198,12 @@ L_AES_CTR_encrypt_vaes_enc_16:
vmovdqu xmm5, OWORD PTR [r9+224]
L_AES_CTR_encrypt_vaes_16_aes_enc_block_last:
vaesenclast xmm0, xmm0, xmm5
- lea r11, QWORD PTR [rcx+rax]
- vpxor xmm0, xmm0, [r11]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r11], xmm0
- add eax, 16
- cmp eax, r10d
+ lea r13, QWORD PTR [rcx+r11]
+ vpxor xmm0, xmm0, [r13]
+ lea r13, QWORD PTR [rdx+r11]
+ vmovdqu OWORD PTR [r13], xmm0
+ add r11d, 16
+ cmp r11d, r12d
jl L_AES_CTR_encrypt_vaes_enc_16
L_AES_CTR_encrypt_vaes_done_enc:
vpshufb xmm0, xmm7, xmm8
@@ -3166,7 +3218,9 @@ L_AES_CTR_encrypt_vaes_done_enc:
vmovdqu xmm13, OWORD PTR [rsp+112]
vmovdqu xmm14, OWORD PTR [rsp+128]
add rsp, 144
- pop rbx
+ pop r14
+ pop r13
+ pop r12
ret
AES_CTR_encrypt_vaes ENDP
_TEXT ENDS
@@ -3174,7 +3228,9 @@ ENDIF
IFDEF HAVE_INTEL_AVX512
_TEXT SEGMENT READONLY PARA
AES_ECB_encrypt_avx512 PROC
- mov eax, DWORD PTR [rsp+40]
+ push r12
+ push r13
+ mov eax, DWORD PTR [rsp+56]
sub rsp, 160
vmovdqu OWORD PTR [rsp], xmm6
vmovdqu OWORD PTR [rsp+16], xmm7
@@ -3186,42 +3242,42 @@ AES_ECB_encrypt_avx512 PROC
vmovdqu OWORD PTR [rsp+112], xmm13
vmovdqu OWORD PTR [rsp+128], xmm14
vmovdqu OWORD PTR [rsp+144], xmm15
- xor eax, eax
+ xor r10d, r10d
cmp r8d, 64
jl L_AES_ECB_encrypt_avx512_done_64
- vbroadcasti32x4 zmm8, [r9]
- vbroadcasti32x4 zmm9, [r9+16]
- vbroadcasti32x4 zmm10, [r9+32]
- vbroadcasti32x4 zmm11, [r9+48]
- vbroadcasti32x4 zmm12, [r9+64]
- vbroadcasti32x4 zmm13, [r9+80]
- vbroadcasti32x4 zmm14, [r9+96]
- vbroadcasti32x4 zmm15, [r9+112]
- vbroadcasti32x4 zmm16, [r9+128]
- vbroadcasti32x4 zmm17, [r9+144]
- vbroadcasti32x4 zmm18, [r9+160]
+ vbroadcasti32x4 zmm8, OWORD PTR [r9]
+ vbroadcasti32x4 zmm9, OWORD PTR [r9+16]
+ vbroadcasti32x4 zmm10, OWORD PTR [r9+32]
+ vbroadcasti32x4 zmm11, OWORD PTR [r9+48]
+ vbroadcasti32x4 zmm12, OWORD PTR [r9+64]
+ vbroadcasti32x4 zmm13, OWORD PTR [r9+80]
+ vbroadcasti32x4 zmm14, OWORD PTR [r9+96]
+ vbroadcasti32x4 zmm15, OWORD PTR [r9+112]
+ vbroadcasti32x4 zmm16, OWORD PTR [r9+128]
+ vbroadcasti32x4 zmm17, OWORD PTR [r9+144]
+ vbroadcasti32x4 zmm18, OWORD PTR [r9+160]
cmp eax, 11
jl L_AES_ECB_encrypt_avx512_key_cached
- vbroadcasti32x4 zmm19, [r9+176]
- vbroadcasti32x4 zmm20, [r9+192]
+ vbroadcasti32x4 zmm19, OWORD PTR [r9+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r9+192]
cmp eax, 13
jl L_AES_ECB_encrypt_avx512_key_cached
- vbroadcasti32x4 zmm21, [r9+208]
- vbroadcasti32x4 zmm22, [r9+224]
+ vbroadcasti32x4 zmm21, OWORD PTR [r9+208]
+ vbroadcasti32x4 zmm22, OWORD PTR [r9+224]
L_AES_ECB_encrypt_avx512_key_cached:
cmp r8d, 256
- mov r9d, r8d
+ mov r11d, r8d
jl L_AES_ECB_encrypt_avx512_done_256
- and r9d, 4294967040
+ and r11d, 4294967040
L_AES_ECB_encrypt_avx512_enc_256:
; 256 bytes of input
; aes_ecb_enc_256
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu64 zmm0, [r10]
- vmovdqu64 zmm1, [r10+64]
- vmovdqu64 zmm2, [r10+128]
- vmovdqu64 zmm3, [r10+192]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ vmovdqu64 zmm0, [r12]
+ vmovdqu64 zmm1, [r12+64]
+ vmovdqu64 zmm2, [r12+128]
+ vmovdqu64 zmm3, [r12+192]
; aes_enc_block
vpxorq zmm0, zmm0, zmm8
vpxorq zmm1, zmm1, zmm8
@@ -3291,24 +3347,24 @@ L_AES_ECB_encrypt_avx512_256_aes_enc_block_last:
vaesenclast zmm1, zmm1, zmm7
vaesenclast zmm2, zmm2, zmm7
vaesenclast zmm3, zmm3, zmm7
- vmovdqu64 [r11], zmm0
- vmovdqu64 [r11+64], zmm1
- vmovdqu64 [r11+128], zmm2
- vmovdqu64 [r11+192], zmm3
- add eax, 256
- cmp eax, r9d
+ vmovdqu64 [r13], zmm0
+ vmovdqu64 [r13+64], zmm1
+ vmovdqu64 [r13+128], zmm2
+ vmovdqu64 [r13+192], zmm3
+ add r10d, 256
+ cmp r10d, r11d
jl L_AES_ECB_encrypt_avx512_enc_256
L_AES_ECB_encrypt_avx512_done_256:
- mov r9d, r8d
- and r9d, 4294967232
- cmp eax, r9d
+ mov r11d, r8d
+ and r11d, 4294967232
+ cmp r10d, r11d
je L_AES_ECB_encrypt_avx512_done_64
L_AES_ECB_encrypt_avx512_enc_64:
; 64 bytes of input
; aes_ecb_enc_64
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu64 zmm0, [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ vmovdqu64 zmm0, [r12]
; aes_enc_block
vpxorq zmm0, zmm0, zmm8
vaesenc zmm0, zmm0, zmm9
@@ -3333,19 +3389,19 @@ L_AES_ECB_encrypt_avx512_enc_64:
vmovdqa64 zmm7, zmm22
L_AES_ECB_encrypt_avx512_64_aes_enc_block_last:
vaesenclast zmm0, zmm0, zmm7
- vmovdqu64 [r11], zmm0
- add eax, 64
- cmp eax, r9d
+ vmovdqu64 [r13], zmm0
+ add r10d, 64
+ cmp r10d, r11d
jl L_AES_ECB_encrypt_avx512_enc_64
L_AES_ECB_encrypt_avx512_done_64:
- cmp eax, r8d
- mov r9d, r8d
+ cmp r10d, r8d
+ mov r11d, r8d
je L_AES_ECB_encrypt_avx512_done_enc
- and r9d, 4294967280
+ and r11d, 4294967280
L_AES_ECB_encrypt_avx512_enc_16:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- vmovdqu xmm0, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ vmovdqu xmm0, OWORD PTR [r12]
; aes_enc_block
vpxor xmm0, xmm0, [r9]
vmovdqu xmm5, OWORD PTR [r9+16]
@@ -3381,10 +3437,10 @@ L_AES_ECB_encrypt_avx512_enc_16:
vmovdqu xmm5, OWORD PTR [r9+224]
L_AES_ECB_encrypt_avx512_16_aes_enc_block_last:
vaesenclast xmm0, xmm0, xmm5
- lea r10, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r10], xmm0
- add eax, 16
- cmp eax, r9d
+ lea r12, QWORD PTR [rdx+r10]
+ vmovdqu OWORD PTR [r12], xmm0
+ add r10d, 16
+ cmp r10d, r11d
jl L_AES_ECB_encrypt_avx512_enc_16
L_AES_ECB_encrypt_avx512_done_enc:
vmovdqu xmm6, OWORD PTR [rsp]
@@ -3398,12 +3454,16 @@ L_AES_ECB_encrypt_avx512_done_enc:
vmovdqu xmm14, OWORD PTR [rsp+128]
vmovdqu xmm15, OWORD PTR [rsp+144]
add rsp, 160
+ pop r13
+ pop r12
ret
AES_ECB_encrypt_avx512 ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_ECB_decrypt_avx512 PROC
- mov eax, DWORD PTR [rsp+40]
+ push r12
+ push r13
+ mov eax, DWORD PTR [rsp+56]
sub rsp, 160
vmovdqu OWORD PTR [rsp], xmm6
vmovdqu OWORD PTR [rsp+16], xmm7
@@ -3415,42 +3475,42 @@ AES_ECB_decrypt_avx512 PROC
vmovdqu OWORD PTR [rsp+112], xmm13
vmovdqu OWORD PTR [rsp+128], xmm14
vmovdqu OWORD PTR [rsp+144], xmm15
- xor eax, eax
+ xor r10d, r10d
cmp r8d, 64
jl L_AES_ECB_decrypt_avx512_done_64
- vbroadcasti32x4 zmm8, [r9]
- vbroadcasti32x4 zmm9, [r9+16]
- vbroadcasti32x4 zmm10, [r9+32]
- vbroadcasti32x4 zmm11, [r9+48]
- vbroadcasti32x4 zmm12, [r9+64]
- vbroadcasti32x4 zmm13, [r9+80]
- vbroadcasti32x4 zmm14, [r9+96]
- vbroadcasti32x4 zmm15, [r9+112]
- vbroadcasti32x4 zmm16, [r9+128]
- vbroadcasti32x4 zmm17, [r9+144]
- vbroadcasti32x4 zmm18, [r9+160]
+ vbroadcasti32x4 zmm8, OWORD PTR [r9]
+ vbroadcasti32x4 zmm9, OWORD PTR [r9+16]
+ vbroadcasti32x4 zmm10, OWORD PTR [r9+32]
+ vbroadcasti32x4 zmm11, OWORD PTR [r9+48]
+ vbroadcasti32x4 zmm12, OWORD PTR [r9+64]
+ vbroadcasti32x4 zmm13, OWORD PTR [r9+80]
+ vbroadcasti32x4 zmm14, OWORD PTR [r9+96]
+ vbroadcasti32x4 zmm15, OWORD PTR [r9+112]
+ vbroadcasti32x4 zmm16, OWORD PTR [r9+128]
+ vbroadcasti32x4 zmm17, OWORD PTR [r9+144]
+ vbroadcasti32x4 zmm18, OWORD PTR [r9+160]
cmp eax, 11
jl L_AES_ECB_decrypt_avx512_key_cached
- vbroadcasti32x4 zmm19, [r9+176]
- vbroadcasti32x4 zmm20, [r9+192]
+ vbroadcasti32x4 zmm19, OWORD PTR [r9+176]
+ vbroadcasti32x4 zmm20, OWORD PTR [r9+192]
cmp eax, 13
jl L_AES_ECB_decrypt_avx512_key_cached
- vbroadcasti32x4 zmm21, [r9+208]
- vbroadcasti32x4 zmm22, [r9+224]
+ vbroadcasti32x4 zmm21, OWORD PTR [r9+208]
+ vbroadcasti32x4 zmm22, OWORD PTR [r9+224]
L_AES_ECB_decrypt_avx512_key_cached:
cmp r8d, 256
- mov r9d, r8d
+ mov r11d, r8d
jl L_AES_ECB_decrypt_avx512_done_256
- and r9d, 4294967040
+ and r11d, 4294967040
L_AES_ECB_decrypt_avx512_dec_256:
; 256 bytes of input
; aes_ecb_dec_256
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu64 zmm0, [r10]
- vmovdqu64 zmm1, [r10+64]
- vmovdqu64 zmm2, [r10+128]
- vmovdqu64 zmm3, [r10+192]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ vmovdqu64 zmm0, [r12]
+ vmovdqu64 zmm1, [r12+64]
+ vmovdqu64 zmm2, [r12+128]
+ vmovdqu64 zmm3, [r12+192]
; aes_dec_block
vpxorq zmm0, zmm0, zmm8
vpxorq zmm1, zmm1, zmm8
@@ -3520,24 +3580,24 @@ L_AES_ECB_decrypt_avx512_256_aes_dec_block_last:
vaesdeclast zmm1, zmm1, zmm7
vaesdeclast zmm2, zmm2, zmm7
vaesdeclast zmm3, zmm3, zmm7
- vmovdqu64 [r11], zmm0
- vmovdqu64 [r11+64], zmm1
- vmovdqu64 [r11+128], zmm2
- vmovdqu64 [r11+192], zmm3
- add eax, 256
- cmp eax, r9d
+ vmovdqu64 [r13], zmm0
+ vmovdqu64 [r13+64], zmm1
+ vmovdqu64 [r13+128], zmm2
+ vmovdqu64 [r13+192], zmm3
+ add r10d, 256
+ cmp r10d, r11d
jl L_AES_ECB_decrypt_avx512_dec_256
L_AES_ECB_decrypt_avx512_done_256:
- mov r9d, r8d
- and r9d, 4294967232
- cmp eax, r9d
+ mov r11d, r8d
+ and r11d, 4294967232
+ cmp r10d, r11d
je L_AES_ECB_decrypt_avx512_done_64
L_AES_ECB_decrypt_avx512_dec_64:
; 64 bytes of input
; aes_ecb_dec_64
- lea r10, QWORD PTR [rcx+rax]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu64 zmm0, [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ lea r13, QWORD PTR [rdx+r10]
+ vmovdqu64 zmm0, [r12]
; aes_dec_block
vpxorq zmm0, zmm0, zmm8
vaesdec zmm0, zmm0, zmm9
@@ -3562,19 +3622,19 @@ L_AES_ECB_decrypt_avx512_dec_64:
vmovdqa64 zmm7, zmm22
L_AES_ECB_decrypt_avx512_64_aes_dec_block_last:
vaesdeclast zmm0, zmm0, zmm7
- vmovdqu64 [r11], zmm0
- add eax, 64
- cmp eax, r9d
+ vmovdqu64 [r13], zmm0
+ add r10d, 64
+ cmp r10d, r11d
jl L_AES_ECB_decrypt_avx512_dec_64
L_AES_ECB_decrypt_avx512_done_64:
- cmp eax, r8d
- mov r9d, r8d
+ cmp r10d, r8d
+ mov r11d, r8d
je L_AES_ECB_decrypt_avx512_done_dec
- and r9d, 4294967280
+ and r11d, 4294967280
L_AES_ECB_decrypt_avx512_dec_16:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- vmovdqu xmm0, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r10]
+ vmovdqu xmm0, OWORD PTR [r12]
; aes_dec_block
vpxor xmm0, xmm0, [r9]
vmovdqu xmm5, OWORD PTR [r9+16]
@@ -3610,10 +3670,10 @@ L_AES_ECB_decrypt_avx512_dec_16:
vmovdqu xmm5, OWORD PTR [r9+224]
L_AES_ECB_decrypt_avx512_16_aes_dec_block_last:
vaesdeclast xmm0, xmm0, xmm5
- lea r10, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r10], xmm0
- add eax, 16
- cmp eax, r9d
+ lea r12, QWORD PTR [rdx+r10]
+ vmovdqu OWORD PTR [r12], xmm0
+ add r10d, 16
+ cmp r10d, r11d
jl L_AES_ECB_decrypt_avx512_dec_16
L_AES_ECB_decrypt_avx512_done_dec:
vmovdqu xmm6, OWORD PTR [rsp]
@@ -3627,21 +3687,25 @@ L_AES_ECB_decrypt_avx512_done_dec:
vmovdqu xmm14, OWORD PTR [rsp+128]
vmovdqu xmm15, OWORD PTR [rsp+144]
add rsp, 160
+ pop r13
+ pop r12
ret
AES_ECB_decrypt_avx512 ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_CBC_encrypt_avx512 PROC
- mov rax, QWORD PTR [rsp+40]
- mov r10d, DWORD PTR [rsp+48]
+ push r12
+ push r13
+ mov rax, QWORD PTR [rsp+56]
+ mov r10d, DWORD PTR [rsp+64]
vmovdqu xmm0, OWORD PTR [r8]
- xor eax, eax
- cmp eax, r9d
+ xor r11d, r11d
+ cmp r11d, r9d
je L_AES_CBC_encrypt_avx512_done
L_AES_CBC_encrypt_avx512_loop:
; 16 bytes of input
- lea r10, QWORD PTR [rcx+rax]
- vmovdqu xmm1, OWORD PTR [r10]
+ lea r12, QWORD PTR [rcx+r11]
+ vmovdqu xmm1, OWORD PTR [r12]
vpternlogq xmm1, xmm0, [rax], 150
; aes_enc_block
vmovdqu xmm3, OWORD PTR [rax+16]
@@ -3677,22 +3741,26 @@ L_AES_CBC_encrypt_avx512_loop:
vmovdqu xmm3, OWORD PTR [rax+224]
L_AES_CBC_encrypt_avx512_aes_enc_block_last:
vaesenclast xmm1, xmm1, xmm3
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r11], xmm1
+ lea r13, QWORD PTR [rdx+r11]
+ vmovdqu OWORD PTR [r13], xmm1
vmovdqa xmm0, xmm1
- add eax, 16
- cmp eax, r9d
+ add r11d, 16
+ cmp r11d, r9d
jl L_AES_CBC_encrypt_avx512_loop
L_AES_CBC_encrypt_avx512_done:
vmovdqu OWORD PTR [r8], xmm0
+ pop r13
+ pop r12
ret
AES_CBC_encrypt_avx512 ENDP
_TEXT ENDS
_TEXT SEGMENT READONLY PARA
AES_CBC_decrypt_avx512 PROC
push r12
- mov rax, QWORD PTR [rsp+48]
- mov r10d, DWORD PTR [rsp+56]
+ push r13
+ push r14
+ mov rax, QWORD PTR [rsp+64]
+ mov r10d, DWORD PTR [rsp+72]
sub rsp, 160
vmovdqu OWORD PTR [rsp], xmm6
vmovdqu OWORD PTR [rsp+16], xmm7
@@ -3705,47 +3773,47 @@ AES_CBC_decrypt_avx512 PROC
vmovdqu OWORD PTR [rsp+128], xmm14
vmovdqu OWORD PTR [rsp+144], xmm15
vmovdqu xmm8, OWORD PTR [r8]
- xor eax, eax
+ xor r11d, r11d
cmp r9d, 64
jl L_AES_CBC_decrypt_avx512_done_64
- vbroadcasti32x4 zmm14, [rax]
- vbroadcasti32x4 zmm15, [rax+16]
- vbroadcasti32x4 zmm16, [rax+32]
- vbroadcasti32x4 zmm17, [rax+48]
- vbroadcasti32x4 zmm18, [rax+64]
- vbroadcasti32x4 zmm19, [rax+80]
- vbroadcasti32x4 zmm20, [rax+96]
- vbroadcasti32x4 zmm21, [rax+112]
- vbroadcasti32x4 zmm22, [rax+128]
- vbroadcasti32x4 zmm23, [rax+144]
- vbroadcasti32x4 zmm24, [rax+160]
+ vbroadcasti32x4 zmm14, OWORD PTR [rax]
+ vbroadcasti32x4 zmm15, OWORD PTR [rax+16]
+ vbroadcasti32x4 zmm16, OWORD PTR [rax+32]
+ vbroadcasti32x4 zmm17, OWORD PTR [rax+48]
+ vbroadcasti32x4 zmm18, OWORD PTR [rax+64]
+ vbroadcasti32x4 zmm19, OWORD PTR [rax+80]
+ vbroadcasti32x4 zmm20, OWORD PTR [rax+96]
+ vbroadcasti32x4 zmm21, OWORD PTR [rax+112]
+ vbroadcasti32x4 zmm22, OWORD PTR [rax+128]
+ vbroadcasti32x4 zmm23, OWORD PTR [rax+144]
+ vbroadcasti32x4 zmm24, OWORD PTR [rax+160]
cmp r10d, 11
jl L_AES_CBC_decrypt_avx512_key_cached
- vbroadcasti32x4 zmm25, [rax+176]
- vbroadcasti32x4 zmm26, [rax+192]
+ vbroadcasti32x4 zmm25, OWORD PTR [rax+176]
+ vbroadcasti32x4 zmm26, OWORD PTR [rax+192]
cmp r10d, 13
jl L_AES_CBC_decrypt_avx512_key_cached
- vbroadcasti32x4 zmm27, [rax+208]
- vbroadcasti32x4 zmm28, [rax+224]
+ vbroadcasti32x4 zmm27, OWORD PTR [rax+208]
+ vbroadcasti32x4 zmm28, OWORD PTR [rax+224]
L_AES_CBC_decrypt_avx512_key_cached:
cmp r9d, 256
- mov r10d, r9d
+ mov r12d, r9d
jl L_AES_CBC_decrypt_avx512_done_256
- and r10d, 4294967040
+ and r12d, 4294967040
L_AES_CBC_decrypt_avx512_dec_256:
; 256 bytes of input
; aes_cbc_dec_256
- lea r11, QWORD PTR [rcx+rax]
- lea r12, QWORD PTR [rdx+rax]
- vmovdqu64 zmm0, [r11]
- vmovdqu64 zmm1, [r11+64]
- vmovdqu64 zmm2, [r11+128]
- vmovdqu64 zmm3, [r11+192]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
+ vmovdqu64 zmm0, [r13]
+ vmovdqu64 zmm1, [r13+64]
+ vmovdqu64 zmm2, [r13+128]
+ vmovdqu64 zmm3, [r13+192]
vshufi64x2 zmm10, zmm0, zmm0, 144
vinserti32x4 zmm10, zmm10, xmm8, 0
- vmovdqu64 zmm11, [r11+48]
- vmovdqu64 zmm12, [r11+112]
- vmovdqu64 zmm13, [r11+176]
+ vmovdqu64 zmm11, [r13+48]
+ vmovdqu64 zmm12, [r13+112]
+ vmovdqu64 zmm13, [r13+176]
vextracti32x4 xmm8, zmm3, 3
; aes_dec_block
vpxorq zmm0, zmm0, zmm14
@@ -3820,24 +3888,24 @@ L_AES_CBC_decrypt_avx512_256_aes_dec_block_last:
vpxorq zmm1, zmm1, zmm11
vpxorq zmm2, zmm2, zmm12
vpxorq zmm3, zmm3, zmm13
- vmovdqu64 [r12], zmm0
- vmovdqu64 [r12+64], zmm1
- vmovdqu64 [r12+128], zmm2
- vmovdqu64 [r12+192], zmm3
- add eax, 256
- cmp eax, r10d
+ vmovdqu64 [r14], zmm0
+ vmovdqu64 [r14+64], zmm1
+ vmovdqu64 [r14+128], zmm2
+ vmovdqu64 [r14+192], zmm3
+ add r11d, 256
+ cmp r11d, r12d
jl L_AES_CBC_decrypt_avx512_dec_256
L_AES_CBC_decrypt_avx512_done_256:
- mov r10d, r9d
- and r10d, 4294967232
- cmp eax, r10d
+ mov r12d, r9d
+ and r12d, 4294967232
+ cmp r11d, r12d
je L_AES_CBC_decrypt_avx512_done_64
L_AES_CBC_decrypt_avx512_dec_64:
; 64 bytes of input
; aes_cbc_dec_64
- lea r11, QWORD PTR [rcx+rax]
- lea r12, QWORD PTR [rdx+rax]
- vmovdqu64 zmm0, [r11]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
+ vmovdqu64 zmm0, [r13]
vshufi64x2 zmm10, zmm0, zmm0, 144
vinserti32x4 zmm10, zmm10, xmm8, 0
vextracti32x4 xmm8, zmm0, 3
@@ -3866,19 +3934,19 @@ L_AES_CBC_decrypt_avx512_dec_64:
L_AES_CBC_decrypt_avx512_64_aes_dec_block_last:
vaesdeclast zmm0, zmm0, zmm9
vpxorq zmm0, zmm0, zmm10
- vmovdqu64 [r12], zmm0
- add eax, 64
- cmp eax, r10d
+ vmovdqu64 [r14], zmm0
+ add r11d, 64
+ cmp r11d, r12d
jl L_AES_CBC_decrypt_avx512_dec_64
L_AES_CBC_decrypt_avx512_done_64:
- cmp eax, r9d
- mov r10d, r9d
+ cmp r11d, r9d
+ mov r12d, r9d
je L_AES_CBC_decrypt_avx512_done_dec
- and r10d, 4294967280
+ and r12d, 4294967280
L_AES_CBC_decrypt_avx512_dec_16:
; 16 bytes of input
- lea r11, QWORD PTR [rcx+rax]
- vmovdqu xmm0, OWORD PTR [r11]
+ lea r13, QWORD PTR [rcx+r11]
+ vmovdqu xmm0, OWORD PTR [r13]
vmovdqa xmm7, xmm0
; aes_dec_block
vpxor xmm0, xmm0, [rax]
@@ -3917,10 +3985,10 @@ L_AES_CBC_decrypt_avx512_16_aes_dec_block_last:
vaesdeclast xmm0, xmm0, xmm5
vpxor xmm0, xmm0, xmm8
vmovdqa xmm8, xmm7
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r11], xmm0
- add eax, 16
- cmp eax, r10d
+ lea r13, QWORD PTR [rdx+r11]
+ vmovdqu OWORD PTR [r13], xmm0
+ add r11d, 16
+ cmp r11d, r12d
jl L_AES_CBC_decrypt_avx512_dec_16
L_AES_CBC_decrypt_avx512_done_dec:
vmovdqu OWORD PTR [r8], xmm8
@@ -3935,43 +4003,45 @@ L_AES_CBC_decrypt_avx512_done_dec:
vmovdqu xmm14, OWORD PTR [rsp+128]
vmovdqu xmm15, OWORD PTR [rsp+144]
add rsp, 160
+ pop r14
+ pop r13
pop r12
ret
AES_CBC_decrypt_avx512 ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_ctr_bswap_avx512 QWORD \
- 08090a0b0c0d0e0fh, 0001020304050607h
+L_aes_ctr_bswap_avx512 QWORD 08090a0b0c0d0e0fh, 0001020304050607h ; in memory: byte indices 0fh down to 00h, i.e. a 16-byte reversal pattern; AES_CTR_encrypt_avx512 broadcasts it into zmm8 and uses it as the vpshufb control
ptr_L_aes_ctr_bswap_avx512 QWORD L_aes_ctr_bswap_avx512
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_ctr_inc_avx512 QWORD \
- 0000000000000000h, 0000000000000000h,
- 0000000000000001h, 0000000000000000h,
- 0000000000000002h, 0000000000000000h,
- 0000000000000003h, 0000000000000000h,
- 0000000000000004h, 0000000000000000h,
- 0000000000000005h, 0000000000000000h,
- 0000000000000006h, 0000000000000000h,
- 0000000000000007h, 0000000000000000h,
- 0000000000000008h, 0000000000000000h,
- 0000000000000009h, 0000000000000000h,
- 000000000000000ah, 0000000000000000h,
- 000000000000000bh, 0000000000000000h,
- 000000000000000ch, 0000000000000000h,
- 000000000000000dh, 0000000000000000h,
- 000000000000000eh, 0000000000000000h,
- 000000000000000fh, 0000000000000000h,
- 0000000000000010h, 0000000000000000h
+L_aes_ctr_inc_avx512 QWORD 0000000000000000h, 0000000000000000h ; Increment table: 17 rows of 16 bytes, row n = (low qword n, high qword 0) for n = 0..10h
+ QWORD 0000000000000001h, 0000000000000000h ; row 1, byte offset 16
+ QWORD 0000000000000002h, 0000000000000000h
+ QWORD 0000000000000003h, 0000000000000000h
+ QWORD 0000000000000004h, 0000000000000000h ; row 4, byte offset 64
+ QWORD 0000000000000005h, 0000000000000000h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000000000008h, 0000000000000000h
+ QWORD 0000000000000009h, 0000000000000000h
+ QWORD 000000000000000ah, 0000000000000000h
+ QWORD 000000000000000bh, 0000000000000000h
+ QWORD 000000000000000ch, 0000000000000000h
+ QWORD 000000000000000dh, 0000000000000000h
+ QWORD 000000000000000eh, 0000000000000000h
+ QWORD 000000000000000fh, 0000000000000000h
+ QWORD 0000000000000010h, 0000000000000000h ; row 16, byte offset 256 (last row); NOTE(review): AES_CTR_encrypt_avx512 reads offsets 16/64/256 relative to ptr_L_aes_ctr_inc_avx512 - confirm that symbol addresses this table
ptr_L_aes_ctr_inc_avx512 QWORD L_aes_ctr_inc_avx512
_DATA ENDS
_TEXT SEGMENT READONLY PARA
AES_CTR_encrypt_avx512 PROC
- push rbx
- mov eax, DWORD PTR [rsp+48]
- mov r10, QWORD PTR [rsp+56]
+ push r12
+ push r13
+ push r14
+ mov eax, DWORD PTR [rsp+64]
+ mov r10, QWORD PTR [rsp+72]
sub rsp, 160
vmovdqu OWORD PTR [rsp], xmm6
vmovdqu OWORD PTR [rsp+16], xmm7
@@ -3984,38 +4054,38 @@ AES_CTR_encrypt_avx512 PROC
vmovdqu OWORD PTR [rsp+128], xmm14
vmovdqu OWORD PTR [rsp+144], xmm15
vbroadcasti32x4 zmm8, ptr_L_aes_ctr_bswap_avx512
- vbroadcasti32x4 zmm7, [r10]
+ vbroadcasti32x4 zmm7, OWORD PTR [r10]
vpshufb zmm7, zmm7, zmm8
- vbroadcasti32x4 zmm10, [ptr_L_aes_ctr_inc_avx512+256]
- vbroadcasti32x4 zmm11, [ptr_L_aes_ctr_inc_avx512+64]
- vbroadcasti32x4 zmm12, [ptr_L_aes_ctr_inc_avx512+16]
- xor eax, eax
+ vbroadcasti32x4 zmm10, OWORD PTR [ptr_L_aes_ctr_inc_avx512+256]
+ vbroadcasti32x4 zmm11, OWORD PTR [ptr_L_aes_ctr_inc_avx512+64]
+ vbroadcasti32x4 zmm12, OWORD PTR [ptr_L_aes_ctr_inc_avx512+16]
+ xor r11d, r11d
cmp r8d, 64
jl L_AES_CTR_encrypt_avx512_done_64
- vbroadcasti32x4 zmm14, [r9]
- vbroadcasti32x4 zmm15, [r9+16]
- vbroadcasti32x4 zmm16, [r9+32]
- vbroadcasti32x4 zmm17, [r9+48]
- vbroadcasti32x4 zmm18, [r9+64]
- vbroadcasti32x4 zmm19, [r9+80]
- vbroadcasti32x4 zmm20, [r9+96]
- vbroadcasti32x4 zmm21, [r9+112]
- vbroadcasti32x4 zmm22, [r9+128]
- vbroadcasti32x4 zmm23, [r9+144]
- vbroadcasti32x4 zmm24, [r9+160]
+ vbroadcasti32x4 zmm14, OWORD PTR [r9]
+ vbroadcasti32x4 zmm15, OWORD PTR [r9+16]
+ vbroadcasti32x4 zmm16, OWORD PTR [r9+32]
+ vbroadcasti32x4 zmm17, OWORD PTR [r9+48]
+ vbroadcasti32x4 zmm18, OWORD PTR [r9+64]
+ vbroadcasti32x4 zmm19, OWORD PTR [r9+80]
+ vbroadcasti32x4 zmm20, OWORD PTR [r9+96]
+ vbroadcasti32x4 zmm21, OWORD PTR [r9+112]
+ vbroadcasti32x4 zmm22, OWORD PTR [r9+128]
+ vbroadcasti32x4 zmm23, OWORD PTR [r9+144]
+ vbroadcasti32x4 zmm24, OWORD PTR [r9+160]
cmp eax, 11
jl L_AES_CTR_encrypt_avx512_key_cached
- vbroadcasti32x4 zmm25, [r9+176]
- vbroadcasti32x4 zmm26, [r9+192]
+ vbroadcasti32x4 zmm25, OWORD PTR [r9+176]
+ vbroadcasti32x4 zmm26, OWORD PTR [r9+192]
cmp eax, 13
jl L_AES_CTR_encrypt_avx512_key_cached
- vbroadcasti32x4 zmm27, [r9+208]
- vbroadcasti32x4 zmm28, [r9+224]
+ vbroadcasti32x4 zmm27, OWORD PTR [r9+208]
+ vbroadcasti32x4 zmm28, OWORD PTR [r9+224]
L_AES_CTR_encrypt_avx512_key_cached:
cmp r8d, 256
- mov r10d, r8d
+ mov r12d, r8d
jl L_AES_CTR_encrypt_avx512_done_256
- and r10d, 4294967040
+ and r12d, 4294967040
vmovdqa64 zmm9, zmm7
vpaddq zmm4, zmm7, [ptr_L_aes_ctr_inc_avx512]
vpternlogq zmm9, zmm4, [ptr_L_aes_ctr_inc_avx512], 178
@@ -4042,8 +4112,8 @@ L_AES_CTR_encrypt_avx512_key_cached:
vpaddq zmm7, zmm7, zmm9
L_AES_CTR_encrypt_avx512_enc_256:
; 256 bytes of input
- lea r11, QWORD PTR [rcx+rax]
- lea rbx, QWORD PTR [rdx+rax]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
vpshufb zmm0, zmm4, zmm8
vpshufb zmm1, zmm5, zmm8
vpshufb zmm2, zmm6, zmm8
@@ -4141,28 +4211,28 @@ L_AES_CTR_encrypt_avx512_256_aes_enc_block_last:
vaesenclast zmm1, zmm1, zmm13
vaesenclast zmm2, zmm2, zmm13
vaesenclast zmm3, zmm3, zmm13
- vpxorq zmm0, zmm0, [r11]
- vpxorq zmm1, zmm1, [r11+64]
- vpxorq zmm2, zmm2, [r11+128]
- vpxorq zmm3, zmm3, [r11+192]
- vmovdqu64 [rbx], zmm0
- vmovdqu64 [rbx+64], zmm1
- vmovdqu64 [rbx+128], zmm2
- vmovdqu64 [rbx+192], zmm3
- add eax, 256
- cmp eax, r10d
+ vpxorq zmm0, zmm0, [r13]
+ vpxorq zmm1, zmm1, [r13+64]
+ vpxorq zmm2, zmm2, [r13+128]
+ vpxorq zmm3, zmm3, [r13+192]
+ vmovdqu64 [r14], zmm0
+ vmovdqu64 [r14+64], zmm1
+ vmovdqu64 [r14+128], zmm2
+ vmovdqu64 [r14+192], zmm3
+ add r11d, 256
+ cmp r11d, r12d
jl L_AES_CTR_encrypt_avx512_enc_256
vshufi64x2 zmm7, zmm4, zmm4, 0
L_AES_CTR_encrypt_avx512_done_256:
- mov r10d, r8d
- and r10d, 4294967232
- cmp eax, r10d
+ mov r12d, r8d
+ and r12d, 4294967232
+ cmp r11d, r12d
je L_AES_CTR_encrypt_avx512_done_64
L_AES_CTR_encrypt_avx512_enc_64:
; 64 bytes of input
; aes_ctr_enc_64
- lea r11, QWORD PTR [rcx+rax]
- lea rbx, QWORD PTR [rdx+rax]
+ lea r13, QWORD PTR [rcx+r11]
+ lea r14, QWORD PTR [rdx+r11]
vpaddq zmm0, zmm7, [ptr_L_aes_ctr_inc_avx512]
vmovdqa64 zmm9, zmm7
vpternlogq zmm9, zmm0, [ptr_L_aes_ctr_inc_avx512], 178
@@ -4200,16 +4270,16 @@ L_AES_CTR_encrypt_avx512_enc_64:
vmovdqa64 zmm13, zmm28
L_AES_CTR_encrypt_avx512_64_aes_enc_block_last:
vaesenclast zmm0, zmm0, zmm13
- vpxorq zmm0, zmm0, [r11]
- vmovdqu64 [rbx], zmm0
- add eax, 64
- cmp eax, r10d
+ vpxorq zmm0, zmm0, [r13]
+ vmovdqu64 [r14], zmm0
+ add r11d, 64
+ cmp r11d, r12d
jl L_AES_CTR_encrypt_avx512_enc_64
L_AES_CTR_encrypt_avx512_done_64:
- cmp eax, r8d
- mov r10d, r8d
+ cmp r11d, r8d
+ mov r12d, r8d
je L_AES_CTR_encrypt_avx512_done_enc
- and r10d, 4294967280
+ and r12d, 4294967280
L_AES_CTR_encrypt_avx512_enc_16:
; 16 bytes of input
vpshufb xmm0, xmm7, xmm8
@@ -4254,12 +4324,12 @@ L_AES_CTR_encrypt_avx512_enc_16:
vmovdqu xmm5, OWORD PTR [r9+224]
L_AES_CTR_encrypt_avx512_16_aes_enc_block_last:
vaesenclast xmm0, xmm0, xmm5
- lea r11, QWORD PTR [rcx+rax]
- vpxor xmm0, xmm0, [r11]
- lea r11, QWORD PTR [rdx+rax]
- vmovdqu OWORD PTR [r11], xmm0
- add eax, 16
- cmp eax, r10d
+ lea r13, QWORD PTR [rcx+r11]
+ vpxor xmm0, xmm0, [r13]
+ lea r13, QWORD PTR [rdx+r11]
+ vmovdqu OWORD PTR [r13], xmm0
+ add r11d, 16
+ cmp r11d, r12d
jl L_AES_CTR_encrypt_avx512_enc_16
L_AES_CTR_encrypt_avx512_done_enc:
vpshufb xmm0, xmm7, xmm8
@@ -4275,7 +4345,9 @@ L_AES_CTR_encrypt_avx512_done_enc:
vmovdqu xmm14, OWORD PTR [rsp+128]
vmovdqu xmm15, OWORD PTR [rsp+144]
add rsp, 160
- pop rbx
+ pop r14
+ pop r13
+ pop r12
ret
AES_CTR_encrypt_avx512 ENDP
_TEXT ENDS
diff --git a/wolfcrypt/src/aes_xts_asm.asm b/wolfcrypt/src/aes_xts_asm.asm
index a904ffa4ce7..d11e836fa0e 100644
--- a/wolfcrypt/src/aes_xts_asm.asm
+++ b/wolfcrypt/src/aes_xts_asm.asm
@@ -85,8 +85,7 @@ AES_XTS_init_aesni ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_aes_xts_gc_xts DWORD \
- 00000087h, 00000001h, 00000001h, 00000001h
+L_aes_xts_gc_xts DWORD 00000087h, 00000001h, 00000001h, 00000001h ; four dwords 87h, 1, 1, 1; presumably the per-dword carry constants for XTS tweak doubling (87h being the GF(2^128) reduction value) - TODO confirm at the use site
ptr_L_aes_xts_gc_xts QWORD L_aes_xts_gc_xts
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -1500,8 +1499,7 @@ AES_XTS_init_avx1 ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_avx1_aes_xts_gc_xts DWORD \
- 00000087h, 00000001h, 00000001h, 00000001h
+L_avx1_aes_xts_gc_xts DWORD 00000087h, 00000001h, 00000001h, 00000001h ; four dwords 87h, 1, 1, 1 (same values as L_aes_xts_gc_xts); presumably the AVX1 copy of the XTS tweak-doubling constants - TODO confirm at the use site
ptr_L_avx1_aes_xts_gc_xts QWORD L_avx1_aes_xts_gc_xts
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -2876,28 +2874,24 @@ AES_XTS_init_vaes ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_xts_gc_xts DWORD \
- 00000087h, 00000000h, 00000001h, 00000000h
+L_vaes_aes_xts_gc_xts DWORD 00000087h, 00000000h, 00000001h, 00000000h
ptr_L_vaes_aes_xts_gc_xts QWORD L_vaes_aes_xts_gc_xts
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_xts_poly DWORD \
- 00000087h, 00000000h, 00000000h, 00000000h
+L_vaes_aes_xts_poly DWORD 00000087h, 00000000h, 00000000h, 00000000h
ptr_L_vaes_aes_xts_poly QWORD L_vaes_aes_xts_poly
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_xts_shl DWORD \
- 00000000h, 00000000h, 00000000h, 00000000h,
- 00000001h, 00000000h, 00000001h, 00000000h
+L_vaes_aes_xts_shl DWORD 00000000h, 00000000h, 00000000h, 00000000h
+ DWORD 00000001h, 00000000h, 00000001h, 00000000h
ptr_L_vaes_aes_xts_shl QWORD L_vaes_aes_xts_shl
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_vaes_aes_xts_shr DWORD \
- 00000040h, 00000000h, 00000040h, 00000000h,
- 0000003fh, 00000000h, 0000003fh, 00000000h
+L_vaes_aes_xts_shr DWORD 00000040h, 00000000h, 00000040h, 00000000h
+ DWORD 0000003fh, 00000000h, 0000003fh, 00000000h
ptr_L_vaes_aes_xts_shr QWORD L_vaes_aes_xts_shr
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -3006,7 +3000,7 @@ L_AES_XTS_encrypt_vaes_enc_128:
vmovdqu ymm2, YMMWORD PTR [rcx+64]
vmovdqu ymm3, YMMWORD PTR [rcx+96]
; aes_enc_block
- vbroadcasti128 ymm9, [r8]
+ vbroadcasti128 ymm9, OWORD PTR [r8]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
vpxor ymm1, ymm1, ymm5
@@ -3015,76 +3009,76 @@ L_AES_XTS_encrypt_vaes_enc_128:
vpxor ymm2, ymm2, ymm9
vpxor ymm3, ymm3, ymm7
vpxor ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+16]
+ vbroadcasti128 ymm9, OWORD PTR [r8+16]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+32]
+ vbroadcasti128 ymm9, OWORD PTR [r8+32]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+48]
+ vbroadcasti128 ymm9, OWORD PTR [r8+48]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+64]
+ vbroadcasti128 ymm9, OWORD PTR [r8+64]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+80]
+ vbroadcasti128 ymm9, OWORD PTR [r8+80]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+96]
+ vbroadcasti128 ymm9, OWORD PTR [r8+96]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+112]
+ vbroadcasti128 ymm9, OWORD PTR [r8+112]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+128]
+ vbroadcasti128 ymm9, OWORD PTR [r8+128]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+144]
+ vbroadcasti128 ymm9, OWORD PTR [r8+144]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
cmp r10d, 11
- vbroadcasti128 ymm9, [r8+160]
+ vbroadcasti128 ymm9, OWORD PTR [r8+160]
jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+176]
+ vbroadcasti128 ymm9, OWORD PTR [r8+176]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
cmp r10d, 13
- vbroadcasti128 ymm9, [r8+192]
+ vbroadcasti128 ymm9, OWORD PTR [r8+192]
jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+208]
+ vbroadcasti128 ymm9, OWORD PTR [r8+208]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+224]
+ vbroadcasti128 ymm9, OWORD PTR [r8+224]
L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last:
vaesenclast ymm0, ymm0, ymm9
vaesenclast ymm1, ymm1, ymm9
@@ -3151,55 +3145,55 @@ L_AES_XTS_encrypt_vaes_done_128:
vpxor ymm5, ymm5, ymm10
vpxor ymm5, ymm5, ymm9
; aes_enc_block
- vbroadcasti128 ymm9, [r8]
+ vbroadcasti128 ymm9, OWORD PTR [r8]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
vpxor ymm1, ymm1, ymm5
vpxor ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+16]
+ vbroadcasti128 ymm9, OWORD PTR [r8+16]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+32]
+ vbroadcasti128 ymm9, OWORD PTR [r8+32]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+48]
+ vbroadcasti128 ymm9, OWORD PTR [r8+48]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+64]
+ vbroadcasti128 ymm9, OWORD PTR [r8+64]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+80]
+ vbroadcasti128 ymm9, OWORD PTR [r8+80]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+96]
+ vbroadcasti128 ymm9, OWORD PTR [r8+96]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+112]
+ vbroadcasti128 ymm9, OWORD PTR [r8+112]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+128]
+ vbroadcasti128 ymm9, OWORD PTR [r8+128]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+144]
+ vbroadcasti128 ymm9, OWORD PTR [r8+144]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
cmp r10d, 11
- vbroadcasti128 ymm9, [r8+160]
+ vbroadcasti128 ymm9, OWORD PTR [r8+160]
jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+176]
+ vbroadcasti128 ymm9, OWORD PTR [r8+176]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
cmp r10d, 13
- vbroadcasti128 ymm9, [r8+192]
+ vbroadcasti128 ymm9, OWORD PTR [r8+192]
jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+208]
+ vbroadcasti128 ymm9, OWORD PTR [r8+208]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+224]
+ vbroadcasti128 ymm9, OWORD PTR [r8+224]
L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last:
vaesenclast ymm0, ymm0, ymm9
vaesenclast ymm1, ymm1, ymm9
@@ -3232,40 +3226,40 @@ L_AES_XTS_encrypt_vaes_done_64:
vpxor ymm4, ymm4, ymm7
vpxor ymm4, ymm4, ymm6
; aes_enc_block
- vbroadcasti128 ymm9, [r8]
+ vbroadcasti128 ymm9, OWORD PTR [r8]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+16]
+ vbroadcasti128 ymm9, OWORD PTR [r8+16]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+32]
+ vbroadcasti128 ymm9, OWORD PTR [r8+32]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+48]
+ vbroadcasti128 ymm9, OWORD PTR [r8+48]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+64]
+ vbroadcasti128 ymm9, OWORD PTR [r8+64]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+80]
+ vbroadcasti128 ymm9, OWORD PTR [r8+80]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+96]
+ vbroadcasti128 ymm9, OWORD PTR [r8+96]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+112]
+ vbroadcasti128 ymm9, OWORD PTR [r8+112]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+128]
+ vbroadcasti128 ymm9, OWORD PTR [r8+128]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+144]
+ vbroadcasti128 ymm9, OWORD PTR [r8+144]
vaesenc ymm0, ymm0, ymm9
cmp r10d, 11
- vbroadcasti128 ymm9, [r8+160]
+ vbroadcasti128 ymm9, OWORD PTR [r8+160]
jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+176]
+ vbroadcasti128 ymm9, OWORD PTR [r8+176]
vaesenc ymm0, ymm0, ymm9
cmp r10d, 13
- vbroadcasti128 ymm9, [r8+192]
+ vbroadcasti128 ymm9, OWORD PTR [r8+192]
jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+208]
+ vbroadcasti128 ymm9, OWORD PTR [r8+208]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+224]
+ vbroadcasti128 ymm9, OWORD PTR [r8+224]
L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last:
vaesenclast ymm0, ymm0, ymm9
vpxor ymm0, ymm0, ymm4
@@ -3485,7 +3479,7 @@ L_AES_XTS_encrypt_update_vaes_enc_128:
vmovdqu ymm2, YMMWORD PTR [rcx+64]
vmovdqu ymm3, YMMWORD PTR [rcx+96]
; aes_enc_block
- vbroadcasti128 ymm9, [r10]
+ vbroadcasti128 ymm9, OWORD PTR [r10]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
vpxor ymm1, ymm1, ymm5
@@ -3494,76 +3488,76 @@ L_AES_XTS_encrypt_update_vaes_enc_128:
vpxor ymm2, ymm2, ymm9
vpxor ymm3, ymm3, ymm7
vpxor ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+16]
+ vbroadcasti128 ymm9, OWORD PTR [r10+16]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+32]
+ vbroadcasti128 ymm9, OWORD PTR [r10+32]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+48]
+ vbroadcasti128 ymm9, OWORD PTR [r10+48]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+64]
+ vbroadcasti128 ymm9, OWORD PTR [r10+64]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+80]
+ vbroadcasti128 ymm9, OWORD PTR [r10+80]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+96]
+ vbroadcasti128 ymm9, OWORD PTR [r10+96]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+112]
+ vbroadcasti128 ymm9, OWORD PTR [r10+112]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+128]
+ vbroadcasti128 ymm9, OWORD PTR [r10+128]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+144]
+ vbroadcasti128 ymm9, OWORD PTR [r10+144]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
cmp r9d, 11
- vbroadcasti128 ymm9, [r10+160]
+ vbroadcasti128 ymm9, OWORD PTR [r10+160]
jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+176]
+ vbroadcasti128 ymm9, OWORD PTR [r10+176]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
cmp r9d, 13
- vbroadcasti128 ymm9, [r10+192]
+ vbroadcasti128 ymm9, OWORD PTR [r10+192]
jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+208]
+ vbroadcasti128 ymm9, OWORD PTR [r10+208]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
vaesenc ymm2, ymm2, ymm9
vaesenc ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+224]
+ vbroadcasti128 ymm9, OWORD PTR [r10+224]
L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last:
vaesenclast ymm0, ymm0, ymm9
vaesenclast ymm1, ymm1, ymm9
@@ -3630,55 +3624,55 @@ L_AES_XTS_encrypt_update_vaes_done_128:
vpxor ymm5, ymm5, ymm10
vpxor ymm5, ymm5, ymm9
; aes_enc_block
- vbroadcasti128 ymm9, [r10]
+ vbroadcasti128 ymm9, OWORD PTR [r10]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
vpxor ymm1, ymm1, ymm5
vpxor ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+16]
+ vbroadcasti128 ymm9, OWORD PTR [r10+16]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+32]
+ vbroadcasti128 ymm9, OWORD PTR [r10+32]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+48]
+ vbroadcasti128 ymm9, OWORD PTR [r10+48]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+64]
+ vbroadcasti128 ymm9, OWORD PTR [r10+64]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+80]
+ vbroadcasti128 ymm9, OWORD PTR [r10+80]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+96]
+ vbroadcasti128 ymm9, OWORD PTR [r10+96]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+112]
+ vbroadcasti128 ymm9, OWORD PTR [r10+112]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+128]
+ vbroadcasti128 ymm9, OWORD PTR [r10+128]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+144]
+ vbroadcasti128 ymm9, OWORD PTR [r10+144]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
cmp r9d, 11
- vbroadcasti128 ymm9, [r10+160]
+ vbroadcasti128 ymm9, OWORD PTR [r10+160]
jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+176]
+ vbroadcasti128 ymm9, OWORD PTR [r10+176]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
cmp r9d, 13
- vbroadcasti128 ymm9, [r10+192]
+ vbroadcasti128 ymm9, OWORD PTR [r10+192]
jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+208]
+ vbroadcasti128 ymm9, OWORD PTR [r10+208]
vaesenc ymm0, ymm0, ymm9
vaesenc ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+224]
+ vbroadcasti128 ymm9, OWORD PTR [r10+224]
L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last:
vaesenclast ymm0, ymm0, ymm9
vaesenclast ymm1, ymm1, ymm9
@@ -3711,40 +3705,40 @@ L_AES_XTS_encrypt_update_vaes_done_64:
vpxor ymm4, ymm4, ymm7
vpxor ymm4, ymm4, ymm6
; aes_enc_block
- vbroadcasti128 ymm9, [r10]
+ vbroadcasti128 ymm9, OWORD PTR [r10]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+16]
+ vbroadcasti128 ymm9, OWORD PTR [r10+16]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+32]
+ vbroadcasti128 ymm9, OWORD PTR [r10+32]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+48]
+ vbroadcasti128 ymm9, OWORD PTR [r10+48]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+64]
+ vbroadcasti128 ymm9, OWORD PTR [r10+64]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+80]
+ vbroadcasti128 ymm9, OWORD PTR [r10+80]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+96]
+ vbroadcasti128 ymm9, OWORD PTR [r10+96]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+112]
+ vbroadcasti128 ymm9, OWORD PTR [r10+112]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+128]
+ vbroadcasti128 ymm9, OWORD PTR [r10+128]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+144]
+ vbroadcasti128 ymm9, OWORD PTR [r10+144]
vaesenc ymm0, ymm0, ymm9
cmp r9d, 11
- vbroadcasti128 ymm9, [r10+160]
+ vbroadcasti128 ymm9, OWORD PTR [r10+160]
jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+176]
+ vbroadcasti128 ymm9, OWORD PTR [r10+176]
vaesenc ymm0, ymm0, ymm9
cmp r9d, 13
- vbroadcasti128 ymm9, [r10+192]
+ vbroadcasti128 ymm9, OWORD PTR [r10+192]
jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+208]
+ vbroadcasti128 ymm9, OWORD PTR [r10+208]
vaesenc ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+224]
+ vbroadcasti128 ymm9, OWORD PTR [r10+224]
L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last:
vaesenclast ymm0, ymm0, ymm9
vpxor ymm0, ymm0, ymm4
@@ -4008,7 +4002,7 @@ L_AES_XTS_decrypt_vaes_dec_128:
vmovdqu ymm2, YMMWORD PTR [rcx+64]
vmovdqu ymm3, YMMWORD PTR [rcx+96]
; aes_dec_block
- vbroadcasti128 ymm9, [r8]
+ vbroadcasti128 ymm9, OWORD PTR [r8]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
vpxor ymm1, ymm1, ymm5
@@ -4017,76 +4011,76 @@ L_AES_XTS_decrypt_vaes_dec_128:
vpxor ymm2, ymm2, ymm9
vpxor ymm3, ymm3, ymm7
vpxor ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+16]
+ vbroadcasti128 ymm9, OWORD PTR [r8+16]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+32]
+ vbroadcasti128 ymm9, OWORD PTR [r8+32]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+48]
+ vbroadcasti128 ymm9, OWORD PTR [r8+48]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+64]
+ vbroadcasti128 ymm9, OWORD PTR [r8+64]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+80]
+ vbroadcasti128 ymm9, OWORD PTR [r8+80]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+96]
+ vbroadcasti128 ymm9, OWORD PTR [r8+96]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+112]
+ vbroadcasti128 ymm9, OWORD PTR [r8+112]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+128]
+ vbroadcasti128 ymm9, OWORD PTR [r8+128]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+144]
+ vbroadcasti128 ymm9, OWORD PTR [r8+144]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
cmp r10d, 11
- vbroadcasti128 ymm9, [r8+160]
+ vbroadcasti128 ymm9, OWORD PTR [r8+160]
jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+176]
+ vbroadcasti128 ymm9, OWORD PTR [r8+176]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
cmp r10d, 13
- vbroadcasti128 ymm9, [r8+192]
+ vbroadcasti128 ymm9, OWORD PTR [r8+192]
jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+208]
+ vbroadcasti128 ymm9, OWORD PTR [r8+208]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r8+224]
+ vbroadcasti128 ymm9, OWORD PTR [r8+224]
L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last:
vaesdeclast ymm0, ymm0, ymm9
vaesdeclast ymm1, ymm1, ymm9
@@ -4164,55 +4158,55 @@ L_AES_XTS_decrypt_vaes_mul16_64:
vpxor ymm5, ymm5, ymm10
vpxor ymm5, ymm5, ymm9
; aes_dec_block
- vbroadcasti128 ymm9, [r8]
+ vbroadcasti128 ymm9, OWORD PTR [r8]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
vpxor ymm1, ymm1, ymm5
vpxor ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+16]
+ vbroadcasti128 ymm9, OWORD PTR [r8+16]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+32]
+ vbroadcasti128 ymm9, OWORD PTR [r8+32]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+48]
+ vbroadcasti128 ymm9, OWORD PTR [r8+48]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+64]
+ vbroadcasti128 ymm9, OWORD PTR [r8+64]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+80]
+ vbroadcasti128 ymm9, OWORD PTR [r8+80]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+96]
+ vbroadcasti128 ymm9, OWORD PTR [r8+96]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+112]
+ vbroadcasti128 ymm9, OWORD PTR [r8+112]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+128]
+ vbroadcasti128 ymm9, OWORD PTR [r8+128]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+144]
+ vbroadcasti128 ymm9, OWORD PTR [r8+144]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
cmp r10d, 11
- vbroadcasti128 ymm9, [r8+160]
+ vbroadcasti128 ymm9, OWORD PTR [r8+160]
jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+176]
+ vbroadcasti128 ymm9, OWORD PTR [r8+176]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
cmp r10d, 13
- vbroadcasti128 ymm9, [r8+192]
+ vbroadcasti128 ymm9, OWORD PTR [r8+192]
jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+208]
+ vbroadcasti128 ymm9, OWORD PTR [r8+208]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r8+224]
+ vbroadcasti128 ymm9, OWORD PTR [r8+224]
L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last:
vaesdeclast ymm0, ymm0, ymm9
vaesdeclast ymm1, ymm1, ymm9
@@ -4256,40 +4250,40 @@ L_AES_XTS_decrypt_vaes_mul16_32:
vpxor ymm4, ymm4, ymm7
vpxor ymm4, ymm4, ymm6
; aes_dec_block
- vbroadcasti128 ymm9, [r8]
+ vbroadcasti128 ymm9, OWORD PTR [r8]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+16]
+ vbroadcasti128 ymm9, OWORD PTR [r8+16]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+32]
+ vbroadcasti128 ymm9, OWORD PTR [r8+32]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+48]
+ vbroadcasti128 ymm9, OWORD PTR [r8+48]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+64]
+ vbroadcasti128 ymm9, OWORD PTR [r8+64]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+80]
+ vbroadcasti128 ymm9, OWORD PTR [r8+80]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+96]
+ vbroadcasti128 ymm9, OWORD PTR [r8+96]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+112]
+ vbroadcasti128 ymm9, OWORD PTR [r8+112]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+128]
+ vbroadcasti128 ymm9, OWORD PTR [r8+128]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+144]
+ vbroadcasti128 ymm9, OWORD PTR [r8+144]
vaesdec ymm0, ymm0, ymm9
cmp r10d, 11
- vbroadcasti128 ymm9, [r8+160]
+ vbroadcasti128 ymm9, OWORD PTR [r8+160]
jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+176]
+ vbroadcasti128 ymm9, OWORD PTR [r8+176]
vaesdec ymm0, ymm0, ymm9
cmp r10d, 13
- vbroadcasti128 ymm9, [r8+192]
+ vbroadcasti128 ymm9, OWORD PTR [r8+192]
jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+208]
+ vbroadcasti128 ymm9, OWORD PTR [r8+208]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r8+224]
+ vbroadcasti128 ymm9, OWORD PTR [r8+224]
L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last:
vaesdeclast ymm0, ymm0, ymm9
vpxor ymm0, ymm0, ymm4
@@ -4561,7 +4555,7 @@ L_AES_XTS_decrypt_update_vaes_dec_128:
vmovdqu ymm2, YMMWORD PTR [rcx+64]
vmovdqu ymm3, YMMWORD PTR [rcx+96]
; aes_dec_block
- vbroadcasti128 ymm9, [r10]
+ vbroadcasti128 ymm9, OWORD PTR [r10]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
vpxor ymm1, ymm1, ymm5
@@ -4570,76 +4564,76 @@ L_AES_XTS_decrypt_update_vaes_dec_128:
vpxor ymm2, ymm2, ymm9
vpxor ymm3, ymm3, ymm7
vpxor ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+16]
+ vbroadcasti128 ymm9, OWORD PTR [r10+16]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+32]
+ vbroadcasti128 ymm9, OWORD PTR [r10+32]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+48]
+ vbroadcasti128 ymm9, OWORD PTR [r10+48]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+64]
+ vbroadcasti128 ymm9, OWORD PTR [r10+64]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+80]
+ vbroadcasti128 ymm9, OWORD PTR [r10+80]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+96]
+ vbroadcasti128 ymm9, OWORD PTR [r10+96]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+112]
+ vbroadcasti128 ymm9, OWORD PTR [r10+112]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+128]
+ vbroadcasti128 ymm9, OWORD PTR [r10+128]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+144]
+ vbroadcasti128 ymm9, OWORD PTR [r10+144]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
cmp r9d, 11
- vbroadcasti128 ymm9, [r10+160]
+ vbroadcasti128 ymm9, OWORD PTR [r10+160]
jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+176]
+ vbroadcasti128 ymm9, OWORD PTR [r10+176]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
cmp r9d, 13
- vbroadcasti128 ymm9, [r10+192]
+ vbroadcasti128 ymm9, OWORD PTR [r10+192]
jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+208]
+ vbroadcasti128 ymm9, OWORD PTR [r10+208]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
vaesdec ymm2, ymm2, ymm9
vaesdec ymm3, ymm3, ymm9
- vbroadcasti128 ymm9, [r10+224]
+ vbroadcasti128 ymm9, OWORD PTR [r10+224]
L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last:
vaesdeclast ymm0, ymm0, ymm9
vaesdeclast ymm1, ymm1, ymm9
@@ -4717,55 +4711,55 @@ L_AES_XTS_decrypt_update_vaes_mul16_64:
vpxor ymm5, ymm5, ymm10
vpxor ymm5, ymm5, ymm9
; aes_dec_block
- vbroadcasti128 ymm9, [r10]
+ vbroadcasti128 ymm9, OWORD PTR [r10]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
vpxor ymm1, ymm1, ymm5
vpxor ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+16]
+ vbroadcasti128 ymm9, OWORD PTR [r10+16]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+32]
+ vbroadcasti128 ymm9, OWORD PTR [r10+32]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+48]
+ vbroadcasti128 ymm9, OWORD PTR [r10+48]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+64]
+ vbroadcasti128 ymm9, OWORD PTR [r10+64]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+80]
+ vbroadcasti128 ymm9, OWORD PTR [r10+80]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+96]
+ vbroadcasti128 ymm9, OWORD PTR [r10+96]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+112]
+ vbroadcasti128 ymm9, OWORD PTR [r10+112]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+128]
+ vbroadcasti128 ymm9, OWORD PTR [r10+128]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+144]
+ vbroadcasti128 ymm9, OWORD PTR [r10+144]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
cmp r9d, 11
- vbroadcasti128 ymm9, [r10+160]
+ vbroadcasti128 ymm9, OWORD PTR [r10+160]
jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+176]
+ vbroadcasti128 ymm9, OWORD PTR [r10+176]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
cmp r9d, 13
- vbroadcasti128 ymm9, [r10+192]
+ vbroadcasti128 ymm9, OWORD PTR [r10+192]
jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+208]
+ vbroadcasti128 ymm9, OWORD PTR [r10+208]
vaesdec ymm0, ymm0, ymm9
vaesdec ymm1, ymm1, ymm9
- vbroadcasti128 ymm9, [r10+224]
+ vbroadcasti128 ymm9, OWORD PTR [r10+224]
L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last:
vaesdeclast ymm0, ymm0, ymm9
vaesdeclast ymm1, ymm1, ymm9
@@ -4809,40 +4803,40 @@ L_AES_XTS_decrypt_update_vaes_mul16_32:
vpxor ymm4, ymm4, ymm7
vpxor ymm4, ymm4, ymm6
; aes_dec_block
- vbroadcasti128 ymm9, [r10]
+ vbroadcasti128 ymm9, OWORD PTR [r10]
vpxor ymm0, ymm0, ymm4
vpxor ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+16]
+ vbroadcasti128 ymm9, OWORD PTR [r10+16]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+32]
+ vbroadcasti128 ymm9, OWORD PTR [r10+32]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+48]
+ vbroadcasti128 ymm9, OWORD PTR [r10+48]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+64]
+ vbroadcasti128 ymm9, OWORD PTR [r10+64]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+80]
+ vbroadcasti128 ymm9, OWORD PTR [r10+80]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+96]
+ vbroadcasti128 ymm9, OWORD PTR [r10+96]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+112]
+ vbroadcasti128 ymm9, OWORD PTR [r10+112]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+128]
+ vbroadcasti128 ymm9, OWORD PTR [r10+128]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+144]
+ vbroadcasti128 ymm9, OWORD PTR [r10+144]
vaesdec ymm0, ymm0, ymm9
cmp r9d, 11
- vbroadcasti128 ymm9, [r10+160]
+ vbroadcasti128 ymm9, OWORD PTR [r10+160]
jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+176]
+ vbroadcasti128 ymm9, OWORD PTR [r10+176]
vaesdec ymm0, ymm0, ymm9
cmp r9d, 13
- vbroadcasti128 ymm9, [r10+192]
+ vbroadcasti128 ymm9, OWORD PTR [r10+192]
jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+208]
+ vbroadcasti128 ymm9, OWORD PTR [r10+208]
vaesdec ymm0, ymm0, ymm9
- vbroadcasti128 ymm9, [r10+224]
+ vbroadcasti128 ymm9, OWORD PTR [r10+224]
L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last:
vaesdeclast ymm0, ymm0, ymm9
vpxor ymm0, ymm0, ymm4
@@ -5084,32 +5078,28 @@ AES_XTS_init_avx512 ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_xts_gc_xts DWORD \
- 00000087h, 00000000h, 00000001h, 00000000h
+L_avx512_aes_xts_gc_xts DWORD 00000087h, 00000000h, 00000001h, 00000000h
ptr_L_avx512_aes_xts_gc_xts QWORD L_avx512_aes_xts_gc_xts
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_xts_poly DWORD \
- 00000087h, 00000000h, 00000000h, 00000000h
+L_avx512_aes_xts_poly DWORD 00000087h, 00000000h, 00000000h, 00000000h
ptr_L_avx512_aes_xts_poly QWORD L_avx512_aes_xts_poly
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_xts_shl DWORD \
- 00000000h, 00000000h, 00000000h, 00000000h,
- 00000001h, 00000000h, 00000001h, 00000000h,
- 00000002h, 00000000h, 00000002h, 00000000h,
- 00000003h, 00000000h, 00000003h, 00000000h
+L_avx512_aes_xts_shl DWORD 00000000h, 00000000h, 00000000h, 00000000h
+ DWORD 00000001h, 00000000h, 00000001h, 00000000h
+ DWORD 00000002h, 00000000h, 00000002h, 00000000h
+ DWORD 00000003h, 00000000h, 00000003h, 00000000h
ptr_L_avx512_aes_xts_shl QWORD L_avx512_aes_xts_shl
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_avx512_aes_xts_shr DWORD \
- 00000040h, 00000000h, 00000040h, 00000000h,
- 0000003fh, 00000000h, 0000003fh, 00000000h,
- 0000003eh, 00000000h, 0000003eh, 00000000h,
- 0000003dh, 00000000h, 0000003dh, 00000000h
+L_avx512_aes_xts_shr DWORD 00000040h, 00000000h, 00000040h, 00000000h
+ DWORD 0000003fh, 00000000h, 0000003fh, 00000000h
+ DWORD 0000003eh, 00000000h, 0000003eh, 00000000h
+ DWORD 0000003dh, 00000000h, 0000003dh, 00000000h
ptr_L_avx512_aes_xts_shr QWORD L_avx512_aes_xts_shr
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -5179,25 +5169,25 @@ L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last:
xor r13d, r13d
cmp eax, 32
jl L_AES_XTS_encrypt_avx512_done_128
- vbroadcasti32x4 zmm16, [r8]
- vbroadcasti32x4 zmm17, [r8+16]
- vbroadcasti32x4 zmm18, [r8+32]
- vbroadcasti32x4 zmm19, [r8+48]
- vbroadcasti32x4 zmm20, [r8+64]
- vbroadcasti32x4 zmm21, [r8+80]
- vbroadcasti32x4 zmm22, [r8+96]
- vbroadcasti32x4 zmm23, [r8+112]
- vbroadcasti32x4 zmm24, [r8+128]
- vbroadcasti32x4 zmm25, [r8+144]
- vbroadcasti32x4 zmm26, [r8+160]
+ vbroadcasti32x4 zmm16, OWORD PTR [r8]
+ vbroadcasti32x4 zmm17, OWORD PTR [r8+16]
+ vbroadcasti32x4 zmm18, OWORD PTR [r8+32]
+ vbroadcasti32x4 zmm19, OWORD PTR [r8+48]
+ vbroadcasti32x4 zmm20, OWORD PTR [r8+64]
+ vbroadcasti32x4 zmm21, OWORD PTR [r8+80]
+ vbroadcasti32x4 zmm22, OWORD PTR [r8+96]
+ vbroadcasti32x4 zmm23, OWORD PTR [r8+112]
+ vbroadcasti32x4 zmm24, OWORD PTR [r8+128]
+ vbroadcasti32x4 zmm25, OWORD PTR [r8+144]
+ vbroadcasti32x4 zmm26, OWORD PTR [r8+160]
cmp r10d, 11
jl L_AES_XTS_encrypt_avx512_key_cached
- vbroadcasti32x4 zmm27, [r8+176]
- vbroadcasti32x4 zmm28, [r8+192]
+ vbroadcasti32x4 zmm27, OWORD PTR [r8+176]
+ vbroadcasti32x4 zmm28, OWORD PTR [r8+192]
cmp r10d, 13
jl L_AES_XTS_encrypt_avx512_key_cached
- vbroadcasti32x4 zmm29, [r8+208]
- vbroadcasti32x4 zmm30, [r8+224]
+ vbroadcasti32x4 zmm29, OWORD PTR [r8+208]
+ vbroadcasti32x4 zmm30, OWORD PTR [r8+224]
L_AES_XTS_encrypt_avx512_key_cached:
cmp eax, 256
mov r11d, eax
@@ -5665,25 +5655,25 @@ AES_XTS_encrypt_update_avx512 PROC
xor r12d, r12d
cmp eax, 32
jl L_AES_XTS_encrypt_update_avx512_done_128
- vbroadcasti32x4 zmm16, [r10]
- vbroadcasti32x4 zmm17, [r10+16]
- vbroadcasti32x4 zmm18, [r10+32]
- vbroadcasti32x4 zmm19, [r10+48]
- vbroadcasti32x4 zmm20, [r10+64]
- vbroadcasti32x4 zmm21, [r10+80]
- vbroadcasti32x4 zmm22, [r10+96]
- vbroadcasti32x4 zmm23, [r10+112]
- vbroadcasti32x4 zmm24, [r10+128]
- vbroadcasti32x4 zmm25, [r10+144]
- vbroadcasti32x4 zmm26, [r10+160]
+ vbroadcasti32x4 zmm16, OWORD PTR [r10]
+ vbroadcasti32x4 zmm17, OWORD PTR [r10+16]
+ vbroadcasti32x4 zmm18, OWORD PTR [r10+32]
+ vbroadcasti32x4 zmm19, OWORD PTR [r10+48]
+ vbroadcasti32x4 zmm20, OWORD PTR [r10+64]
+ vbroadcasti32x4 zmm21, OWORD PTR [r10+80]
+ vbroadcasti32x4 zmm22, OWORD PTR [r10+96]
+ vbroadcasti32x4 zmm23, OWORD PTR [r10+112]
+ vbroadcasti32x4 zmm24, OWORD PTR [r10+128]
+ vbroadcasti32x4 zmm25, OWORD PTR [r10+144]
+ vbroadcasti32x4 zmm26, OWORD PTR [r10+160]
cmp r9d, 11
jl L_AES_XTS_encrypt_update_avx512_key_cached
- vbroadcasti32x4 zmm27, [r10+176]
- vbroadcasti32x4 zmm28, [r10+192]
+ vbroadcasti32x4 zmm27, OWORD PTR [r10+176]
+ vbroadcasti32x4 zmm28, OWORD PTR [r10+192]
cmp r9d, 13
jl L_AES_XTS_encrypt_update_avx512_key_cached
- vbroadcasti32x4 zmm29, [r10+208]
- vbroadcasti32x4 zmm30, [r10+224]
+ vbroadcasti32x4 zmm29, OWORD PTR [r10+208]
+ vbroadcasti32x4 zmm30, OWORD PTR [r10+224]
L_AES_XTS_encrypt_update_avx512_key_cached:
cmp eax, 256
mov r11d, eax
@@ -6196,25 +6186,25 @@ L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last:
L_AES_XTS_decrypt_avx512_mul16_256:
cmp r11d, 32
jl L_AES_XTS_decrypt_avx512_done_128
- vbroadcasti32x4 zmm16, [r8]
- vbroadcasti32x4 zmm17, [r8+16]
- vbroadcasti32x4 zmm18, [r8+32]
- vbroadcasti32x4 zmm19, [r8+48]
- vbroadcasti32x4 zmm20, [r8+64]
- vbroadcasti32x4 zmm21, [r8+80]
- vbroadcasti32x4 zmm22, [r8+96]
- vbroadcasti32x4 zmm23, [r8+112]
- vbroadcasti32x4 zmm24, [r8+128]
- vbroadcasti32x4 zmm25, [r8+144]
- vbroadcasti32x4 zmm26, [r8+160]
+ vbroadcasti32x4 zmm16, OWORD PTR [r8]
+ vbroadcasti32x4 zmm17, OWORD PTR [r8+16]
+ vbroadcasti32x4 zmm18, OWORD PTR [r8+32]
+ vbroadcasti32x4 zmm19, OWORD PTR [r8+48]
+ vbroadcasti32x4 zmm20, OWORD PTR [r8+64]
+ vbroadcasti32x4 zmm21, OWORD PTR [r8+80]
+ vbroadcasti32x4 zmm22, OWORD PTR [r8+96]
+ vbroadcasti32x4 zmm23, OWORD PTR [r8+112]
+ vbroadcasti32x4 zmm24, OWORD PTR [r8+128]
+ vbroadcasti32x4 zmm25, OWORD PTR [r8+144]
+ vbroadcasti32x4 zmm26, OWORD PTR [r8+160]
cmp r10d, 11
jl L_AES_XTS_decrypt_avx512_key_cached
- vbroadcasti32x4 zmm27, [r8+176]
- vbroadcasti32x4 zmm28, [r8+192]
+ vbroadcasti32x4 zmm27, OWORD PTR [r8+176]
+ vbroadcasti32x4 zmm28, OWORD PTR [r8+192]
cmp r10d, 13
jl L_AES_XTS_decrypt_avx512_key_cached
- vbroadcasti32x4 zmm29, [r8+208]
- vbroadcasti32x4 zmm30, [r8+224]
+ vbroadcasti32x4 zmm29, OWORD PTR [r8+208]
+ vbroadcasti32x4 zmm30, OWORD PTR [r8+224]
L_AES_XTS_decrypt_avx512_key_cached:
cmp r11d, 256
jl L_AES_XTS_decrypt_avx512_done_256
@@ -6766,25 +6756,25 @@ AES_XTS_decrypt_update_avx512 PROC
L_AES_XTS_decrypt_update_avx512_mul16_256:
cmp r11d, 32
jl L_AES_XTS_decrypt_update_avx512_done_128
- vbroadcasti32x4 zmm16, [r10]
- vbroadcasti32x4 zmm17, [r10+16]
- vbroadcasti32x4 zmm18, [r10+32]
- vbroadcasti32x4 zmm19, [r10+48]
- vbroadcasti32x4 zmm20, [r10+64]
- vbroadcasti32x4 zmm21, [r10+80]
- vbroadcasti32x4 zmm22, [r10+96]
- vbroadcasti32x4 zmm23, [r10+112]
- vbroadcasti32x4 zmm24, [r10+128]
- vbroadcasti32x4 zmm25, [r10+144]
- vbroadcasti32x4 zmm26, [r10+160]
+ vbroadcasti32x4 zmm16, OWORD PTR [r10]
+ vbroadcasti32x4 zmm17, OWORD PTR [r10+16]
+ vbroadcasti32x4 zmm18, OWORD PTR [r10+32]
+ vbroadcasti32x4 zmm19, OWORD PTR [r10+48]
+ vbroadcasti32x4 zmm20, OWORD PTR [r10+64]
+ vbroadcasti32x4 zmm21, OWORD PTR [r10+80]
+ vbroadcasti32x4 zmm22, OWORD PTR [r10+96]
+ vbroadcasti32x4 zmm23, OWORD PTR [r10+112]
+ vbroadcasti32x4 zmm24, OWORD PTR [r10+128]
+ vbroadcasti32x4 zmm25, OWORD PTR [r10+144]
+ vbroadcasti32x4 zmm26, OWORD PTR [r10+160]
cmp r9d, 11
jl L_AES_XTS_decrypt_update_avx512_key_cached
- vbroadcasti32x4 zmm27, [r10+176]
- vbroadcasti32x4 zmm28, [r10+192]
+ vbroadcasti32x4 zmm27, OWORD PTR [r10+176]
+ vbroadcasti32x4 zmm28, OWORD PTR [r10+192]
cmp r9d, 13
jl L_AES_XTS_decrypt_update_avx512_key_cached
- vbroadcasti32x4 zmm29, [r10+208]
- vbroadcasti32x4 zmm30, [r10+224]
+ vbroadcasti32x4 zmm29, OWORD PTR [r10+208]
+ vbroadcasti32x4 zmm30, OWORD PTR [r10+224]
L_AES_XTS_decrypt_update_avx512_key_cached:
cmp r11d, 256
jl L_AES_XTS_decrypt_update_avx512_done_256
diff --git a/wolfcrypt/src/chacha_asm.asm b/wolfcrypt/src/chacha_asm.asm
index b9444254c90..80afbfdb3b0 100644
--- a/wolfcrypt/src/chacha_asm.asm
+++ b/wolfcrypt/src/chacha_asm.asm
@@ -462,26 +462,22 @@ _TEXT ENDS
IFDEF HAVE_INTEL_AVX1
_DATA SEGMENT
ALIGN 16
-L_chacha20_avx1_rotl8 QWORD \
- 0605040702010003h, 0e0d0c0f0a09080bh
+L_chacha20_avx1_rotl8 QWORD 0605040702010003h, 0e0d0c0f0a09080bh
ptr_L_chacha20_avx1_rotl8 QWORD L_chacha20_avx1_rotl8
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_chacha20_avx1_rotl16 QWORD \
- 0504070601000302h, 0d0c0f0e09080b0ah
+L_chacha20_avx1_rotl16 QWORD 0504070601000302h, 0d0c0f0e09080b0ah
ptr_L_chacha20_avx1_rotl16 QWORD L_chacha20_avx1_rotl16
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_chacha20_avx1_add QWORD \
- 0000000100000000h, 0000000300000002h
+L_chacha20_avx1_add QWORD 0000000100000000h, 0000000300000002h
ptr_L_chacha20_avx1_add QWORD L_chacha20_avx1_add
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_chacha20_avx1_four QWORD \
- 0000000400000004h, 0000000400000004h
+L_chacha20_avx1_four QWORD 0000000400000004h, 0000000400000004h
ptr_L_chacha20_avx1_four QWORD L_chacha20_avx1_four
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -1019,30 +1015,26 @@ ENDIF
IFDEF HAVE_INTEL_AVX2
_DATA SEGMENT
ALIGN 16
-L_chacha20_avx2_rotl8 QWORD \
- 0605040702010003h, 0e0d0c0f0a09080bh,
- 0605040702010003h, 0e0d0c0f0a09080bh
+L_chacha20_avx2_rotl8 QWORD 0605040702010003h, 0e0d0c0f0a09080bh
+ QWORD 0605040702010003h, 0e0d0c0f0a09080bh
ptr_L_chacha20_avx2_rotl8 QWORD L_chacha20_avx2_rotl8
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_chacha20_avx2_rotl16 QWORD \
- 0504070601000302h, 0d0c0f0e09080b0ah,
- 0504070601000302h, 0d0c0f0e09080b0ah
+L_chacha20_avx2_rotl16 QWORD 0504070601000302h, 0d0c0f0e09080b0ah
+ QWORD 0504070601000302h, 0d0c0f0e09080b0ah
ptr_L_chacha20_avx2_rotl16 QWORD L_chacha20_avx2_rotl16
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_chacha20_avx2_add QWORD \
- 0000000100000000h, 0000000300000002h,
- 0000000500000004h, 0000000700000006h
+L_chacha20_avx2_add QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000500000004h, 0000000700000006h
ptr_L_chacha20_avx2_add QWORD L_chacha20_avx2_add
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_chacha20_avx2_eight QWORD \
- 0000000800000008h, 0000000800000008h,
- 0000000800000008h, 0000000800000008h
+L_chacha20_avx2_eight QWORD 0000000800000008h, 0000000800000008h
+ QWORD 0000000800000008h, 0000000800000008h
ptr_L_chacha20_avx2_eight QWORD L_chacha20_avx2_eight
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -1093,29 +1085,29 @@ chacha_encrypt_avx2 PROC
vpbroadcastd ymm14, DWORD PTR [rcx+56]
vpbroadcastd ymm15, DWORD PTR [rcx+60]
vpaddd ymm12, ymm12, YMMWORD PTR [r15]
- vmovdqa YMMWORD PTR [r11], ymm0
- vmovdqa YMMWORD PTR [r11+32], ymm1
- vmovdqa YMMWORD PTR [r11+64], ymm2
- vmovdqa YMMWORD PTR [r11+96], ymm3
- vmovdqa YMMWORD PTR [r11+128], ymm4
- vmovdqa YMMWORD PTR [r11+160], ymm5
- vmovdqa YMMWORD PTR [r11+192], ymm6
- vmovdqa YMMWORD PTR [r11+224], ymm7
- vmovdqa YMMWORD PTR [r11+256], ymm8
- vmovdqa YMMWORD PTR [r11+288], ymm9
- vmovdqa YMMWORD PTR [r11+320], ymm10
- vmovdqa YMMWORD PTR [r11+352], ymm11
- vmovdqa YMMWORD PTR [r11+384], ymm12
- vmovdqa YMMWORD PTR [r11+416], ymm13
- vmovdqa YMMWORD PTR [r11+448], ymm14
- vmovdqa YMMWORD PTR [r11+480], ymm15
+ vmovdqu YMMWORD PTR [r11], ymm0
+ vmovdqu YMMWORD PTR [r11+32], ymm1
+ vmovdqu YMMWORD PTR [r11+64], ymm2
+ vmovdqu YMMWORD PTR [r11+96], ymm3
+ vmovdqu YMMWORD PTR [r11+128], ymm4
+ vmovdqu YMMWORD PTR [r11+160], ymm5
+ vmovdqu YMMWORD PTR [r11+192], ymm6
+ vmovdqu YMMWORD PTR [r11+224], ymm7
+ vmovdqu YMMWORD PTR [r11+256], ymm8
+ vmovdqu YMMWORD PTR [r11+288], ymm9
+ vmovdqu YMMWORD PTR [r11+320], ymm10
+ vmovdqu YMMWORD PTR [r11+352], ymm11
+ vmovdqu YMMWORD PTR [r11+384], ymm12
+ vmovdqu YMMWORD PTR [r11+416], ymm13
+ vmovdqu YMMWORD PTR [r11+448], ymm14
+ vmovdqu YMMWORD PTR [r11+480], ymm15
L_chacha20_avx2_start256:
mov r10b, 10
- vmovdqa YMMWORD PTR [r12+96], ymm11
+ vmovdqu YMMWORD PTR [r12+96], ymm11
L_chacha20_avx2_loop256:
vpaddd ymm0, ymm0, ymm4
vpxor ymm12, ymm12, ymm0
- vmovdqa ymm11, YMMWORD PTR [r12+96]
+ vmovdqu ymm11, YMMWORD PTR [r12+96]
vpshufb ymm12, ymm12, YMMWORD PTR [r14]
vpaddd ymm8, ymm8, ymm12
vpxor ymm4, ymm4, ymm8
@@ -1134,7 +1126,7 @@ L_chacha20_avx2_loop256:
vpshufb ymm15, ymm15, YMMWORD PTR [r14]
vpaddd ymm11, ymm11, ymm15
vpxor ymm7, ymm7, ymm11
- vmovdqa YMMWORD PTR [r12+96], ymm11
+ vmovdqu YMMWORD PTR [r12+96], ymm11
vpsrld ymm11, ymm4, 20
vpslld ymm4, ymm4, 12
vpxor ymm4, ymm4, ymm11
@@ -1149,7 +1141,7 @@ L_chacha20_avx2_loop256:
vpxor ymm7, ymm7, ymm11
vpaddd ymm0, ymm0, ymm4
vpxor ymm12, ymm12, ymm0
- vmovdqa ymm11, YMMWORD PTR [r12+96]
+ vmovdqu ymm11, YMMWORD PTR [r12+96]
vpshufb ymm12, ymm12, YMMWORD PTR [r13]
vpaddd ymm8, ymm8, ymm12
vpxor ymm4, ymm4, ymm8
@@ -1168,7 +1160,7 @@ L_chacha20_avx2_loop256:
vpshufb ymm15, ymm15, YMMWORD PTR [r13]
vpaddd ymm11, ymm11, ymm15
vpxor ymm7, ymm7, ymm11
- vmovdqa YMMWORD PTR [r12+96], ymm11
+ vmovdqu YMMWORD PTR [r12+96], ymm11
vpsrld ymm11, ymm4, 25
vpslld ymm4, ymm4, 7
vpxor ymm4, ymm4, ymm11
@@ -1183,7 +1175,7 @@ L_chacha20_avx2_loop256:
vpxor ymm7, ymm7, ymm11
vpaddd ymm0, ymm0, ymm5
vpxor ymm15, ymm15, ymm0
- vmovdqa ymm11, YMMWORD PTR [r12+96]
+ vmovdqu ymm11, YMMWORD PTR [r12+96]
vpshufb ymm15, ymm15, YMMWORD PTR [r14]
vpaddd ymm10, ymm10, ymm15
vpxor ymm5, ymm5, ymm10
@@ -1202,7 +1194,7 @@ L_chacha20_avx2_loop256:
vpshufb ymm14, ymm14, YMMWORD PTR [r14]
vpaddd ymm9, ymm9, ymm14
vpxor ymm4, ymm4, ymm9
- vmovdqa YMMWORD PTR [r12+96], ymm11
+ vmovdqu YMMWORD PTR [r12+96], ymm11
vpsrld ymm11, ymm5, 20
vpslld ymm5, ymm5, 12
vpxor ymm5, ymm5, ymm11
@@ -1217,7 +1209,7 @@ L_chacha20_avx2_loop256:
vpxor ymm4, ymm4, ymm11
vpaddd ymm0, ymm0, ymm5
vpxor ymm15, ymm15, ymm0
- vmovdqa ymm11, YMMWORD PTR [r12+96]
+ vmovdqu ymm11, YMMWORD PTR [r12+96]
vpshufb ymm15, ymm15, YMMWORD PTR [r13]
vpaddd ymm10, ymm10, ymm15
vpxor ymm5, ymm5, ymm10
@@ -1236,7 +1228,7 @@ L_chacha20_avx2_loop256:
vpshufb ymm14, ymm14, YMMWORD PTR [r13]
vpaddd ymm9, ymm9, ymm14
vpxor ymm4, ymm4, ymm9
- vmovdqa YMMWORD PTR [r12+96], ymm11
+ vmovdqu YMMWORD PTR [r12+96], ymm11
vpsrld ymm11, ymm5, 25
vpslld ymm5, ymm5, 7
vpxor ymm5, ymm5, ymm11
@@ -1251,7 +1243,7 @@ L_chacha20_avx2_loop256:
vpxor ymm4, ymm4, ymm11
dec r10b
jnz L_chacha20_avx2_loop256
- vmovdqa ymm11, YMMWORD PTR [r12+96]
+ vmovdqu ymm11, YMMWORD PTR [r12+96]
vpaddd ymm0, ymm0, YMMWORD PTR [r11]
vpaddd ymm1, ymm1, YMMWORD PTR [r11+32]
vpaddd ymm2, ymm2, YMMWORD PTR [r11+64]
@@ -1268,14 +1260,14 @@ L_chacha20_avx2_loop256:
vpaddd ymm13, ymm13, YMMWORD PTR [r11+416]
vpaddd ymm14, ymm14, YMMWORD PTR [r11+448]
vpaddd ymm15, ymm15, YMMWORD PTR [r11+480]
- vmovdqa YMMWORD PTR [r12], ymm8
- vmovdqa YMMWORD PTR [r12+32], ymm9
- vmovdqa YMMWORD PTR [r12+64], ymm10
- vmovdqa YMMWORD PTR [r12+96], ymm11
- vmovdqa YMMWORD PTR [r12+128], ymm12
- vmovdqa YMMWORD PTR [r12+160], ymm13
- vmovdqa YMMWORD PTR [r12+192], ymm14
- vmovdqa YMMWORD PTR [r12+224], ymm15
+ vmovdqu YMMWORD PTR [r12], ymm8
+ vmovdqu YMMWORD PTR [r12+32], ymm9
+ vmovdqu YMMWORD PTR [r12+64], ymm10
+ vmovdqu YMMWORD PTR [r12+96], ymm11
+ vmovdqu YMMWORD PTR [r12+128], ymm12
+ vmovdqu YMMWORD PTR [r12+160], ymm13
+ vmovdqu YMMWORD PTR [r12+192], ymm14
+ vmovdqu YMMWORD PTR [r12+224], ymm15
vpunpckldq ymm8, ymm0, ymm1
vpunpckldq ymm9, ymm2, ymm3
vpunpckhdq ymm12, ymm0, ymm1
@@ -1324,14 +1316,14 @@ L_chacha20_avx2_loop256:
vmovdqu YMMWORD PTR [r8+320], ymm13
vmovdqu YMMWORD PTR [r8+384], ymm14
vmovdqu YMMWORD PTR [r8+448], ymm15
- vmovdqa ymm0, YMMWORD PTR [r12]
- vmovdqa ymm1, YMMWORD PTR [r12+32]
- vmovdqa ymm2, YMMWORD PTR [r12+64]
- vmovdqa ymm3, YMMWORD PTR [r12+96]
- vmovdqa ymm4, YMMWORD PTR [r12+128]
- vmovdqa ymm5, YMMWORD PTR [r12+160]
- vmovdqa ymm6, YMMWORD PTR [r12+192]
- vmovdqa ymm7, YMMWORD PTR [r12+224]
+ vmovdqu ymm0, YMMWORD PTR [r12]
+ vmovdqu ymm1, YMMWORD PTR [r12+32]
+ vmovdqu ymm2, YMMWORD PTR [r12+64]
+ vmovdqu ymm3, YMMWORD PTR [r12+96]
+ vmovdqu ymm4, YMMWORD PTR [r12+128]
+ vmovdqu ymm5, YMMWORD PTR [r12+160]
+ vmovdqu ymm6, YMMWORD PTR [r12+192]
+ vmovdqu ymm7, YMMWORD PTR [r12+224]
vpunpckldq ymm8, ymm0, ymm1
vpunpckldq ymm9, ymm2, ymm3
vpunpckhdq ymm12, ymm0, ymm1
@@ -1380,30 +1372,30 @@ L_chacha20_avx2_loop256:
vmovdqu YMMWORD PTR [r8+352], ymm13
vmovdqu YMMWORD PTR [r8+416], ymm14
vmovdqu YMMWORD PTR [r8+480], ymm15
- vmovdqa ymm12, YMMWORD PTR [r11+384]
+ vmovdqu ymm12, YMMWORD PTR [r11+384]
add rdx, 512
add r8, 512
vpaddd ymm12, ymm12, YMMWORD PTR [rdi]
sub r9d, 512
- vmovdqa YMMWORD PTR [r11+384], ymm12
+ vmovdqu YMMWORD PTR [r11+384], ymm12
cmp r9d, 512
jl L_chacha20_avx2_done256
- vmovdqa ymm0, YMMWORD PTR [r11]
- vmovdqa ymm1, YMMWORD PTR [r11+32]
- vmovdqa ymm2, YMMWORD PTR [r11+64]
- vmovdqa ymm3, YMMWORD PTR [r11+96]
- vmovdqa ymm4, YMMWORD PTR [r11+128]
- vmovdqa ymm5, YMMWORD PTR [r11+160]
- vmovdqa ymm6, YMMWORD PTR [r11+192]
- vmovdqa ymm7, YMMWORD PTR [r11+224]
- vmovdqa ymm8, YMMWORD PTR [r11+256]
- vmovdqa ymm9, YMMWORD PTR [r11+288]
- vmovdqa ymm10, YMMWORD PTR [r11+320]
- vmovdqa ymm11, YMMWORD PTR [r11+352]
- vmovdqa ymm12, YMMWORD PTR [r11+384]
- vmovdqa ymm13, YMMWORD PTR [r11+416]
- vmovdqa ymm14, YMMWORD PTR [r11+448]
- vmovdqa ymm15, YMMWORD PTR [r11+480]
+ vmovdqu ymm0, YMMWORD PTR [r11]
+ vmovdqu ymm1, YMMWORD PTR [r11+32]
+ vmovdqu ymm2, YMMWORD PTR [r11+64]
+ vmovdqu ymm3, YMMWORD PTR [r11+96]
+ vmovdqu ymm4, YMMWORD PTR [r11+128]
+ vmovdqu ymm5, YMMWORD PTR [r11+160]
+ vmovdqu ymm6, YMMWORD PTR [r11+192]
+ vmovdqu ymm7, YMMWORD PTR [r11+224]
+ vmovdqu ymm8, YMMWORD PTR [r11+256]
+ vmovdqu ymm9, YMMWORD PTR [r11+288]
+ vmovdqu ymm10, YMMWORD PTR [r11+320]
+ vmovdqu ymm11, YMMWORD PTR [r11+352]
+ vmovdqu ymm12, YMMWORD PTR [r11+384]
+ vmovdqu ymm13, YMMWORD PTR [r11+416]
+ vmovdqu ymm14, YMMWORD PTR [r11+448]
+ vmovdqu ymm15, YMMWORD PTR [r11+480]
jmp L_chacha20_avx2_start256
L_chacha20_avx2_done256:
shl eax, 3
diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S
index 7e976fa1f28..d402203fc3c 100644
--- a/wolfcrypt/src/fe_x25519_asm.S
+++ b/wolfcrypt/src/fe_x25519_asm.S
@@ -2341,6 +2341,9 @@ _fe_invert_x64:
movq 128(%rsp), %rdi
addq $0x90, %rsp
repz retq
+#ifndef __APPLE__
+.size fe_invert_x64,.-fe_invert_x64
+#endif /* __APPLE__ */
#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519)
#ifndef __APPLE__
.data
@@ -7339,6 +7342,9 @@ _fe_pow22523_x64:
addq $0x70, %rsp
repz retq
#ifndef __APPLE__
+.size fe_pow22523_x64,.-fe_pow22523_x64
+#endif /* __APPLE__ */
+#ifndef __APPLE__
.text
.globl ge_p1p1_to_p2_x64
.type ge_p1p1_to_p2_x64,@function
@@ -13201,6 +13207,9 @@ _fe_invert_avx2:
movq 128(%rsp), %rdi
addq $0x90, %rsp
repz retq
+#ifndef __APPLE__
+.size fe_invert_avx2,.-fe_invert_avx2
+#endif /* __APPLE__ */
#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519)
#ifndef __APPLE__
.data
@@ -17403,6 +17412,9 @@ _fe_pow22523_avx2:
addq $0x70, %rsp
repz retq
#ifndef __APPLE__
+.size fe_pow22523_avx2,.-fe_pow22523_avx2
+#endif /* __APPLE__ */
+#ifndef __APPLE__
.text
.globl ge_p1p1_to_p2_avx2
.type ge_p1p1_to_p2_avx2,@function
diff --git a/wolfcrypt/src/fe_x25519_asm.asm b/wolfcrypt/src/fe_x25519_asm.asm
new file mode 100644
index 00000000000..fa3e671ba13
--- /dev/null
+++ b/wolfcrypt/src/fe_x25519_asm.asm
@@ -0,0 +1,19760 @@
+; /* fe_x25519_asm.asm */
+; /*
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
+
+IF @Version LT 1200 ; fallbacks for an assembler whose @Version is below 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT
+NO_AVX2_SUPPORT = 1 ; suppresses HAVE_INTEL_AVX2 below
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1
+HAVE_INTEL_AVX1 = 1 ; defined by default unless the build already set it
+ENDIF
+IFNDEF NO_AVX2_SUPPORT
+HAVE_INTEL_AVX2 = 1 ; gates the AVX2 dispatch code assembled inside fe_init
+ENDIF
+
+IFNDEF _WIN64
+_WIN64 = 1 ; defined here when the build did not supply it
+ENDIF
+
+EXTERN cpuid_get_flags:PROC ; defined outside this file; called by fe_init
+_TEXT SEGMENT READONLY PARA
+fe_init PROC ; one-time setup: read the CPU flags and, if the check passes, repoint the *_p dispatch pointers at the *_avx2 routines
+IFDEF HAVE_INTEL_AVX2
+ mov eax, DWORD PTR [cpuFlagsSet] ; nonzero once this routine has completed before
+ test eax, eax
+ je L_fe_init_get_flags
+ ret ; already initialised: nothing to do
+L_fe_init_get_flags:
+ sub rsp, 40 ; NOTE(review): presumably 32 bytes of shadow space plus 8 for stack alignment - confirm
+ call cpuid_get_flags
+ add rsp, 40
+ mov DWORD PTR [intelFlags], eax ; cache the returned flag word
+ and eax, 80 ; keep only the bits in mask 0x50
+ cmp eax, 80 ; both bits must be set (NOTE(review): presumably the BMI2 and ADX flags - confirm against the cpuid flag definitions)
+ jne L_fe_init_flags_done ; otherwise leave every pointer at its *_x64 default
+ lea rax, [fe_cmov_table_avx2]
+ mov QWORD PTR [fe_cmov_table_p], rax
+ lea rax, [fe_mul_avx2]
+ mov QWORD PTR [fe_mul_p], rax
+ lea rax, [fe_sq_avx2]
+ mov QWORD PTR [fe_sq_p], rax
+ lea rax, [fe_mul121666_avx2]
+ mov QWORD PTR [fe_mul121666_p], rax
+ lea rax, [fe_invert_avx2]
+ mov QWORD PTR [fe_invert_p], rax
+ lea rax, [curve25519_avx2]
+ mov QWORD PTR [curve25519_p], rax
+ lea rax, [fe_pow22523_avx2]
+ mov QWORD PTR [fe_pow22523_p], rax
+ lea rax, [ge_p1p1_to_p2_avx2]
+ mov QWORD PTR [ge_p1p1_to_p2_p], rax
+ lea rax, [ge_p1p1_to_p3_avx2]
+ mov QWORD PTR [ge_p1p1_to_p3_p], rax
+ lea rax, [ge_p2_dbl_avx2]
+ mov QWORD PTR [ge_p2_dbl_p], rax
+ lea rax, [ge_madd_avx2]
+ mov QWORD PTR [ge_madd_p], rax
+ lea rax, [ge_msub_avx2]
+ mov QWORD PTR [ge_msub_p], rax
+ lea rax, [ge_add_avx2]
+ mov QWORD PTR [ge_add_p], rax
+ lea rax, [ge_sub_avx2]
+ mov QWORD PTR [ge_sub_p], rax
+IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519 ; this pointer only exists in builds that define the symbol
+ lea rax, [curve25519_base_avx2]
+ mov QWORD PTR [curve25519_base_p], rax
+ENDIF
+IFDEF HAVE_ED25519 ; these four pointers only exist in builds that define the symbol
+ lea rax, [fe_sq2_avx2]
+ mov QWORD PTR [fe_sq2_p], rax
+ lea rax, [fe_invert_nct_avx2]
+ mov QWORD PTR [fe_invert_nct_p], rax
+ lea rax, [sc_reduce_avx2]
+ mov QWORD PTR [sc_reduce_p], rax
+ lea rax, [sc_muladd_avx2]
+ mov QWORD PTR [sc_muladd_p], rax
+ENDIF
+L_fe_init_flags_done:
+ mov DWORD PTR [cpuFlagsSet], 1 ; mark initialisation done so later calls return early
+ENDIF
+ ret ; without HAVE_INTEL_AVX2 the whole routine is just this ret
+fe_init ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_frombytes PROC ; rcx = output (4 qwords), rdx = input (32 bytes): copy the value with bit 255 cleared
+ mov r11, 9223372036854775807 ; 0x7fffffffffffffff: every bit except bit 63
+ mov rax, QWORD PTR [rdx]
+ mov r8, QWORD PTR [rdx+8]
+ mov r9, QWORD PTR [rdx+16]
+ mov r10, QWORD PTR [rdx+24]
+ and r10, r11 ; drop the top bit of the highest limb (bit 255 of the value)
+ mov QWORD PTR [rcx], rax
+ mov QWORD PTR [rcx+8], r8
+ mov QWORD PTR [rcx+16], r9
+ mov QWORD PTR [rcx+24], r10
+ ret
+fe_frombytes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_tobytes PROC ; rcx = output (32 bytes), rdx = input (4 qwords): store the value reduced modulo 2^255-19
+ push r12 ; r12 is used for the mask and restored before returning
+ mov r12, 9223372036854775807 ; 0x7fffffffffffffff: every bit except bit 63
+ mov rax, QWORD PTR [rdx]
+ mov r8, QWORD PTR [rdx+8]
+ mov r9, QWORD PTR [rdx+16]
+ mov r10, QWORD PTR [rdx+24]
+ add rax, 19 ; trial add: value + 19 reaches bit 255 when value >= 2^255-19 (assumes value < 2^255 - TODO confirm with callers)
+ adc r8, 0
+ adc r9, 0
+ adc r10, 0
+ shr r10, 63 ; r10 = bit 255 of (value + 19)
+ imul r11, r10, 19 ; r11 = 19 if that bit was set, else 0
+ mov rax, QWORD PTR [rdx] ; reload the original limbs; the trial sum is discarded
+ mov r8, QWORD PTR [rdx+8]
+ mov r9, QWORD PTR [rdx+16]
+ mov r10, QWORD PTR [rdx+24]
+ add rax, r11 ; add 0 or 19 to the original value
+ adc r8, 0
+ adc r9, 0
+ adc r10, 0
+ and r10, r12 ; clear bit 255, which together with the +19 subtracts 2^255-19
+ mov QWORD PTR [rcx], rax
+ mov QWORD PTR [rcx+8], r8
+ mov QWORD PTR [rcx+16], r9
+ mov QWORD PTR [rcx+24], r10
+ pop r12
+ ret
+fe_tobytes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_1 PROC ; rcx = output (4 qwords): write the value 1
+ ; Set one
+ mov QWORD PTR [rcx], 1 ; lowest limb = 1
+ mov QWORD PTR [rcx+8], 0 ; remaining three limbs = 0
+ mov QWORD PTR [rcx+16], 0
+ mov QWORD PTR [rcx+24], 0
+ ret
+fe_1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_0 PROC ; rcx = output (4 qwords): write the value 0
+ ; Set zero
+ mov QWORD PTR [rcx], 0 ; all four limbs are cleared
+ mov QWORD PTR [rcx+8], 0
+ mov QWORD PTR [rcx+16], 0
+ mov QWORD PTR [rcx+24], 0
+ ret
+fe_0 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_copy PROC ; rcx = destination, rdx = source: copy 4 qwords unchanged
+ ; Copy
+ mov rax, QWORD PTR [rdx] ; all four limbs are read before any is written
+ mov r8, QWORD PTR [rdx+8]
+ mov r9, QWORD PTR [rdx+16]
+ mov r10, QWORD PTR [rdx+24]
+ mov QWORD PTR [rcx], rax
+ mov QWORD PTR [rcx+8], r8
+ mov QWORD PTR [rcx+16], r9
+ mov QWORD PTR [rcx+24], r10
+ ret
+fe_copy ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_sub PROC ; rcx = output, rdx = a, r8 = b (4 qwords each): store a - b, corrected by a multiple of 19 and limited to 255 bits
+ push r12 ; r12 holds the correction term and is restored before returning
+ ; Sub
+ mov rax, QWORD PTR [rdx]
+ mov r9, QWORD PTR [rdx+8]
+ mov r10, QWORD PTR [rdx+16]
+ mov r11, QWORD PTR [rdx+24]
+ sub rax, QWORD PTR [r8] ; 256-bit subtraction with the borrow carried through the limbs
+ sbb r9, QWORD PTR [r8+8]
+ sbb r10, QWORD PTR [r8+16]
+ sbb r11, QWORD PTR [r8+24]
+ sbb r12, r12 ; r12 = -1 if the subtraction borrowed, else 0
+ shld r12, r11, 1 ; r12 = (r12 << 1) | bit 63 of the top limb
+ imul r12, -19 ; scale that small signed count by -19
+ btr r11, 63 ; clear bit 255 of the difference
+ ; Add modulus (if underflow)
+ sub rax, r12 ; apply the correction to the low limb and propagate the borrow
+ sbb r9, 0
+ sbb r10, 0
+ sbb r11, 0
+ mov QWORD PTR [rcx], rax
+ mov QWORD PTR [rcx+8], r9
+ mov QWORD PTR [rcx+16], r10
+ mov QWORD PTR [rcx+24], r11
+ pop r12
+ ret
+fe_sub ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_add PROC ; rcx = output, rdx = a, r8 = b (4 qwords each): store a + b, corrected by a multiple of 19 and limited to 255 bits
+ push r12 ; r12 holds the correction term and is restored before returning
+ ; Add
+ mov rax, QWORD PTR [rdx]
+ mov r9, QWORD PTR [rdx+8]
+ add rax, QWORD PTR [r8] ; 256-bit addition; the interleaved mov loads do not alter the carry flag
+ mov r10, QWORD PTR [rdx+16]
+ adc r9, QWORD PTR [r8+8]
+ mov r11, QWORD PTR [rdx+24]
+ adc r10, QWORD PTR [r8+16]
+ adc r11, QWORD PTR [r8+24]
+ mov r12, 0 ; mov (not xor) so the carry flag from the adc above survives
+ adc r12, 0 ; r12 = carry out of the 256-bit addition
+ shld r12, r11, 1 ; r12 = (carry << 1) | bit 63 of the top limb
+ imul r12, 19 ; each unit of 2^255 is replaced by 19
+ btr r11, 63 ; clear bit 255 of the sum
+ ; Sub modulus (if overflow)
+ add rax, r12 ; apply the correction to the low limb and propagate the carry
+ adc r9, 0
+ adc r10, 0
+ adc r11, 0
+ mov QWORD PTR [rcx], rax
+ mov QWORD PTR [rcx+8], r9
+ mov QWORD PTR [rcx+16], r10
+ mov QWORD PTR [rcx+24], r11
+ pop r12
+ ret
+fe_add ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_neg PROC ; rcx = output, rdx = a (4 qwords each): store (2^255-19) - a
+ mov rax, -19 ; rax, r8, r9, r10 = the four limbs of 2^255-19, lowest first
+ mov r8, -1
+ mov r9, -1
+ mov r10, 9223372036854775807 ; 0x7fffffffffffffff: top limb of 2^255-19
+ sub rax, QWORD PTR [rdx] ; subtract a limb by limb with the borrow carried through
+ sbb r8, QWORD PTR [rdx+8]
+ sbb r9, QWORD PTR [rdx+16]
+ sbb r10, QWORD PTR [rdx+24]
+ mov QWORD PTR [rcx], rax
+ mov QWORD PTR [rcx+8], r8
+ mov QWORD PTR [rcx+16], r9
+ mov QWORD PTR [rcx+24], r10
+ ret
+fe_neg ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_cmov PROC ; rcx = f (in/out), rdx = g, r8d = selector: f becomes g when the selector equals 1, otherwise f is rewritten unchanged
+ push r12 ; r12 carries the fourth limb and is restored before returning
+ cmp r8d, 1 ; sets ZF for the cmove group; the mov loads in between leave the flags alone
+ mov r9, QWORD PTR [rcx]
+ mov r10, QWORD PTR [rcx+8]
+ mov r11, QWORD PTR [rcx+16]
+ mov r12, QWORD PTR [rcx+24]
+ cmove r9, QWORD PTR [rdx] ; replace each limb with g's limb only when the selector was 1; no branch is taken
+ cmove r10, QWORD PTR [rdx+8]
+ cmove r11, QWORD PTR [rdx+16]
+ cmove r12, QWORD PTR [rdx+24]
+ mov QWORD PTR [rcx], r9 ; f is stored back in both cases
+ mov QWORD PTR [rcx+8], r10
+ mov QWORD PTR [rcx+16], r11
+ mov QWORD PTR [rcx+24], r12
+ pop r12
+ ret
+fe_cmov ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_isnonzero PROC ; rcx = a (4 qwords): rax is returned as zero exactly when a, reduced modulo 2^255-19, has all limbs zero
+ mov r11, 9223372036854775807 ; 0x7fffffffffffffff: every bit except bit 63
+ mov rax, QWORD PTR [rcx]
+ mov rdx, QWORD PTR [rcx+8]
+ mov r8, QWORD PTR [rcx+16]
+ mov r9, QWORD PTR [rcx+24]
+ add rax, 19 ; trial add: value + 19 reaches bit 255 when value >= 2^255-19 (assumes value < 2^255 - TODO confirm with callers)
+ adc rdx, 0
+ adc r8, 0
+ adc r9, 0
+ shr r9, 63 ; r9 = bit 255 of (value + 19)
+ imul r10, r9, 19 ; r10 = 19 if that bit was set, else 0
+ mov rax, QWORD PTR [rcx] ; reload the original limbs; the trial sum is discarded
+ mov rdx, QWORD PTR [rcx+8]
+ mov r8, QWORD PTR [rcx+16]
+ mov r9, QWORD PTR [rcx+24]
+ add rax, r10 ; add 0 or 19 to the original value
+ adc rdx, 0
+ adc r8, 0
+ adc r9, 0
+ and r9, r11 ; clear bit 255, completing the reduction
+ or rax, rdx ; fold all four reduced limbs into rax
+ or rax, r8
+ or rax, r9
+ ret ; the return value is the OR of the limbs, not normalised to 0/1
+fe_isnonzero ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_isnegative PROC ; rcx = a (4 qwords): return in rax the least significant bit of a reduced modulo 2^255-19
+ push r12 ; NOTE(review): r12 is saved and loaded below but never read in this routine
+ mov r12, 9223372036854775807
+ mov rdx, QWORD PTR [rcx]
+ mov r8, QWORD PTR [rcx+8]
+ mov r9, QWORD PTR [rcx+16]
+ mov r10, QWORD PTR [rcx+24]
+ mov rax, rdx ; keep an untouched copy of the low limb
+ add rdx, 19 ; trial add: value + 19 reaches bit 255 when value >= 2^255-19 (assumes value < 2^255 - TODO confirm with callers)
+ adc r8, 0
+ adc r9, 0
+ adc r10, 0
+ shr r10, 63 ; r10 = bit 255 of (value + 19)
+ imul r11, r10, 19 ; r11 = 19 if that bit was set, else 0
+ add rax, r11 ; low limb of the reduced value (higher limbs cannot change bit 0)
+ and rax, 1 ; result is 0 or 1
+ pop r12
+ ret
+fe_isnegative ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_cmov_table PROC ; entry point that forwards to the implementation selected in fe_cmov_table_p
+ jmp QWORD PTR [fe_cmov_table_p] ; tail jump, arguments untouched; pointer starts as fe_cmov_table_x64 and fe_init may repoint it to fe_cmov_table_avx2
+fe_cmov_table ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_mul PROC ; entry point that forwards to the implementation selected in fe_mul_p
+ jmp QWORD PTR [fe_mul_p] ; tail jump, arguments untouched; pointer starts as fe_mul_x64 and fe_init may repoint it to fe_mul_avx2
+fe_mul ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_sq PROC ; entry point that forwards to the implementation selected in fe_sq_p
+ jmp QWORD PTR [fe_sq_p] ; tail jump, arguments untouched; pointer starts as fe_sq_x64 and fe_init may repoint it to fe_sq_avx2
+fe_sq ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_mul121666 PROC ; entry point that forwards to the implementation selected in fe_mul121666_p
+ jmp QWORD PTR [fe_mul121666_p] ; tail jump, arguments untouched; pointer starts as fe_mul121666_x64 and fe_init may repoint it to fe_mul121666_avx2
+fe_mul121666 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_invert PROC ; entry point that forwards to the implementation selected in fe_invert_p
+ jmp QWORD PTR [fe_invert_p] ; tail jump, arguments untouched; pointer starts as fe_invert_x64 and fe_init may repoint it to fe_invert_avx2
+fe_invert ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+curve25519 PROC ; entry point that forwards to the implementation selected in curve25519_p
+ jmp QWORD PTR [curve25519_p] ; tail jump, arguments untouched; pointer starts as curve25519_x64 and fe_init may repoint it to curve25519_avx2
+curve25519 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_pow22523 PROC ; entry point that forwards to the implementation selected in fe_pow22523_p
+ jmp QWORD PTR [fe_pow22523_p] ; tail jump, arguments untouched; pointer starts as fe_pow22523_x64 and fe_init may repoint it to fe_pow22523_avx2
+fe_pow22523 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_p1p1_to_p2 PROC ; entry point that forwards to the implementation selected in ge_p1p1_to_p2_p
+ jmp QWORD PTR [ge_p1p1_to_p2_p] ; tail jump, arguments untouched; pointer starts as ge_p1p1_to_p2_x64 and fe_init may repoint it to ge_p1p1_to_p2_avx2
+ge_p1p1_to_p2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_p1p1_to_p3 PROC ; entry point that forwards to the implementation selected in ge_p1p1_to_p3_p
+ jmp QWORD PTR [ge_p1p1_to_p3_p] ; tail jump, arguments untouched; pointer starts as ge_p1p1_to_p3_x64 and fe_init may repoint it to ge_p1p1_to_p3_avx2
+ge_p1p1_to_p3 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_p2_dbl PROC ; entry point that forwards to the implementation selected in ge_p2_dbl_p
+ jmp QWORD PTR [ge_p2_dbl_p] ; tail jump, arguments untouched; pointer starts as ge_p2_dbl_x64 and fe_init may repoint it to ge_p2_dbl_avx2
+ge_p2_dbl ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_madd PROC ; entry point that forwards to the implementation selected in ge_madd_p
+ jmp QWORD PTR [ge_madd_p] ; tail jump, arguments untouched; pointer starts as ge_madd_x64 and fe_init may repoint it to ge_madd_avx2
+ge_madd ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_msub PROC ; entry point that forwards to the implementation selected in ge_msub_p
+ jmp QWORD PTR [ge_msub_p] ; tail jump, arguments untouched; pointer starts as ge_msub_x64 and fe_init may repoint it to ge_msub_avx2
+ge_msub ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_add PROC ; entry point that forwards to the implementation selected in ge_add_p
+ jmp QWORD PTR [ge_add_p] ; tail jump, arguments untouched; pointer starts as ge_add_x64 and fe_init may repoint it to ge_add_avx2
+ge_add ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_sub PROC ; entry point that forwards to the implementation selected in ge_sub_p
+ jmp QWORD PTR [ge_sub_p] ; tail jump, arguments untouched; pointer starts as ge_sub_x64 and fe_init may repoint it to ge_sub_avx2
+ge_sub ENDP
+_TEXT ENDS
+IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519 ; outer guard; the next line tests the same symbol again, so the nesting is redundant but balanced
+IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519
+_TEXT SEGMENT READONLY PARA
+curve25519_base PROC ; entry point that forwards to the implementation selected in curve25519_base_p
+ jmp QWORD PTR [curve25519_base_p] ; tail jump, arguments untouched; pointer starts as curve25519_base_x64 and fe_init may repoint it to curve25519_base_avx2
+curve25519_base ENDP
+_TEXT ENDS
+ENDIF
+ENDIF
+IFDEF HAVE_ED25519 ; outer guard for the four stubs below; each stub repeats the same test, so the nesting is redundant but balanced
+IFDEF HAVE_ED25519
+_TEXT SEGMENT READONLY PARA
+fe_sq2 PROC ; entry point that forwards to the implementation selected in fe_sq2_p
+ jmp QWORD PTR [fe_sq2_p] ; tail jump, arguments untouched; pointer starts as fe_sq2_x64 and fe_init may repoint it to fe_sq2_avx2
+fe_sq2 ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_ED25519
+_TEXT SEGMENT READONLY PARA
+fe_invert_nct PROC ; entry point that forwards to the implementation selected in fe_invert_nct_p
+ jmp QWORD PTR [fe_invert_nct_p] ; tail jump, arguments untouched; pointer starts as fe_invert_nct_x64 and fe_init may repoint it to fe_invert_nct_avx2
+fe_invert_nct ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_ED25519
+_TEXT SEGMENT READONLY PARA
+sc_reduce PROC ; entry point that forwards to the implementation selected in sc_reduce_p
+ jmp QWORD PTR [sc_reduce_p] ; tail jump, arguments untouched; pointer starts as sc_reduce_x64 and fe_init may repoint it to sc_reduce_avx2
+sc_reduce ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_ED25519
+_TEXT SEGMENT READONLY PARA
+sc_muladd PROC ; entry point that forwards to the implementation selected in sc_muladd_p
+ jmp QWORD PTR [sc_muladd_p] ; tail jump, arguments untouched; pointer starts as sc_muladd_x64 and fe_init may repoint it to sc_muladd_avx2
+sc_muladd ENDP
+_TEXT ENDS
+ENDIF
+ENDIF
+_DATA SEGMENT
+cpuFlagsSet dd 0 ; 0 until fe_init has run its flag check; fe_init then stores 1
+_DATA ENDS
+_DATA SEGMENT
+intelFlags dd 0 ; value returned by cpuid_get_flags, stored by fe_init
+_DATA ENDS
+_DATA SEGMENT
+fe_cmov_table_p dq fe_cmov_table_x64 ; each *_p below is the jump target of the same-named stub and defaults to the *_x64 routine
+_DATA ENDS
+_DATA SEGMENT
+fe_mul_p dq fe_mul_x64
+_DATA ENDS
+_DATA SEGMENT
+fe_sq_p dq fe_sq_x64
+_DATA ENDS
+_DATA SEGMENT
+fe_mul121666_p dq fe_mul121666_x64
+_DATA ENDS
+_DATA SEGMENT
+fe_invert_p dq fe_invert_x64
+_DATA ENDS
+_DATA SEGMENT
+curve25519_p dq curve25519_x64
+_DATA ENDS
+_DATA SEGMENT
+fe_pow22523_p dq fe_pow22523_x64
+_DATA ENDS
+_DATA SEGMENT
+ge_p1p1_to_p2_p dq ge_p1p1_to_p2_x64
+_DATA ENDS
+_DATA SEGMENT
+ge_p1p1_to_p3_p dq ge_p1p1_to_p3_x64
+_DATA ENDS
+_DATA SEGMENT
+ge_p2_dbl_p dq ge_p2_dbl_x64
+_DATA ENDS
+_DATA SEGMENT
+ge_madd_p dq ge_madd_x64
+_DATA ENDS
+_DATA SEGMENT
+ge_msub_p dq ge_msub_x64
+_DATA ENDS
+_DATA SEGMENT
+ge_add_p dq ge_add_x64
+_DATA ENDS
+_DATA SEGMENT
+ge_sub_p dq ge_sub_x64
+_DATA ENDS
+IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519 ; pointer exists only alongside the curve25519_base stub
+_DATA SEGMENT
+curve25519_base_p dq curve25519_base_x64
+_DATA ENDS
+ENDIF
+IFDEF HAVE_ED25519 ; pointers exist only alongside the fe_sq2, fe_invert_nct, sc_reduce and sc_muladd stubs
+_DATA SEGMENT
+fe_sq2_p dq fe_sq2_x64
+_DATA ENDS
+_DATA SEGMENT
+fe_invert_nct_p dq fe_invert_nct_x64
+_DATA ENDS
+_DATA SEGMENT
+sc_reduce_p dq sc_reduce_x64
+_DATA ENDS
+_DATA SEGMENT
+sc_muladd_p dq sc_muladd_x64
+_DATA ENDS
+ENDIF
+_TEXT SEGMENT READONLY PARA
+fe_cmov_table_x64 PROC ; rcx = 96-byte output, rdx = table of 96-byte entries, r8b = signed index: branch-free select of entry |index| (1..8), adjusted for a negative index
+ push r12 ; r12-r15, rdi and rsi are all used below and restored before returning
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ mov r9, rdx ; table base moves to r9 because rdx is reused as a limb register
+ movsx rax, r8b ; sign-extend the 8-bit index
+ cdq ; edx = all ones if the index is negative, else 0
+ xor al, dl
+ sub al, dl ; al = |index|
+ mov sil, al ; sil keeps |index| for every compare below
+ mov rax, 1 ; default first element = 1 (limbs rax, rdx, r10, r11); kept when |index| matches no entry
+ xor rdx, rdx
+ xor r10, r10
+ xor r11, r11
+ mov r12, 1 ; default second element = 1 (limbs r12, r13, r14, r15)
+ xor r13, r13
+ xor r14, r14
+ xor r15, r15
+ cmp sil, 1 ; entry 1, bytes 0..63; every load is unconditional, only the cmove depends on the index
+ mov rdi, QWORD PTR [r9]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+8]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+16]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+24]
+ cmove r11, rdi
+ mov rdi, QWORD PTR [r9+32]
+ cmove r12, rdi
+ mov rdi, QWORD PTR [r9+40]
+ cmove r13, rdi
+ mov rdi, QWORD PTR [r9+48]
+ cmove r14, rdi
+ mov rdi, QWORD PTR [r9+56]
+ cmove r15, rdi
+ cmp sil, 2 ; entry 2, bytes 96..159
+ mov rdi, QWORD PTR [r9+96]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+104]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+112]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+120]
+ cmove r11, rdi
+ mov rdi, QWORD PTR [r9+128]
+ cmove r12, rdi
+ mov rdi, QWORD PTR [r9+136]
+ cmove r13, rdi
+ mov rdi, QWORD PTR [r9+144]
+ cmove r14, rdi
+ mov rdi, QWORD PTR [r9+152]
+ cmove r15, rdi
+ cmp sil, 3 ; entry 3, bytes 192..255
+ mov rdi, QWORD PTR [r9+192]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+200]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+208]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+216]
+ cmove r11, rdi
+ mov rdi, QWORD PTR [r9+224]
+ cmove r12, rdi
+ mov rdi, QWORD PTR [r9+232]
+ cmove r13, rdi
+ mov rdi, QWORD PTR [r9+240]
+ cmove r14, rdi
+ mov rdi, QWORD PTR [r9+248]
+ cmove r15, rdi
+ cmp sil, 4 ; entry 4, bytes 288..351
+ mov rdi, QWORD PTR [r9+288]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+296]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+304]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+312]
+ cmove r11, rdi
+ mov rdi, QWORD PTR [r9+320]
+ cmove r12, rdi
+ mov rdi, QWORD PTR [r9+328]
+ cmove r13, rdi
+ mov rdi, QWORD PTR [r9+336]
+ cmove r14, rdi
+ mov rdi, QWORD PTR [r9+344]
+ cmove r15, rdi
+ cmp sil, 5 ; entry 5, bytes 384..447
+ mov rdi, QWORD PTR [r9+384]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+392]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+400]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+408]
+ cmove r11, rdi
+ mov rdi, QWORD PTR [r9+416]
+ cmove r12, rdi
+ mov rdi, QWORD PTR [r9+424]
+ cmove r13, rdi
+ mov rdi, QWORD PTR [r9+432]
+ cmove r14, rdi
+ mov rdi, QWORD PTR [r9+440]
+ cmove r15, rdi
+ cmp sil, 6 ; entry 6, bytes 480..543
+ mov rdi, QWORD PTR [r9+480]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+488]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+496]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+504]
+ cmove r11, rdi
+ mov rdi, QWORD PTR [r9+512]
+ cmove r12, rdi
+ mov rdi, QWORD PTR [r9+520]
+ cmove r13, rdi
+ mov rdi, QWORD PTR [r9+528]
+ cmove r14, rdi
+ mov rdi, QWORD PTR [r9+536]
+ cmove r15, rdi
+ cmp sil, 7 ; entry 7, bytes 576..639
+ mov rdi, QWORD PTR [r9+576]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+584]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+592]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+600]
+ cmove r11, rdi
+ mov rdi, QWORD PTR [r9+608]
+ cmove r12, rdi
+ mov rdi, QWORD PTR [r9+616]
+ cmove r13, rdi
+ mov rdi, QWORD PTR [r9+624]
+ cmove r14, rdi
+ mov rdi, QWORD PTR [r9+632]
+ cmove r15, rdi
+ cmp sil, 8 ; entry 8, bytes 672..735
+ mov rdi, QWORD PTR [r9+672]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+680]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+688]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+696]
+ cmove r11, rdi
+ mov rdi, QWORD PTR [r9+704]
+ cmove r12, rdi
+ mov rdi, QWORD PTR [r9+712]
+ cmove r13, rdi
+ mov rdi, QWORD PTR [r9+720]
+ cmove r14, rdi
+ mov rdi, QWORD PTR [r9+728]
+ cmove r15, rdi
+ cmp r8b, 0 ; negative index: swap the first and second elements, limb by limb through rdi
+ mov rdi, rax
+ cmovl rax, r12
+ cmovl r12, rdi
+ mov rdi, rdx
+ cmovl rdx, r13
+ cmovl r13, rdi
+ mov rdi, r10
+ cmovl r10, r14
+ cmovl r14, rdi
+ mov rdi, r11
+ cmovl r11, r15
+ cmovl r15, rdi
+ mov QWORD PTR [rcx], rax ; output bytes 0..31 = first element
+ mov QWORD PTR [rcx+8], rdx
+ mov QWORD PTR [rcx+16], r10
+ mov QWORD PTR [rcx+24], r11
+ mov QWORD PTR [rcx+32], r12 ; output bytes 32..63 = second element
+ mov QWORD PTR [rcx+40], r13
+ mov QWORD PTR [rcx+48], r14
+ mov QWORD PTR [rcx+56], r15
+ xor rax, rax ; default third element = 0 (limbs rax, rdx, r10, r11)
+ xor rdx, rdx
+ xor r10, r10
+ xor r11, r11
+ cmp sil, 1 ; third element of entry 1, bytes 64..95
+ mov rdi, QWORD PTR [r9+64]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+72]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+80]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+88]
+ cmove r11, rdi
+ cmp sil, 2 ; third element of entry 2, bytes 160..191
+ mov rdi, QWORD PTR [r9+160]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+168]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+176]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+184]
+ cmove r11, rdi
+ cmp sil, 3 ; third element of entry 3, bytes 256..287
+ mov rdi, QWORD PTR [r9+256]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+264]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+272]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+280]
+ cmove r11, rdi
+ cmp sil, 4 ; third element of entry 4, bytes 352..383
+ mov rdi, QWORD PTR [r9+352]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+360]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+368]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+376]
+ cmove r11, rdi
+ cmp sil, 5 ; third element of entry 5, bytes 448..479
+ mov rdi, QWORD PTR [r9+448]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+456]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+464]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+472]
+ cmove r11, rdi
+ cmp sil, 6 ; third element of entry 6, bytes 544..575
+ mov rdi, QWORD PTR [r9+544]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+552]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+560]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+568]
+ cmove r11, rdi
+ cmp sil, 7 ; third element of entry 7, bytes 640..671
+ mov rdi, QWORD PTR [r9+640]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+648]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+656]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+664]
+ cmove r11, rdi
+ cmp sil, 8 ; third element of entry 8, bytes 736..767
+ mov rdi, QWORD PTR [r9+736]
+ cmove rax, rdi
+ mov rdi, QWORD PTR [r9+744]
+ cmove rdx, rdi
+ mov rdi, QWORD PTR [r9+752]
+ cmove r10, rdi
+ mov rdi, QWORD PTR [r9+760]
+ cmove r11, rdi
+ mov r12, -19 ; r12, r13, r14, r15 = the four limbs of 2^255-19, lowest first
+ mov r13, -1
+ mov r14, -1
+ mov r15, 9223372036854775807
+ sub r12, rax ; r12..r15 = (2^255-19) - third element
+ sbb r13, rdx
+ sbb r14, r10
+ sbb r15, r11
+ cmp r8b, 0 ; negative index: replace the third element with that difference
+ cmovl rax, r12
+ cmovl rdx, r13
+ cmovl r10, r14
+ cmovl r11, r15
+ mov QWORD PTR [rcx+64], rax ; output bytes 64..95 = third element
+ mov QWORD PTR [rcx+72], rdx
+ mov QWORD PTR [rcx+80], r10
+ mov QWORD PTR [rcx+88], r11
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_cmov_table_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_mul_x64 PROC ; [rcx] = [rdx] * [r8] on 4 x 64-bit limbs, high half folded back modulo 2^255 - 19
+ push r12 ; r12-r15, rdi, rsi and rbx are used as scratch below and popped again before ret
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov r9, rdx ; mul overwrites rdx:rax, so keep the first operand pointer in r9 (second stays in r8)
+ ; Multiply: schoolbook 4x4 product; limbs 0..7 accumulate in r10, r11, r12, r13, r14, r15, rdi, rsi
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r9]
+ mov r10, rax ; limb 0 is final after this
+ mov r11, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r9]
+ xor r12, r12 ; open the limb 2 accumulator
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r9+8]
+ xor r13, r13 ; open the limb 3 accumulator
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r9]
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r9+8]
+ xor r14, r14 ; open the limb 4 accumulator
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r9+16]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [r9]
+ xor r15, r15 ; open the limb 5 accumulator
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r9+8]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r9+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r9+24]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [r9+8]
+ xor rdi, rdi ; open the limb 6 accumulator
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r9+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r9+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [r9+16]
+ xor rsi, rsi ; open the limb 7 accumulator
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r9+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [r9+24]
+ add rdi, rax
+ adc rsi, rdx
+ mov rax, 38 ; reduction starts here: 2^256 = 38 (mod 2^255 - 19), so each high limb is scaled by 38
+ mul rsi ; 38 * limb 7, destined for limb 3
+ add r13, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807 ; 2^63 - 1: mask that clears bit 255 of the value (bit 63 of limb 3)
+ shld rdx, r13, 1 ; rdx = everything at or above bit 255
+ imul rdx, rdx, 19 ; 2^255 = 19 (mod 2^255 - 19)
+ and r13, rbx
+ mov rbx, rdx ; hold the folded overflow; it is added into limb 0 below
+ mov rax, 38
+ mul r14 ; 38 * limb 4 -> limb 0
+ xor r14, r14 ; r14 is reused as the carry word passed to limb 1
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15 ; 38 * limb 5 -> limb 1
+ xor r15, r15 ; r15 is reused as the carry word passed to limb 2
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi ; 38 * limb 6 -> limb 2
+ xor rdi, rdi ; rdi is reused as the carry word passed to limb 3
+ add r12, rax
+ adc rdi, rdx
+ add r10, rbx ; add the folded overflow and ripple the three carry words upward
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ mov rbx, 9223372036854775807
+ mov rax, r13
+ sar rax, 63 ; all ones if bit 255 got set again, else zero
+ and rax, 19 ; 19 or 0
+ and r13, rbx
+ add r10, rax ; fold bit 255 once more; no comparison against the modulus is made here
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ ; Store: inputs are only read above, so the result buffer may alias either operand
+ mov QWORD PTR [rcx], r10
+ mov QWORD PTR [rcx+8], r11
+ mov QWORD PTR [rcx+16], r12
+ mov QWORD PTR [rcx+24], r13
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_mul_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_sq_x64 PROC ; [rcx] = [rdx] squared on 4 x 64-bit limbs, high half folded back modulo 2^255 - 19
+ push r12 ; r12-r15, rdi and rsi are used as scratch below and popped again before ret
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ mov r8, rdx ; mul overwrites rdx:rax, so keep the operand pointer in r8
+ ; Square: the six cross products are summed once, doubled, then the four diagonal squares are added
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+8]
+ mov r10, rax ; cross products occupy limbs 1..6 in r10, r11, r12, r13, r14, r15
+ mov r11, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+16]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+24]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+16]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+24]
+ add r13, rax
+ adc r14, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r8+24]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; Double: each cross product appears twice in the square; rdi (limb 7) catches the bit shifted out
+ xor rdi, rdi
+ add r10, r10
+ adc r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r8]
+ mul rax
+ mov r9, rax ; limb 0
+ mov rsi, rdx ; rsi carries the high half of each diagonal square into the next step
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r8+8]
+ mul rax
+ add r10, rsi
+ adc r11, rax
+ adc rdx, 0
+ mov rsi, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r8+16]
+ mul rax
+ add r12, rsi
+ adc r13, rax
+ adc rdx, 0
+ mov rsi, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r8+24]
+ mul rax
+ add r15, rax
+ adc rdi, rdx
+ add r14, rsi ; pending high half of A[2]^2 goes into limb 5, then ripples up
+ adc r15, 0
+ adc rdi, 0
+ mov rax, 38 ; reduction starts here: 2^256 = 38 (mod 2^255 - 19), so each high limb is scaled by 38
+ mul rdi ; 38 * limb 7, destined for limb 3
+ add r12, rax
+ adc rdx, 0
+ mov rsi, 9223372036854775807 ; 2^63 - 1: mask that clears bit 255 of the value (bit 63 of limb 3)
+ shld rdx, r12, 1 ; rdx = everything at or above bit 255
+ imul rdx, rdx, 19 ; 2^255 = 19 (mod 2^255 - 19)
+ and r12, rsi
+ mov rsi, rdx ; hold the folded overflow; it is added into limb 0 below
+ mov rax, 38
+ mul r13 ; 38 * limb 4 -> limb 0
+ xor r13, r13 ; r13 is reused as the carry word passed to limb 1
+ add r9, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14 ; 38 * limb 5 -> limb 1
+ xor r14, r14 ; r14 is reused as the carry word passed to limb 2
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15 ; 38 * limb 6 -> limb 2
+ xor r15, r15 ; r15 is reused as the carry word passed to limb 3
+ add r11, rax
+ adc r15, rdx
+ add r9, rsi ; add the folded overflow and ripple the three carry words upward
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ mov rsi, 9223372036854775807
+ mov rax, r12
+ sar rax, 63 ; all ones if bit 255 got set again, else zero
+ and rax, 19 ; 19 or 0
+ and r12, rsi
+ add r9, rax ; fold bit 255 once more; no comparison against the modulus is made here
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ ; Store: the input is only read above, so the result buffer may alias it
+ mov QWORD PTR [rcx], r9
+ mov QWORD PTR [rcx+8], r10
+ mov QWORD PTR [rcx+16], r11
+ mov QWORD PTR [rcx+24], r12
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_sq_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_sq_n_x64 PROC ; runs the squaring body r8b times: each pass reads [rdx] and writes its square (mod 2^255 - 19) to [rcx]
+ push r12 ; r12-r15, rdi, rsi and rbx are used as scratch below and popped again before ret
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov r9, rdx ; mul overwrites rdx:rax, so keep the operand pointer in r9
+L_fe_sq_n_x64: ; every pass re-reads [r9], so passes only chain into repeated squaring when rcx == rdx (as fe_invert_x64 calls it)
+ ; Square: the six cross products are summed once, doubled, then the four diagonal squares are added
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+8]
+ mov r11, rax ; cross products occupy limbs 1..6 in r11, r12, r13, r14, r15, rdi
+ mov r12, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+16]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+24]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r9+16]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r9+24]
+ add r14, rax
+ adc r15, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r9+24]
+ xor rdi, rdi
+ add r15, rax
+ adc rdi, rdx
+ ; Double: each cross product appears twice in the square; rsi (limb 7) catches the bit shifted out
+ xor rsi, rsi
+ add r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r9]
+ mul rax
+ mov r10, rax ; limb 0
+ mov rbx, rdx ; rbx carries the high half of each diagonal square into the next step
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r9+8]
+ mul rax
+ add r11, rbx
+ adc r12, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r9+16]
+ mul rax
+ add r13, rbx
+ adc r14, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r9+24]
+ mul rax
+ add rdi, rax
+ adc rsi, rdx
+ add r15, rbx ; pending high half of A[2]^2 goes into limb 5, then ripples up
+ adc rdi, 0
+ adc rsi, 0
+ mov rax, 38 ; reduction starts here: 2^256 = 38 (mod 2^255 - 19), so each high limb is scaled by 38
+ mul rsi ; 38 * limb 7, destined for limb 3
+ add r13, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807 ; 2^63 - 1: mask that clears bit 255 of the value (bit 63 of limb 3)
+ shld rdx, r13, 1 ; rdx = everything at or above bit 255
+ imul rdx, rdx, 19 ; 2^255 = 19 (mod 2^255 - 19)
+ and r13, rbx
+ mov rbx, rdx ; hold the folded overflow; it is added into limb 0 below
+ mov rax, 38
+ mul r14 ; 38 * limb 4 -> limb 0
+ xor r14, r14 ; r14 is reused as the carry word passed to limb 1
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15 ; 38 * limb 5 -> limb 1
+ xor r15, r15 ; r15 is reused as the carry word passed to limb 2
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi ; 38 * limb 6 -> limb 2
+ xor rdi, rdi ; rdi is reused as the carry word passed to limb 3
+ add r12, rax
+ adc rdi, rdx
+ add r10, rbx ; add the folded overflow and ripple the three carry words upward
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store: written as-is, without the second bit-255 fold that fe_sq_x64 applies before its store
+ mov QWORD PTR [rcx], r10
+ mov QWORD PTR [rcx+8], r11
+ mov QWORD PTR [rcx+16], r12
+ mov QWORD PTR [rcx+24], r13
+ dec r8b ; 8-bit pass counter: only the low byte of r8 is used, and a count of 0 would wrap to 256 passes
+ jnz L_fe_sq_n_x64
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_sq_n_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_mul121666_x64 PROC ; [rcx] = [rdx] * 121666 on 4 x 64-bit limbs, overflow folded back modulo 2^255 - 19
+ push r12 ; r12-r14 are used as scratch below and popped again before ret
+ push r13
+ push r14
+ mov r8, rdx ; mul overwrites rdx:rax, so keep the operand pointer in r8
+ ; Multiply by 121666: one 64x64 mul per limb; result limbs in r10, r11, r12, r13 with overflow in r14
+ mov rax, 121666
+ mul QWORD PTR [r8]
+ xor r12, r12 ; open the limb 2 accumulator
+ mov r10, rax
+ mov r11, rdx
+ mov rax, 121666
+ mul QWORD PTR [r8+8]
+ xor r13, r13 ; open the limb 3 accumulator
+ add r11, rax
+ adc r12, rdx
+ mov rax, 121666
+ mul QWORD PTR [r8+16]
+ xor r14, r14 ; open the overflow accumulator (bits 256 and up)
+ add r12, rax
+ adc r13, rdx
+ mov rax, 121666
+ mul QWORD PTR [r8+24]
+ mov r9, 9223372036854775807 ; 2^63 - 1: mask that clears bit 255 of the value (bit 63 of limb 3)
+ add r13, rax
+ adc r14, rdx
+ shld r14, r13, 1 ; r14 = everything at or above bit 255
+ and r13, r9
+ mov rax, 19 ; 2^255 = 19 (mod 2^255 - 19)
+ mul r14
+ add r10, rax ; add the folded overflow into limb 0 and ripple the carry
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ mov QWORD PTR [rcx], r10 ; the input is only read above, so the result buffer may alias it
+ mov QWORD PTR [rcx+8], r11
+ mov QWORD PTR [rcx+16], r12
+ mov QWORD PTR [rcx+24], r13
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_mul121666_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_invert_x64 PROC ; [rcx] = [rdx] raised to the power 2^255 - 21, built from fe_sq_x64 / fe_sq_n_x64 / fe_mul_x64 calls
+ sub rsp, 144 ; four 32-byte temporaries t0=[rsp], t1=[rsp+32], t2=[rsp+64], t3=[rsp+96], plus two saved pointers
+ ; Invert: 2^255 - 21 is (2^255 - 19) - 2, so this is inversion modulo 2^255 - 19 by exponentiation; "a" below is the input
+ mov QWORD PTR [rsp+128], rcx ; save the result pointer
+ mov QWORD PTR [rsp+136], rdx ; save the input pointer
+ mov rcx, rsp
+ mov rdx, QWORD PTR [rsp+136]
+ call fe_sq_x64 ; t0 = a^2
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_x64 ; t1 = t0^2 = a^4
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64 ; t1 = t1^2 = a^8
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, QWORD PTR [rsp+136]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64 ; t1 = a * t1 = a^9
+ mov rcx, rsp
+ mov rdx, rsp
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64 ; t0 = t0 * t1 = a^11
+ lea rcx, QWORD PTR [rsp+64]
+ mov rdx, rsp
+ call fe_sq_x64 ; t2 = t0^2 = a^22
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64 ; t1 = t1 * t2 = a^31 = a^(2^5 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64 ; t2 = t1^2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 4 ; 4 more in-place squarings (5 in total)
+ call fe_sq_n_x64 ; t2 = a^(2^10 - 2^5)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64 ; t1 = t2 * t1 = a^(2^10 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64 ; t2 = t1^2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 9 ; 9 more in-place squarings (10 in total)
+ call fe_sq_n_x64 ; t2 = a^(2^20 - 2^10)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64 ; t2 = t2 * t1 = a^(2^20 - 1)
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64 ; t3 = t2^2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 19 ; 19 more in-place squarings (20 in total)
+ call fe_sq_n_x64 ; t3 = a^(2^40 - 2^20)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64 ; t2 = t3 * t2 = a^(2^40 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64 ; t2 = t2^2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 9 ; 9 more in-place squarings (10 in total)
+ call fe_sq_n_x64 ; t2 = a^(2^50 - 2^10)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64 ; t1 = t2 * t1 = a^(2^50 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64 ; t2 = t1^2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 49 ; 49 more in-place squarings (50 in total)
+ call fe_sq_n_x64 ; t2 = a^(2^100 - 2^50)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64 ; t2 = t2 * t1 = a^(2^100 - 1)
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64 ; t3 = t2^2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 99 ; 99 more in-place squarings (100 in total)
+ call fe_sq_n_x64 ; t3 = a^(2^200 - 2^100)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64 ; t2 = t3 * t2 = a^(2^200 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64 ; t2 = t2^2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 49 ; 49 more in-place squarings (50 in total)
+ call fe_sq_n_x64 ; t2 = a^(2^250 - 2^50)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64 ; t1 = t2 * t1 = a^(2^250 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64 ; t1 = t1^2
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 4 ; 4 more in-place squarings (5 in total)
+ call fe_sq_n_x64 ; t1 = a^(2^255 - 2^5)
+ mov rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_x64 ; result = t1 * t0 = a^(2^255 - 32 + 11) = a^(2^255 - 21)
+ mov rdx, QWORD PTR [rsp+136] ; put the caller's original rdx and rcx values back
+ mov rcx, QWORD PTR [rsp+128]
+ add rsp, 144 ; NOTE(review): no shadow space is reserved around the calls; the three callees in this file never touch it
+ ret
+fe_invert_x64 ENDP
+_TEXT ENDS
+IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519
+_DATA SEGMENT
+ALIGN 16
+L_curve25519_base_x64_x2 QWORD 5cae469cdd684efbh, 8f3f5ced1e350b5ch ; 256-bit constant as four qwords; by its name presumably tied to 2 * base point -- TODO confirm
+ QWORD 0d9750c687d157114h, 20d342d51873f1b7h
+ptr_L_curve25519_base_x64_x2 QWORD L_curve25519_base_x64_x2 ; one qword holding the ADDRESS of the table above, not its contents
+_DATA ENDS ; NOTE(review): curve25519_base_x64 loads QWORD PTR [ptr_L_curve25519_base_x64_x2 + 0/8/16/24] directly, which reads this pointer and the 24 bytes after it instead of the table's four qwords -- confirm whether it should dereference the pointer or address L_curve25519_base_x64_x2 itself
+_TEXT SEGMENT READONLY PARA
+curve25519_base_x64 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov r8, rcx
+ mov r9, rdx
+ sub rsp, 168
+ xor rsi, rsi
+ mov QWORD PTR [rsp+160], r8
+ ; Set base point x
+ mov QWORD PTR [r8], 9
+ mov QWORD PTR [r8+8], 0
+ mov QWORD PTR [r8+16], 0
+ mov QWORD PTR [r8+24], 0
+ ; Set one
+ mov QWORD PTR [rsp], 1
+ mov QWORD PTR [rsp+8], 0
+ mov QWORD PTR [rsp+16], 0
+ mov QWORD PTR [rsp+24], 0
+ mov rcx, QWORD PTR [ptr_L_curve25519_base_x64_x2]
+ mov r10, QWORD PTR [ptr_L_curve25519_base_x64_x2+8]
+ mov r11, QWORD PTR [ptr_L_curve25519_base_x64_x2+16]
+ mov r12, QWORD PTR [ptr_L_curve25519_base_x64_x2+24]
+ ; Set one
+ mov QWORD PTR [rsp+32], 1
+ mov QWORD PTR [rsp+40], 0
+ mov QWORD PTR [rsp+48], 0
+ mov QWORD PTR [rsp+56], 0
+ mov QWORD PTR [rsp+64], rcx
+ mov QWORD PTR [rsp+72], r10
+ mov QWORD PTR [rsp+80], r11
+ mov QWORD PTR [rsp+88], r12
+ mov rbp, 253
+L_curve25519_base_x64_bits:
+ mov r10, rbp
+ mov rcx, rbp
+ and rcx, 63
+ shr r10, 6
+ mov rbx, QWORD PTR [r9+8*r10]
+ shr rbx, cl
+ and rbx, 1
+ xor rsi, rbx
+ neg rsi
+ ; Conditional Swap
+ mov rcx, QWORD PTR [r8]
+ mov r10, QWORD PTR [r8+8]
+ mov r11, QWORD PTR [r8+16]
+ mov r12, QWORD PTR [r8+24]
+ mov r13, QWORD PTR [rsp]
+ mov r14, QWORD PTR [rsp+8]
+ mov r15, QWORD PTR [rsp+16]
+ mov rdi, QWORD PTR [rsp+24]
+ xor rcx, QWORD PTR [rsp+64]
+ xor r10, QWORD PTR [rsp+72]
+ xor r11, QWORD PTR [rsp+80]
+ xor r12, QWORD PTR [rsp+88]
+ xor r13, QWORD PTR [rsp+32]
+ xor r14, QWORD PTR [rsp+40]
+ xor r15, QWORD PTR [rsp+48]
+ xor rdi, QWORD PTR [rsp+56]
+ and rcx, rsi
+ and r10, rsi
+ and r11, rsi
+ and r12, rsi
+ and r13, rsi
+ and r14, rsi
+ and r15, rsi
+ and rdi, rsi
+ xor QWORD PTR [r8], rcx
+ xor QWORD PTR [r8+8], r10
+ xor QWORD PTR [r8+16], r11
+ xor QWORD PTR [r8+24], r12
+ xor QWORD PTR [rsp], r13
+ xor QWORD PTR [rsp+8], r14
+ xor QWORD PTR [rsp+16], r15
+ xor QWORD PTR [rsp+24], rdi
+ xor QWORD PTR [rsp+64], rcx
+ xor QWORD PTR [rsp+72], r10
+ xor QWORD PTR [rsp+80], r11
+ xor QWORD PTR [rsp+88], r12
+ xor QWORD PTR [rsp+32], r13
+ xor QWORD PTR [rsp+40], r14
+ xor QWORD PTR [rsp+48], r15
+ xor QWORD PTR [rsp+56], rdi
+ mov rsi, rbx
+ ; Add-Sub
+ ; Add
+ mov rcx, QWORD PTR [r8]
+ mov r10, QWORD PTR [r8+8]
+ mov r11, QWORD PTR [r8+16]
+ mov r12, QWORD PTR [r8+24]
+ mov r13, rcx
+ add rcx, QWORD PTR [rsp]
+ mov r14, r10
+ adc r10, QWORD PTR [rsp+8]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+16]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+24]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r12, 1
+ imul rbx, 19
+ btr r12, 63
+ ; Sub modulus (if overflow)
+ add rcx, rbx
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ ; Sub
+ sub r13, QWORD PTR [rsp]
+ sbb r14, QWORD PTR [rsp+8]
+ sbb r15, QWORD PTR [rsp+16]
+ sbb rdi, QWORD PTR [rsp+24]
+ sbb rbx, rbx
+ shld rbx, rdi, 1
+ imul rbx, -19
+ btr rdi, 63
+ ; Add modulus (if underflow)
+ sub r13, rbx
+ sbb r14, 0
+ sbb r15, 0
+ sbb rdi, 0
+ mov QWORD PTR [r8], rcx
+ mov QWORD PTR [r8+8], r10
+ mov QWORD PTR [r8+16], r11
+ mov QWORD PTR [r8+24], r12
+ mov QWORD PTR [rsp+128], r13
+ mov QWORD PTR [rsp+136], r14
+ mov QWORD PTR [rsp+144], r15
+ mov QWORD PTR [rsp+152], rdi
+ ; Add-Sub
+ ; Add
+ mov rcx, QWORD PTR [rsp+64]
+ mov r10, QWORD PTR [rsp+72]
+ mov r11, QWORD PTR [rsp+80]
+ mov r12, QWORD PTR [rsp+88]
+ mov r13, rcx
+ add rcx, QWORD PTR [rsp+32]
+ mov r14, r10
+ adc r10, QWORD PTR [rsp+40]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+48]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+56]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r12, 1
+ imul rbx, 19
+ btr r12, 63
+ ; Sub modulus (if overflow)
+ add rcx, rbx
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ ; Sub
+ sub r13, QWORD PTR [rsp+32]
+ sbb r14, QWORD PTR [rsp+40]
+ sbb r15, QWORD PTR [rsp+48]
+ sbb rdi, QWORD PTR [rsp+56]
+ sbb rbx, rbx
+ shld rbx, rdi, 1
+ imul rbx, -19
+ btr rdi, 63
+ ; Add modulus (if underflow)
+ sub r13, rbx
+ sbb r14, 0
+ sbb r15, 0
+ sbb rdi, 0
+ mov QWORD PTR [rsp+32], rcx
+ mov QWORD PTR [rsp+40], r10
+ mov QWORD PTR [rsp+48], r11
+ mov QWORD PTR [rsp+56], r12
+ mov QWORD PTR [rsp+96], r13
+ mov QWORD PTR [rsp+104], r14
+ mov QWORD PTR [rsp+112], r15
+ mov QWORD PTR [rsp+120], rdi
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+32]
+ mov rcx, rax
+ mov r10, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+32]
+ xor r11, r11
+ add r10, rax
+ adc r11, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+40]
+ xor r12, r12
+ add r10, rax
+ adc r11, rdx
+ adc r12, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+32]
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+40]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+48]
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul QWORD PTR [rsp+32]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+40]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+48]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+56]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul QWORD PTR [rsp+40]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+48]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+56]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul QWORD PTR [rsp+48]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+56]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul QWORD PTR [rsp+56]
+ add r15, rax
+ adc rdi, rdx
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [rsp+32], rcx
+ mov QWORD PTR [rsp+40], r10
+ mov QWORD PTR [rsp+48], r11
+ mov QWORD PTR [rsp+56], r12
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [rsp+96]
+ mov rcx, rax
+ mov r10, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [rsp+96]
+ xor r11, r11
+ add r10, rax
+ adc r11, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [rsp+104]
+ xor r12, r12
+ add r10, rax
+ adc r11, rdx
+ adc r12, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [rsp+96]
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [rsp+104]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [rsp+112]
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [rsp+96]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [rsp+104]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [rsp+112]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [rsp+120]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [rsp+104]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [rsp+112]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [rsp+120]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [rsp+112]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [rsp+120]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [rsp+120]
+ add r15, rax
+ adc rdi, rdx
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], r10
+ mov QWORD PTR [rsp+16], r11
+ mov QWORD PTR [rsp+24], r12
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+136]
+ mov r10, rax
+ mov r11, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+144]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+152]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+144]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+152]
+ add r13, rax
+ adc r14, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+152]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; Double
+ xor rdi, rdi
+ add r10, r10
+ adc r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul rax
+ mov rcx, rax
+ mov rbx, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul rax
+ add r10, rbx
+ adc r11, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul rax
+ add r12, rbx
+ adc r13, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul rax
+ add r15, rax
+ adc rdi, rdx
+ add r14, rbx
+ adc r15, 0
+ adc rdi, 0
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [rsp+96], rcx
+ mov QWORD PTR [rsp+104], r10
+ mov QWORD PTR [rsp+112], r11
+ mov QWORD PTR [rsp+120], r12
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+8]
+ mov r10, rax
+ mov r11, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+16]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+24]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+16]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+24]
+ add r13, rax
+ adc r14, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r8+24]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; Double
+ xor rdi, rdi
+ add r10, r10
+ adc r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r8]
+ mul rax
+ mov rcx, rax
+ mov rbx, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r8+8]
+ mul rax
+ add r10, rbx
+ adc r11, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r8+16]
+ mul rax
+ add r12, rbx
+ adc r13, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r8+24]
+ mul rax
+ add r15, rax
+ adc rdi, rdx
+ add r14, rbx
+ adc r15, 0
+ adc rdi, 0
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [rsp+128], rcx
+ mov QWORD PTR [rsp+136], r10
+ mov QWORD PTR [rsp+144], r11
+ mov QWORD PTR [rsp+152], r12
+ ; Add-Sub
+ ; Add
+ mov rcx, QWORD PTR [rsp]
+ mov r10, QWORD PTR [rsp+8]
+ mov r11, QWORD PTR [rsp+16]
+ mov r12, QWORD PTR [rsp+24]
+ mov r13, rcx
+ add rcx, QWORD PTR [rsp+32]
+ mov r14, r10
+ adc r10, QWORD PTR [rsp+40]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+48]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+56]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r12, 1
+ imul rbx, 19
+ btr r12, 63
+ ; Sub modulus (if overflow)
+ add rcx, rbx
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ ; Sub
+ sub r13, QWORD PTR [rsp+32]
+ sbb r14, QWORD PTR [rsp+40]
+ sbb r15, QWORD PTR [rsp+48]
+ sbb rdi, QWORD PTR [rsp+56]
+ sbb rbx, rbx
+ shld rbx, rdi, 1
+ imul rbx, -19
+ btr rdi, 63
+ ; Add modulus (if underflow)
+ sub r13, rbx
+ sbb r14, 0
+ sbb r15, 0
+ sbb rdi, 0
+ mov QWORD PTR [rsp+64], rcx
+ mov QWORD PTR [rsp+72], r10
+ mov QWORD PTR [rsp+80], r11
+ mov QWORD PTR [rsp+88], r12
+ mov QWORD PTR [rsp+32], r13
+ mov QWORD PTR [rsp+40], r14
+ mov QWORD PTR [rsp+48], r15
+ mov QWORD PTR [rsp+56], rdi
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+128]
+ mov rcx, rax
+ mov r10, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+128]
+ xor r11, r11
+ add r10, rax
+ adc r11, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+136]
+ xor r12, r12
+ add r10, rax
+ adc r11, rdx
+ adc r12, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+128]
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+136]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+144]
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+128]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+136]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+144]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+152]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+136]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+144]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+152]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+144]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+152]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+152]
+ add r15, rax
+ adc rdi, rdx
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [r8], rcx
+ mov QWORD PTR [r8+8], r10
+ mov QWORD PTR [r8+16], r11
+ mov QWORD PTR [r8+24], r12
+ ; Sub
+ mov rcx, QWORD PTR [rsp+128]
+ mov r10, QWORD PTR [rsp+136]
+ mov r11, QWORD PTR [rsp+144]
+ mov r12, QWORD PTR [rsp+152]
+ sub rcx, QWORD PTR [rsp+96]
+ sbb r10, QWORD PTR [rsp+104]
+ sbb r11, QWORD PTR [rsp+112]
+ sbb r12, QWORD PTR [rsp+120]
+ sbb rbx, rbx
+ shld rbx, r12, 1
+ imul rbx, -19
+ btr r12, 63
+ ; Add modulus (if underflow)
+ sub rcx, rbx
+ sbb r10, 0
+ sbb r11, 0
+ sbb r12, 0
+ mov QWORD PTR [rsp+128], rcx
+ mov QWORD PTR [rsp+136], r10
+ mov QWORD PTR [rsp+144], r11
+ mov QWORD PTR [rsp+152], r12
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [rsp+32]
+ mul QWORD PTR [rsp+40]
+ mov r10, rax
+ mov r11, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [rsp+32]
+ mul QWORD PTR [rsp+48]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [rsp+32]
+ mul QWORD PTR [rsp+56]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [rsp+40]
+ mul QWORD PTR [rsp+48]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [rsp+40]
+ mul QWORD PTR [rsp+56]
+ add r13, rax
+ adc r14, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [rsp+48]
+ mul QWORD PTR [rsp+56]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; Double
+ xor rdi, rdi
+ add r10, r10
+ adc r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [rsp+32]
+ mul rax
+ mov rcx, rax
+ mov rbx, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [rsp+40]
+ mul rax
+ add r10, rbx
+ adc r11, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [rsp+48]
+ mul rax
+ add r12, rbx
+ adc r13, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [rsp+56]
+ mul rax
+ add r15, rax
+ adc rdi, rdx
+ add r14, rbx
+ adc r15, 0
+ adc rdi, 0
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [rsp+32], rcx
+ mov QWORD PTR [rsp+40], r10
+ mov QWORD PTR [rsp+48], r11
+ mov QWORD PTR [rsp+56], r12
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [rsp+64]
+ mul QWORD PTR [rsp+72]
+ mov r10, rax
+ mov r11, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [rsp+64]
+ mul QWORD PTR [rsp+80]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [rsp+64]
+ mul QWORD PTR [rsp+88]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [rsp+72]
+ mul QWORD PTR [rsp+80]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [rsp+72]
+ mul QWORD PTR [rsp+88]
+ add r13, rax
+ adc r14, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [rsp+80]
+ mul QWORD PTR [rsp+88]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; Double
+ xor rdi, rdi
+ add r10, r10
+ adc r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [rsp+64]
+ mul rax
+ mov rcx, rax
+ mov rbx, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [rsp+72]
+ mul rax
+ add r10, rbx
+ adc r11, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [rsp+80]
+ mul rax
+ add r12, rbx
+ adc r13, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [rsp+88]
+ mul rax
+ add r15, rax
+ adc rdi, rdx
+ add r14, rbx
+ adc r15, 0
+ adc rdi, 0
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [rsp+64], rcx
+ mov QWORD PTR [rsp+72], r10
+ mov QWORD PTR [rsp+80], r11
+ mov QWORD PTR [rsp+88], r12
+ ; Multiply by 121666
+ mov rax, 121666
+ mul QWORD PTR [rsp+128]
+ xor r11, r11
+ mov rcx, rax
+ mov r10, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+136]
+ xor r12, r12
+ add r10, rax
+ adc r11, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+144]
+ xor r14, r14
+ add r11, rax
+ adc r12, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+152]
+ mov r13, 9223372036854775807
+ add r12, rax
+ adc r14, rdx
+ add rcx, QWORD PTR [rsp+96]
+ adc r10, QWORD PTR [rsp+104]
+ adc r11, QWORD PTR [rsp+112]
+ adc r12, QWORD PTR [rsp+120]
+ adc r14, 0
+ shld r14, r12, 1
+ and r12, r13
+ mov rax, 19
+ mul r14
+ add rcx, rax
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ mov QWORD PTR [rsp+96], rcx
+ mov QWORD PTR [rsp+104], r10
+ mov QWORD PTR [rsp+112], r11
+ mov QWORD PTR [rsp+120], r12
+ ; Multiply by 9
+ mov rax, 9
+ mul QWORD PTR [rsp+32]
+ xor r11, r11
+ mov rcx, rax
+ mov r10, rdx
+ mov rax, 9
+ mul QWORD PTR [rsp+40]
+ xor r12, r12
+ add r10, rax
+ adc r11, rdx
+ mov rax, 9
+ mul QWORD PTR [rsp+48]
+ xor r14, r14
+ add r11, rax
+ adc r12, rdx
+ mov rax, 9
+ mul QWORD PTR [rsp+56]
+ mov r13, 9223372036854775807
+ add r12, rax
+ adc r14, rdx
+ shld r14, r12, 1
+ and r12, r13
+ mov rax, 19
+ mul r14
+ add rcx, rax
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ mov QWORD PTR [rsp+32], rcx
+ mov QWORD PTR [rsp+40], r10
+ mov QWORD PTR [rsp+48], r11
+ mov QWORD PTR [rsp+56], r12
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+128]
+ mov rcx, rax
+ mov r10, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+128]
+ xor r11, r11
+ add r10, rax
+ adc r11, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+136]
+ xor r12, r12
+ add r10, rax
+ adc r11, rdx
+ adc r12, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+128]
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+136]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+144]
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+128]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+136]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+144]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+152]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+136]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+144]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+152]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+144]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+152]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+152]
+ add r15, rax
+ adc rdi, rdx
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], r10
+ mov QWORD PTR [rsp+16], r11
+ mov QWORD PTR [rsp+24], r12
+ dec rbp
+ cmp rbp, 3
+ jge L_curve25519_base_x64_bits
+ neg rsi
+ ; Conditional Swap
+ mov rcx, QWORD PTR [r8]
+ mov r10, QWORD PTR [r8+8]
+ mov r11, QWORD PTR [r8+16]
+ mov r12, QWORD PTR [r8+24]
+ mov r13, QWORD PTR [rsp]
+ mov r14, QWORD PTR [rsp+8]
+ mov r15, QWORD PTR [rsp+16]
+ mov rdi, QWORD PTR [rsp+24]
+ xor rcx, QWORD PTR [rsp+64]
+ xor r10, QWORD PTR [rsp+72]
+ xor r11, QWORD PTR [rsp+80]
+ xor r12, QWORD PTR [rsp+88]
+ xor r13, QWORD PTR [rsp+32]
+ xor r14, QWORD PTR [rsp+40]
+ xor r15, QWORD PTR [rsp+48]
+ xor rdi, QWORD PTR [rsp+56]
+ and rcx, rsi
+ and r10, rsi
+ and r11, rsi
+ and r12, rsi
+ and r13, rsi
+ and r14, rsi
+ and r15, rsi
+ and rdi, rsi
+ xor QWORD PTR [r8], rcx
+ xor QWORD PTR [r8+8], r10
+ xor QWORD PTR [r8+16], r11
+ xor QWORD PTR [r8+24], r12
+ xor QWORD PTR [rsp], r13
+ xor QWORD PTR [rsp+8], r14
+ xor QWORD PTR [rsp+16], r15
+ xor QWORD PTR [rsp+24], rdi
+ xor QWORD PTR [rsp+64], rcx
+ xor QWORD PTR [rsp+72], r10
+ xor QWORD PTR [rsp+80], r11
+ xor QWORD PTR [rsp+88], r12
+ xor QWORD PTR [rsp+32], r13
+ xor QWORD PTR [rsp+40], r14
+ xor QWORD PTR [rsp+48], r15
+ xor QWORD PTR [rsp+56], rdi
+L_curve25519_base_x64_3:
+ ; Add-Sub
+ ; Add
+ mov rcx, QWORD PTR [r8]
+ mov r10, QWORD PTR [r8+8]
+ mov r11, QWORD PTR [r8+16]
+ mov r12, QWORD PTR [r8+24]
+ mov r13, rcx
+ add rcx, QWORD PTR [rsp]
+ mov r14, r10
+ adc r10, QWORD PTR [rsp+8]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+16]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+24]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r12, 1
+ imul rbx, 19
+ btr r12, 63
+ ; Sub modulus (if overflow)
+ add rcx, rbx
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ ; Sub
+ sub r13, QWORD PTR [rsp]
+ sbb r14, QWORD PTR [rsp+8]
+ sbb r15, QWORD PTR [rsp+16]
+ sbb rdi, QWORD PTR [rsp+24]
+ sbb rbx, rbx
+ shld rbx, rdi, 1
+ imul rbx, -19
+ btr rdi, 63
+ ; Add modulus (if underflow)
+ sub r13, rbx
+ sbb r14, 0
+ sbb r15, 0
+ sbb rdi, 0
+ mov QWORD PTR [r8], rcx
+ mov QWORD PTR [r8+8], r10
+ mov QWORD PTR [r8+16], r11
+ mov QWORD PTR [r8+24], r12
+ mov QWORD PTR [rsp+128], r13
+ mov QWORD PTR [rsp+136], r14
+ mov QWORD PTR [rsp+144], r15
+ mov QWORD PTR [rsp+152], rdi
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+136]
+ mov r10, rax
+ mov r11, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+144]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+152]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+144]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+152]
+ add r13, rax
+ adc r14, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+152]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; Double
+ xor rdi, rdi
+ add r10, r10
+ adc r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul rax
+ mov rcx, rax
+ mov rbx, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul rax
+ add r10, rbx
+ adc r11, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul rax
+ add r12, rbx
+ adc r13, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul rax
+ add r15, rax
+ adc rdi, rdx
+ add r14, rbx
+ adc r15, 0
+ adc rdi, 0
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [rsp+96], rcx
+ mov QWORD PTR [rsp+104], r10
+ mov QWORD PTR [rsp+112], r11
+ mov QWORD PTR [rsp+120], r12
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+8]
+ mov r10, rax
+ mov r11, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+16]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+24]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+16]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+24]
+ add r13, rax
+ adc r14, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r8+24]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; Double
+ xor rdi, rdi
+ add r10, r10
+ adc r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r8]
+ mul rax
+ mov rcx, rax
+ mov rbx, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r8+8]
+ mul rax
+ add r10, rbx
+ adc r11, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r8+16]
+ mul rax
+ add r12, rbx
+ adc r13, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r8+24]
+ mul rax
+ add r15, rax
+ adc rdi, rdx
+ add r14, rbx
+ adc r15, 0
+ adc rdi, 0
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [rsp+128], rcx
+ mov QWORD PTR [rsp+136], r10
+ mov QWORD PTR [rsp+144], r11
+ mov QWORD PTR [rsp+152], r12
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+128]
+ mov rcx, rax
+ mov r10, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+128]
+ xor r11, r11
+ add r10, rax
+ adc r11, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+136]
+ xor r12, r12
+ add r10, rax
+ adc r11, rdx
+ adc r12, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+128]
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+136]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+144]
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+128]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+136]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+144]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+152]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+136]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+144]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+152]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+144]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+152]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+152]
+ add r15, rax
+ adc rdi, rdx
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [r8], rcx
+ mov QWORD PTR [r8+8], r10
+ mov QWORD PTR [r8+16], r11
+ mov QWORD PTR [r8+24], r12
+ ; Sub
+ mov rcx, QWORD PTR [rsp+128]
+ mov r10, QWORD PTR [rsp+136]
+ mov r11, QWORD PTR [rsp+144]
+ mov r12, QWORD PTR [rsp+152]
+ sub rcx, QWORD PTR [rsp+96]
+ sbb r10, QWORD PTR [rsp+104]
+ sbb r11, QWORD PTR [rsp+112]
+ sbb r12, QWORD PTR [rsp+120]
+ sbb rbx, rbx
+ shld rbx, r12, 1
+ imul rbx, -19
+ btr r12, 63
+ ; Add modulus (if underflow)
+ sub rcx, rbx
+ sbb r10, 0
+ sbb r11, 0
+ sbb r12, 0
+ mov QWORD PTR [rsp+128], rcx
+ mov QWORD PTR [rsp+136], r10
+ mov QWORD PTR [rsp+144], r11
+ mov QWORD PTR [rsp+152], r12
+ ; Multiply by 121666
+ mov rax, 121666
+ mul QWORD PTR [rsp+128]
+ xor r11, r11
+ mov rcx, rax
+ mov r10, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+136]
+ xor r12, r12
+ add r10, rax
+ adc r11, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+144]
+ xor r14, r14
+ add r11, rax
+ adc r12, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+152]
+ mov r13, 9223372036854775807
+ add r12, rax
+ adc r14, rdx
+ add rcx, QWORD PTR [rsp+96]
+ adc r10, QWORD PTR [rsp+104]
+ adc r11, QWORD PTR [rsp+112]
+ adc r12, QWORD PTR [rsp+120]
+ adc r14, 0
+ shld r14, r12, 1
+ and r12, r13
+ mov rax, 19
+ mul r14
+ add rcx, rax
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ mov QWORD PTR [rsp+96], rcx
+ mov QWORD PTR [rsp+104], r10
+ mov QWORD PTR [rsp+112], r11
+ mov QWORD PTR [rsp+120], r12
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+128]
+ mov rcx, rax
+ mov r10, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+128]
+ xor r11, r11
+ add r10, rax
+ adc r11, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+136]
+ xor r12, r12
+ add r10, rax
+ adc r11, rdx
+ adc r12, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+128]
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+136]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+144]
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+128]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+136]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+144]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+152]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+136]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+144]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+152]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+144]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+152]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+152]
+ add r15, rax
+ adc rdi, rdx
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Store
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], r10
+ mov QWORD PTR [rsp+16], r11
+ mov QWORD PTR [rsp+24], r12
+ dec rbp
+ jge L_curve25519_base_x64_3
+ ; Invert
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ mov rdx, rsp
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 4
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 9
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+128]
+ mov r8, 19
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+128]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 9
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 49
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+128]
+ mov r8, 99
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+128]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 49
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 4
+ call fe_sq_n_x64
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64
+ mov r8, QWORD PTR [rsp+160]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp]
+ mul QWORD PTR [r8]
+ mov rcx, rax
+ mov r10, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+8]
+ mul QWORD PTR [r8]
+ xor r11, r11
+ add r10, rax
+ adc r11, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp]
+ mul QWORD PTR [r8+8]
+ xor r12, r12
+ add r10, rax
+ adc r11, rdx
+ adc r12, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+16]
+ mul QWORD PTR [r8]
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+8]
+ mul QWORD PTR [r8+8]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp]
+ mul QWORD PTR [r8+16]
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+24]
+ mul QWORD PTR [r8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+16]
+ mul QWORD PTR [r8+8]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+8]
+ mul QWORD PTR [r8+16]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp]
+ mul QWORD PTR [r8+24]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+24]
+ mul QWORD PTR [r8+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+16]
+ mul QWORD PTR [r8+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+8]
+ mul QWORD PTR [r8+24]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+24]
+ mul QWORD PTR [r8+16]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+16]
+ mul QWORD PTR [r8+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+24]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add rcx, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add rcx, rbx
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ mov rbx, 9223372036854775807
+ mov rax, r12
+ sar rax, 63
+ and rax, 19
+ and r12, rbx
+ add rcx, rax
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ mov rax, 9223372036854775807
+ mov rdx, rcx
+ add rdx, 19
+ mov rdx, r10
+ adc rdx, 0
+ mov rdx, r11
+ adc rdx, 0
+ mov rdx, r12
+ adc rdx, 0
+ sar rdx, 63
+ and rdx, 19
+ and r12, rax
+ add rcx, rdx
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ and r12, rax
+ ; Store
+ mov QWORD PTR [r8], rcx
+ mov QWORD PTR [r8+8], r10
+ mov QWORD PTR [r8+16], r11
+ mov QWORD PTR [r8+24], r12
+ xor rax, rax
+ add rsp, 168
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+curve25519_base_x64 ENDP
+_TEXT ENDS
+ENDIF
+_TEXT SEGMENT READONLY PARA
+curve25519_x64 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov r9, rcx
+ mov r10, rdx
+ sub rsp, 176
+ xor rbx, rbx
+ mov QWORD PTR [rsp+168], r9
+ ; Set one
+ mov QWORD PTR [r9], 1
+ mov QWORD PTR [r9+8], 0
+ mov QWORD PTR [r9+16], 0
+ mov QWORD PTR [r9+24], 0
+ ; Set zero
+ mov QWORD PTR [rsp], 0
+ mov QWORD PTR [rsp+8], 0
+ mov QWORD PTR [rsp+16], 0
+ mov QWORD PTR [rsp+24], 0
+ ; Set one
+ mov QWORD PTR [rsp+32], 1
+ mov QWORD PTR [rsp+40], 0
+ mov QWORD PTR [rsp+48], 0
+ mov QWORD PTR [rsp+56], 0
+ ; Copy
+ mov rcx, QWORD PTR [r8]
+ mov r11, QWORD PTR [r8+8]
+ mov r12, QWORD PTR [r8+16]
+ mov r13, QWORD PTR [r8+24]
+ mov QWORD PTR [rsp+64], rcx
+ mov QWORD PTR [rsp+72], r11
+ mov QWORD PTR [rsp+80], r12
+ mov QWORD PTR [rsp+88], r13
+ mov r11, 254
+L_curve25519_x64_bits:
+ mov QWORD PTR [rsp+160], r11
+ mov rcx, r11
+ and rcx, 63
+ shr r11, 6
+ mov rbp, QWORD PTR [r10+8*r11]
+ shr rbp, cl
+ and rbp, 1
+ xor rbx, rbp
+ neg rbx
+ ; Conditional Swap
+ mov rcx, QWORD PTR [r9]
+ mov r11, QWORD PTR [r9+8]
+ mov r12, QWORD PTR [r9+16]
+ mov r13, QWORD PTR [r9+24]
+ mov r14, QWORD PTR [rsp]
+ mov r15, QWORD PTR [rsp+8]
+ mov rdi, QWORD PTR [rsp+16]
+ mov rsi, QWORD PTR [rsp+24]
+ xor rcx, QWORD PTR [rsp+64]
+ xor r11, QWORD PTR [rsp+72]
+ xor r12, QWORD PTR [rsp+80]
+ xor r13, QWORD PTR [rsp+88]
+ xor r14, QWORD PTR [rsp+32]
+ xor r15, QWORD PTR [rsp+40]
+ xor rdi, QWORD PTR [rsp+48]
+ xor rsi, QWORD PTR [rsp+56]
+ and rcx, rbx
+ and r11, rbx
+ and r12, rbx
+ and r13, rbx
+ and r14, rbx
+ and r15, rbx
+ and rdi, rbx
+ and rsi, rbx
+ xor QWORD PTR [r9], rcx
+ xor QWORD PTR [r9+8], r11
+ xor QWORD PTR [r9+16], r12
+ xor QWORD PTR [r9+24], r13
+ xor QWORD PTR [rsp], r14
+ xor QWORD PTR [rsp+8], r15
+ xor QWORD PTR [rsp+16], rdi
+ xor QWORD PTR [rsp+24], rsi
+ xor QWORD PTR [rsp+64], rcx
+ xor QWORD PTR [rsp+72], r11
+ xor QWORD PTR [rsp+80], r12
+ xor QWORD PTR [rsp+88], r13
+ xor QWORD PTR [rsp+32], r14
+ xor QWORD PTR [rsp+40], r15
+ xor QWORD PTR [rsp+48], rdi
+ xor QWORD PTR [rsp+56], rsi
+ mov rbx, rbp
+ ; Add-Sub
+ ; Add
+ mov rcx, QWORD PTR [r9]
+ mov r11, QWORD PTR [r9+8]
+ mov r12, QWORD PTR [r9+16]
+ mov r13, QWORD PTR [r9+24]
+ mov r14, rcx
+ add rcx, QWORD PTR [rsp]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+8]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+16]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+24]
+ mov rbp, 0
+ adc rbp, 0
+ shld rbp, r13, 1
+ imul rbp, 19
+ btr r13, 63
+ ; Sub modulus (if overflow)
+ add rcx, rbp
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ ; Sub
+ sub r14, QWORD PTR [rsp]
+ sbb r15, QWORD PTR [rsp+8]
+ sbb rdi, QWORD PTR [rsp+16]
+ sbb rsi, QWORD PTR [rsp+24]
+ sbb rbp, rbp
+ shld rbp, rsi, 1
+ imul rbp, -19
+ btr rsi, 63
+ ; Add modulus (if underflow)
+ sub r14, rbp
+ sbb r15, 0
+ sbb rdi, 0
+ sbb rsi, 0
+ mov QWORD PTR [r9], rcx
+ mov QWORD PTR [r9+8], r11
+ mov QWORD PTR [r9+16], r12
+ mov QWORD PTR [r9+24], r13
+ mov QWORD PTR [rsp+128], r14
+ mov QWORD PTR [rsp+136], r15
+ mov QWORD PTR [rsp+144], rdi
+ mov QWORD PTR [rsp+152], rsi
+ ; Add-Sub
+ ; Add
+ mov rcx, QWORD PTR [rsp+64]
+ mov r11, QWORD PTR [rsp+72]
+ mov r12, QWORD PTR [rsp+80]
+ mov r13, QWORD PTR [rsp+88]
+ mov r14, rcx
+ add rcx, QWORD PTR [rsp+32]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+40]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+48]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+56]
+ mov rbp, 0
+ adc rbp, 0
+ shld rbp, r13, 1
+ imul rbp, 19
+ btr r13, 63
+ ; Sub modulus (if overflow)
+ add rcx, rbp
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ ; Sub
+ sub r14, QWORD PTR [rsp+32]
+ sbb r15, QWORD PTR [rsp+40]
+ sbb rdi, QWORD PTR [rsp+48]
+ sbb rsi, QWORD PTR [rsp+56]
+ sbb rbp, rbp
+ shld rbp, rsi, 1
+ imul rbp, -19
+ btr rsi, 63
+ ; Add modulus (if underflow)
+ sub r14, rbp
+ sbb r15, 0
+ sbb rdi, 0
+ sbb rsi, 0
+ mov QWORD PTR [rsp+32], rcx
+ mov QWORD PTR [rsp+40], r11
+ mov QWORD PTR [rsp+48], r12
+ mov QWORD PTR [rsp+56], r13
+ mov QWORD PTR [rsp+96], r14
+ mov QWORD PTR [rsp+104], r15
+ mov QWORD PTR [rsp+112], rdi
+ mov QWORD PTR [rsp+120], rsi
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+32]
+ mov rcx, rax
+ mov r11, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+32]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+40]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+32]
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+40]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+48]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul QWORD PTR [rsp+32]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+40]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+48]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+56]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul QWORD PTR [rsp+40]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+48]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+56]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul QWORD PTR [rsp+48]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+56]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul QWORD PTR [rsp+56]
+ add rdi, rax
+ adc rsi, rdx
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rsp+32], rcx
+ mov QWORD PTR [rsp+40], r11
+ mov QWORD PTR [rsp+48], r12
+ mov QWORD PTR [rsp+56], r13
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rsp+96]
+ mov rcx, rax
+ mov r11, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rsp+96]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rsp+104]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rsp+96]
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rsp+104]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rsp+112]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rsp+96]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rsp+104]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rsp+112]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rsp+120]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rsp+104]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rsp+112]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rsp+120]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rsp+112]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rsp+120]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rsp+120]
+ add rdi, rax
+ adc rsi, rdx
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], r11
+ mov QWORD PTR [rsp+16], r12
+ mov QWORD PTR [rsp+24], r13
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+136]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+144]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+152]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+144]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+152]
+ add r14, rax
+ adc r15, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+152]
+ xor rdi, rdi
+ add r15, rax
+ adc rdi, rdx
+ ; Double
+ xor rsi, rsi
+ add r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul rax
+ mov rcx, rax
+ mov rbp, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul rax
+ add r11, rbp
+ adc r12, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul rax
+ add r13, rbp
+ adc r14, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul rax
+ add rdi, rax
+ adc rsi, rdx
+ add r15, rbp
+ adc rdi, 0
+ adc rsi, 0
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rsp+96], rcx
+ mov QWORD PTR [rsp+104], r11
+ mov QWORD PTR [rsp+112], r12
+ mov QWORD PTR [rsp+120], r13
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+8]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+16]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+24]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r9+16]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r9+24]
+ add r14, rax
+ adc r15, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r9+24]
+ xor rdi, rdi
+ add r15, rax
+ adc rdi, rdx
+ ; Double
+ xor rsi, rsi
+ add r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r9]
+ mul rax
+ mov rcx, rax
+ mov rbp, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r9+8]
+ mul rax
+ add r11, rbp
+ adc r12, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r9+16]
+ mul rax
+ add r13, rbp
+ adc r14, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r9+24]
+ mul rax
+ add rdi, rax
+ adc rsi, rdx
+ add r15, rbp
+ adc rdi, 0
+ adc rsi, 0
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rsp+128], rcx
+ mov QWORD PTR [rsp+136], r11
+ mov QWORD PTR [rsp+144], r12
+ mov QWORD PTR [rsp+152], r13
+ ; Add-Sub
+ ; Add
+ mov rcx, QWORD PTR [rsp]
+ mov r11, QWORD PTR [rsp+8]
+ mov r12, QWORD PTR [rsp+16]
+ mov r13, QWORD PTR [rsp+24]
+ mov r14, rcx
+ add rcx, QWORD PTR [rsp+32]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+40]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+48]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+56]
+ mov rbp, 0
+ adc rbp, 0
+ shld rbp, r13, 1
+ imul rbp, 19
+ btr r13, 63
+ ; Sub modulus (if overflow)
+ add rcx, rbp
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ ; Sub
+ sub r14, QWORD PTR [rsp+32]
+ sbb r15, QWORD PTR [rsp+40]
+ sbb rdi, QWORD PTR [rsp+48]
+ sbb rsi, QWORD PTR [rsp+56]
+ sbb rbp, rbp
+ shld rbp, rsi, 1
+ imul rbp, -19
+ btr rsi, 63
+ ; Add modulus (if underflow)
+ sub r14, rbp
+ sbb r15, 0
+ sbb rdi, 0
+ sbb rsi, 0
+ mov QWORD PTR [rsp+64], rcx
+ mov QWORD PTR [rsp+72], r11
+ mov QWORD PTR [rsp+80], r12
+ mov QWORD PTR [rsp+88], r13
+ mov QWORD PTR [rsp+32], r14
+ mov QWORD PTR [rsp+40], r15
+ mov QWORD PTR [rsp+48], rdi
+ mov QWORD PTR [rsp+56], rsi
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+128]
+ mov rcx, rax
+ mov r11, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+128]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+136]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+128]
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+136]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+144]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+128]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+136]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+144]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+152]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+136]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+144]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+152]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+144]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+152]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+152]
+ add rdi, rax
+ adc rsi, rdx
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [r9], rcx
+ mov QWORD PTR [r9+8], r11
+ mov QWORD PTR [r9+16], r12
+ mov QWORD PTR [r9+24], r13
+ ; Sub
+ mov rcx, QWORD PTR [rsp+128]
+ mov r11, QWORD PTR [rsp+136]
+ mov r12, QWORD PTR [rsp+144]
+ mov r13, QWORD PTR [rsp+152]
+ sub rcx, QWORD PTR [rsp+96]
+ sbb r11, QWORD PTR [rsp+104]
+ sbb r12, QWORD PTR [rsp+112]
+ sbb r13, QWORD PTR [rsp+120]
+ sbb rbp, rbp
+ shld rbp, r13, 1
+ imul rbp, -19
+ btr r13, 63
+ ; Add modulus (if underflow)
+ sub rcx, rbp
+ sbb r11, 0
+ sbb r12, 0
+ sbb r13, 0
+ mov QWORD PTR [rsp+128], rcx
+ mov QWORD PTR [rsp+136], r11
+ mov QWORD PTR [rsp+144], r12
+ mov QWORD PTR [rsp+152], r13
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [rsp+32]
+ mul QWORD PTR [rsp+40]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [rsp+32]
+ mul QWORD PTR [rsp+48]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [rsp+32]
+ mul QWORD PTR [rsp+56]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [rsp+40]
+ mul QWORD PTR [rsp+48]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [rsp+40]
+ mul QWORD PTR [rsp+56]
+ add r14, rax
+ adc r15, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [rsp+48]
+ mul QWORD PTR [rsp+56]
+ xor rdi, rdi
+ add r15, rax
+ adc rdi, rdx
+ ; Double
+ xor rsi, rsi
+ add r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [rsp+32]
+ mul rax
+ mov rcx, rax
+ mov rbp, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [rsp+40]
+ mul rax
+ add r11, rbp
+ adc r12, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [rsp+48]
+ mul rax
+ add r13, rbp
+ adc r14, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [rsp+56]
+ mul rax
+ add rdi, rax
+ adc rsi, rdx
+ add r15, rbp
+ adc rdi, 0
+ adc rsi, 0
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rsp+32], rcx
+ mov QWORD PTR [rsp+40], r11
+ mov QWORD PTR [rsp+48], r12
+ mov QWORD PTR [rsp+56], r13
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [rsp+64]
+ mul QWORD PTR [rsp+72]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [rsp+64]
+ mul QWORD PTR [rsp+80]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [rsp+64]
+ mul QWORD PTR [rsp+88]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [rsp+72]
+ mul QWORD PTR [rsp+80]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [rsp+72]
+ mul QWORD PTR [rsp+88]
+ add r14, rax
+ adc r15, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [rsp+80]
+ mul QWORD PTR [rsp+88]
+ xor rdi, rdi
+ add r15, rax
+ adc rdi, rdx
+ ; Double
+ xor rsi, rsi
+ add r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [rsp+64]
+ mul rax
+ mov rcx, rax
+ mov rbp, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [rsp+72]
+ mul rax
+ add r11, rbp
+ adc r12, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [rsp+80]
+ mul rax
+ add r13, rbp
+ adc r14, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [rsp+88]
+ mul rax
+ add rdi, rax
+ adc rsi, rdx
+ add r15, rbp
+ adc rdi, 0
+ adc rsi, 0
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rsp+64], rcx
+ mov QWORD PTR [rsp+72], r11
+ mov QWORD PTR [rsp+80], r12
+ mov QWORD PTR [rsp+88], r13
+ ; Multiply by 121666
+ mov rax, 121666
+ mul QWORD PTR [rsp+128]
+ xor r12, r12
+ mov rcx, rax
+ mov r11, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+136]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+144]
+ xor r15, r15
+ add r12, rax
+ adc r13, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+152]
+ mov r14, 9223372036854775807
+ add r13, rax
+ adc r15, rdx
+ add rcx, QWORD PTR [rsp+96]
+ adc r11, QWORD PTR [rsp+104]
+ adc r12, QWORD PTR [rsp+112]
+ adc r13, QWORD PTR [rsp+120]
+ adc r15, 0
+ shld r15, r13, 1
+ and r13, r14
+ mov rax, 19
+ mul r15
+ add rcx, rax
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ mov QWORD PTR [rsp+96], rcx
+ mov QWORD PTR [rsp+104], r11
+ mov QWORD PTR [rsp+112], r12
+ mov QWORD PTR [rsp+120], r13
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp+32]
+ mul QWORD PTR [r8]
+ mov rcx, rax
+ mov r11, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+40]
+ mul QWORD PTR [r8]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp+32]
+ mul QWORD PTR [r8+8]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+48]
+ mul QWORD PTR [r8]
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+40]
+ mul QWORD PTR [r8+8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp+32]
+ mul QWORD PTR [r8+16]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+56]
+ mul QWORD PTR [r8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+48]
+ mul QWORD PTR [r8+8]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+40]
+ mul QWORD PTR [r8+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp+32]
+ mul QWORD PTR [r8+24]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+56]
+ mul QWORD PTR [r8+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+48]
+ mul QWORD PTR [r8+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+40]
+ mul QWORD PTR [r8+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+56]
+ mul QWORD PTR [r8+16]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+48]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+56]
+ mul QWORD PTR [r8+24]
+ add rdi, rax
+ adc rsi, rdx
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rsp+32], rcx
+ mov QWORD PTR [rsp+40], r11
+ mov QWORD PTR [rsp+48], r12
+ mov QWORD PTR [rsp+56], r13
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+128]
+ mov rcx, rax
+ mov r11, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+128]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+136]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+128]
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+136]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+144]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+128]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+136]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+144]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+152]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+136]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+144]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+152]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+144]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+152]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+152]
+ add rdi, rax
+ adc rsi, rdx
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], r11
+ mov QWORD PTR [rsp+16], r12
+ mov QWORD PTR [rsp+24], r13
+ mov r11, QWORD PTR [rsp+160]
+ dec r11
+ cmp r11, 3
+ jge L_curve25519_x64_bits
+ mov QWORD PTR [rsp+160], 2
+ neg rbx
+ ; Conditional Swap
+ mov rcx, QWORD PTR [r9]
+ mov r11, QWORD PTR [r9+8]
+ mov r12, QWORD PTR [r9+16]
+ mov r13, QWORD PTR [r9+24]
+ mov r14, QWORD PTR [rsp]
+ mov r15, QWORD PTR [rsp+8]
+ mov rdi, QWORD PTR [rsp+16]
+ mov rsi, QWORD PTR [rsp+24]
+ xor rcx, QWORD PTR [rsp+64]
+ xor r11, QWORD PTR [rsp+72]
+ xor r12, QWORD PTR [rsp+80]
+ xor r13, QWORD PTR [rsp+88]
+ xor r14, QWORD PTR [rsp+32]
+ xor r15, QWORD PTR [rsp+40]
+ xor rdi, QWORD PTR [rsp+48]
+ xor rsi, QWORD PTR [rsp+56]
+ and rcx, rbx
+ and r11, rbx
+ and r12, rbx
+ and r13, rbx
+ and r14, rbx
+ and r15, rbx
+ and rdi, rbx
+ and rsi, rbx
+ xor QWORD PTR [r9], rcx
+ xor QWORD PTR [r9+8], r11
+ xor QWORD PTR [r9+16], r12
+ xor QWORD PTR [r9+24], r13
+ xor QWORD PTR [rsp], r14
+ xor QWORD PTR [rsp+8], r15
+ xor QWORD PTR [rsp+16], rdi
+ xor QWORD PTR [rsp+24], rsi
+ xor QWORD PTR [rsp+64], rcx
+ xor QWORD PTR [rsp+72], r11
+ xor QWORD PTR [rsp+80], r12
+ xor QWORD PTR [rsp+88], r13
+ xor QWORD PTR [rsp+32], r14
+ xor QWORD PTR [rsp+40], r15
+ xor QWORD PTR [rsp+48], rdi
+ xor QWORD PTR [rsp+56], rsi
+L_curve25519_x64_3:
+ ; Add-Sub
+ ; Add
+ mov rcx, QWORD PTR [r9]
+ mov r11, QWORD PTR [r9+8]
+ mov r12, QWORD PTR [r9+16]
+ mov r13, QWORD PTR [r9+24]
+ mov r14, rcx
+ add rcx, QWORD PTR [rsp]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+8]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+16]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+24]
+ mov rbp, 0
+ adc rbp, 0
+ shld rbp, r13, 1
+ imul rbp, 19
+ btr r13, 63
+ ; Sub modulus (if overflow)
+ add rcx, rbp
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ ; Sub
+ sub r14, QWORD PTR [rsp]
+ sbb r15, QWORD PTR [rsp+8]
+ sbb rdi, QWORD PTR [rsp+16]
+ sbb rsi, QWORD PTR [rsp+24]
+ sbb rbp, rbp
+ shld rbp, rsi, 1
+ imul rbp, -19
+ btr rsi, 63
+ ; Add modulus (if underflow)
+ sub r14, rbp
+ sbb r15, 0
+ sbb rdi, 0
+ sbb rsi, 0
+ mov QWORD PTR [r9], rcx
+ mov QWORD PTR [r9+8], r11
+ mov QWORD PTR [r9+16], r12
+ mov QWORD PTR [r9+24], r13
+ mov QWORD PTR [rsp+128], r14
+ mov QWORD PTR [rsp+136], r15
+ mov QWORD PTR [rsp+144], rdi
+ mov QWORD PTR [rsp+152], rsi
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+136]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+144]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [rsp+128]
+ mul QWORD PTR [rsp+152]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+144]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [rsp+136]
+ mul QWORD PTR [rsp+152]
+ add r14, rax
+ adc r15, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [rsp+144]
+ mul QWORD PTR [rsp+152]
+ xor rdi, rdi
+ add r15, rax
+ adc rdi, rdx
+ ; Double
+ xor rsi, rsi
+ add r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [rsp+128]
+ mul rax
+ mov rcx, rax
+ mov rbp, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [rsp+136]
+ mul rax
+ add r11, rbp
+ adc r12, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [rsp+144]
+ mul rax
+ add r13, rbp
+ adc r14, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [rsp+152]
+ mul rax
+ add rdi, rax
+ adc rsi, rdx
+ add r15, rbp
+ adc rdi, 0
+ adc rsi, 0
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rsp+96], rcx
+ mov QWORD PTR [rsp+104], r11
+ mov QWORD PTR [rsp+112], r12
+ mov QWORD PTR [rsp+120], r13
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+8]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+16]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+24]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r9+16]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r9+24]
+ add r14, rax
+ adc r15, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r9+24]
+ xor rdi, rdi
+ add r15, rax
+ adc rdi, rdx
+ ; Double
+ xor rsi, rsi
+ add r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r9]
+ mul rax
+ mov rcx, rax
+ mov rbp, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r9+8]
+ mul rax
+ add r11, rbp
+ adc r12, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r9+16]
+ mul rax
+ add r13, rbp
+ adc r14, rax
+ adc rdx, 0
+ mov rbp, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r9+24]
+ mul rax
+ add rdi, rax
+ adc rsi, rdx
+ add r15, rbp
+ adc rdi, 0
+ adc rsi, 0
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rsp+128], rcx
+ mov QWORD PTR [rsp+136], r11
+ mov QWORD PTR [rsp+144], r12
+ mov QWORD PTR [rsp+152], r13
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+128]
+ mov rcx, rax
+ mov r11, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+128]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+136]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+128]
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+136]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+144]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+128]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+136]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+144]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+152]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+136]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+144]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+152]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+144]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+152]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+152]
+ add rdi, rax
+ adc rsi, rdx
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [r9], rcx
+ mov QWORD PTR [r9+8], r11
+ mov QWORD PTR [r9+16], r12
+ mov QWORD PTR [r9+24], r13
+ ; Sub
+ mov rcx, QWORD PTR [rsp+128]
+ mov r11, QWORD PTR [rsp+136]
+ mov r12, QWORD PTR [rsp+144]
+ mov r13, QWORD PTR [rsp+152]
+ sub rcx, QWORD PTR [rsp+96]
+ sbb r11, QWORD PTR [rsp+104]
+ sbb r12, QWORD PTR [rsp+112]
+ sbb r13, QWORD PTR [rsp+120]
+ sbb rbp, rbp
+ shld rbp, r13, 1
+ imul rbp, -19
+ btr r13, 63
+ ; Add modulus (if underflow)
+ sub rcx, rbp
+ sbb r11, 0
+ sbb r12, 0
+ sbb r13, 0
+ mov QWORD PTR [rsp+128], rcx
+ mov QWORD PTR [rsp+136], r11
+ mov QWORD PTR [rsp+144], r12
+ mov QWORD PTR [rsp+152], r13
+ ; Multiply by 121666
+ mov rax, 121666
+ mul QWORD PTR [rsp+128]
+ xor r12, r12
+ mov rcx, rax
+ mov r11, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+136]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+144]
+ xor r15, r15
+ add r12, rax
+ adc r13, rdx
+ mov rax, 121666
+ mul QWORD PTR [rsp+152]
+ mov r14, 9223372036854775807
+ add r13, rax
+ adc r15, rdx
+ add rcx, QWORD PTR [rsp+96]
+ adc r11, QWORD PTR [rsp+104]
+ adc r12, QWORD PTR [rsp+112]
+ adc r13, QWORD PTR [rsp+120]
+ adc r15, 0
+ shld r15, r13, 1
+ and r13, r14
+ mov rax, 19
+ mul r15
+ add rcx, rax
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ mov QWORD PTR [rsp+96], rcx
+ mov QWORD PTR [rsp+104], r11
+ mov QWORD PTR [rsp+112], r12
+ mov QWORD PTR [rsp+120], r13
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+128]
+ mov rcx, rax
+ mov r11, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+128]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+136]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+128]
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+136]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+144]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+128]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+136]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+144]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp+96]
+ mul QWORD PTR [rsp+152]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+136]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+144]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+104]
+ mul QWORD PTR [rsp+152]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+144]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+112]
+ mul QWORD PTR [rsp+152]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+120]
+ mul QWORD PTR [rsp+152]
+ add rdi, rax
+ adc rsi, rdx
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], r11
+ mov QWORD PTR [rsp+16], r12
+ mov QWORD PTR [rsp+24], r13
+ dec QWORD PTR [rsp+160]
+ jge L_curve25519_x64_3
+ ; Invert
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ mov rdx, rsp
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 4
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 9
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+128]
+ mov r8, 19
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+128]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 9
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 49
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+128]
+ mov r8, 99
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+128]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 49
+ call fe_sq_n_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 4
+ call fe_sq_n_x64
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64
+ mov r9, QWORD PTR [rsp+168]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rsp]
+ mul QWORD PTR [r9]
+ mov rcx, rax
+ mov r11, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rsp+8]
+ mul QWORD PTR [r9]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rsp]
+ mul QWORD PTR [r9+8]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rsp+16]
+ mul QWORD PTR [r9]
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rsp+8]
+ mul QWORD PTR [r9+8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rsp]
+ mul QWORD PTR [r9+16]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rsp+24]
+ mul QWORD PTR [r9]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rsp+16]
+ mul QWORD PTR [r9+8]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rsp+8]
+ mul QWORD PTR [r9+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rsp]
+ mul QWORD PTR [r9+24]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rsp+24]
+ mul QWORD PTR [r9+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rsp+16]
+ mul QWORD PTR [r9+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rsp+8]
+ mul QWORD PTR [r9+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rsp+24]
+ mul QWORD PTR [r9+16]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rsp+16]
+ mul QWORD PTR [r9+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rsp+24]
+ mul QWORD PTR [r9+24]
+ add rdi, rax
+ adc rsi, rdx
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbp, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbp
+ mov rbp, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add rcx, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add rcx, rbp
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ mov rbp, 9223372036854775807
+ mov rax, r13
+ sar rax, 63
+ and rax, 19
+ and r13, rbp
+ add rcx, rax
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ mov rax, 9223372036854775807
+ mov rdx, rcx
+ add rdx, 19
+ mov rdx, r11
+ adc rdx, 0
+ mov rdx, r12
+ adc rdx, 0
+ mov rdx, r13
+ adc rdx, 0
+ sar rdx, 63
+ and rdx, 19
+ and r13, rax
+ add rcx, rdx
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ and r13, rax
+ ; Store
+ mov QWORD PTR [r9], rcx
+ mov QWORD PTR [r9+8], r11
+ mov QWORD PTR [r9+16], r12
+ mov QWORD PTR [r9+24], r13
+ xor rax, rax
+ add rsp, 176
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+curve25519_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_pow22523_x64 PROC ; in: rcx = result pointer, rdx = input pointer (z below); runs a fixed chain of field squarings and multiplications
+ sub rsp, 112 ; frame: t0 = [rsp], t1 = [rsp+32], t2 = [rsp+64] (32 bytes each), saved rcx at [rsp+96], saved rdx at [rsp+104]
+ ; pow22523: exponent notes assume fe_sq_x64 squares once, fe_sq_n_x64 squares r8 times, fe_mul_x64 multiplies (helpers not visible here - TODO confirm)
+ mov QWORD PTR [rsp+96], rcx ; NOTE(review): no shadow space is reserved and rsp keeps its entry alignment for the calls below - confirm the callees need neither
+ mov QWORD PTR [rsp+104], rdx
+ mov rcx, rsp
+ mov rdx, QWORD PTR [rsp+104]
+ call fe_sq_x64 ; t0 = z^2
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_x64 ; t1 = z^4
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64 ; t1 = z^8
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, QWORD PTR [rsp+104]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64 ; t1 = z^9
+ mov rcx, rsp
+ mov rdx, rsp
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64 ; t0 = z^11
+ mov rcx, rsp
+ mov rdx, rsp
+ call fe_sq_x64 ; t0 = z^22
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_x64 ; t0 = z^31 = z^(2^5 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 4
+ call fe_sq_n_x64 ; t1 = t0 squared 5 times in total
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_x64 ; t0 = z^(2^10 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 9
+ call fe_sq_n_x64 ; t1 = t0 squared 10 times in total
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_x64 ; t1 = z^(2^20 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 19
+ call fe_sq_n_x64 ; t2 = t1 squared 20 times in total
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64 ; t1 = z^(2^40 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 9
+ call fe_sq_n_x64 ; t1 squared 10 times in total
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_x64 ; t0 = z^(2^50 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 49
+ call fe_sq_n_x64 ; t1 = t0 squared 50 times in total
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_x64 ; t1 = z^(2^100 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 99
+ call fe_sq_n_x64 ; t2 = t1 squared 100 times in total
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64 ; t1 = z^(2^200 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 49
+ call fe_sq_n_x64 ; t1 squared 50 times in total
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_x64 ; t0 = z^(2^250 - 1)
+ mov rcx, rsp
+ mov rdx, rsp
+ call fe_sq_x64
+ mov rcx, rsp
+ mov rdx, rsp
+ call fe_sq_x64 ; t0 = z^(2^252 - 4)
+ mov rcx, QWORD PTR [rsp+96]
+ mov rdx, rsp
+ mov r8, QWORD PTR [rsp+104]
+ call fe_mul_x64 ; result = t0 * z = z^(2^252 - 3)
+ mov rdx, QWORD PTR [rsp+104] ; reload the original argument registers before returning
+ mov rcx, QWORD PTR [rsp+96]
+ add rsp, 112
+ ret
+fe_pow22523_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_p1p1_to_p2_x64 PROC ; in: rcx = r (output), rdx = p (input); writes three 32-byte products of p's elements to r+0, r+64 and r+32
+ push r12 ; save the non-volatile registers this routine overwrites
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov r8, rdx ; free rdx: every mul below overwrites rdx:rax
+ sub rsp, 16
+ mov QWORD PTR [rsp], rcx ; spilled pointers; not read back in this routine
+ mov QWORD PTR [rsp+8], r8
+ mov r9, r8
+ add r9, 96 ; r8 = p, r9 = p+96, rcx = r
+ ; Multiply: [rcx] = [r8] * [r9] as 4x64-bit limbs (A = [r8], B = [r9]), folded modulo 2^255 - 19
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8]
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+8]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+8]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+16]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+24]
+ add rsi, rax
+ adc rbx, rdx
+ mov rax, 38 ; reduce: 2^256 mod (2^255 - 19) = 38, so product limb 7 * 38 folds into limb 3
+ mul rbx
+ add r14, rax
+ adc rdx, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shld rdx, r14, 1 ; rdx = bits 255 and above
+ imul rdx, rdx, 19 ; 2^255 mod (2^255 - 19) = 19
+ and r14, r10 ; limb 3 keeps bits 192..254
+ mov r10, rdx
+ mov rax, 38
+ mul r15 ; product limbs 4, 5, 6 times 38 fold into limbs 0, 1, 2
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ adc rsi, rdx
+ add r11, r10 ; add the folded terms and their carries into limbs 0..3
+ adc r12, r15
+ adc r13, rdi
+ adc r14, rsi
+ ; Store the four result limbs at [rcx]
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ add r8, 64 ; r8 = p+64, r9 still p+96
+ add rcx, 64 ; rcx = r+64
+ ; Multiply: [rcx] = [r8] * [r9] as 4x64-bit limbs (A = [r8], B = [r9]), folded modulo 2^255 - 19
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8]
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+8]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+8]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+16]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+24]
+ add rsi, rax
+ adc rbx, rdx
+ mov rax, 38 ; reduce: 2^256 mod (2^255 - 19) = 38, so product limb 7 * 38 folds into limb 3
+ mul rbx
+ add r14, rax
+ adc rdx, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shld rdx, r14, 1 ; rdx = bits 255 and above
+ imul rdx, rdx, 19 ; 2^255 mod (2^255 - 19) = 19
+ and r14, r10 ; limb 3 keeps bits 192..254
+ mov r10, rdx
+ mov rax, 38
+ mul r15 ; product limbs 4, 5, 6 times 38 fold into limbs 0, 1, 2
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ adc rsi, rdx
+ add r11, r10 ; add the folded terms and their carries into limbs 0..3
+ adc r12, r15
+ adc r13, rdi
+ adc r14, rsi
+ ; Store the four result limbs at [rcx]
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ mov r9, r8
+ sub r9, 32 ; r9 = p+32, r8 still p+64
+ sub rcx, 32 ; rcx = r+32
+ ; Multiply: [rcx] = [r8] * [r9] as 4x64-bit limbs (A = [r8], B = [r9]), folded modulo 2^255 - 19
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8]
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+8]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+8]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+16]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+24]
+ add rsi, rax
+ adc rbx, rdx
+ mov rax, 38 ; reduce: 2^256 mod (2^255 - 19) = 38, so product limb 7 * 38 folds into limb 3
+ mul rbx
+ add r14, rax
+ adc rdx, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shld rdx, r14, 1 ; rdx = bits 255 and above
+ imul rdx, rdx, 19 ; 2^255 mod (2^255 - 19) = 19
+ and r14, r10 ; limb 3 keeps bits 192..254
+ mov r10, rdx
+ mov rax, 38
+ mul r15 ; product limbs 4, 5, 6 times 38 fold into limbs 0, 1, 2
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ adc rsi, rdx
+ add r11, r10 ; add the folded terms and their carries into limbs 0..3
+ adc r12, r15
+ adc r13, rdi
+ adc r14, rsi
+ ; Store the four result limbs at [rcx]
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ add rsp, 16 ; drop the spill slots, then restore registers in reverse push order
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_p1p1_to_p2_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_p1p1_to_p3_x64 PROC ; in: rcx = r (output), rdx = p (input); writes four 32-byte products of p's elements to r+0, r+96, r+32 and r+64
+ push r12 ; save the non-volatile registers this routine overwrites
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov r8, rdx ; free rdx: every mul below overwrites rdx:rax
+ sub rsp, 16
+ mov QWORD PTR [rsp], rcx ; spilled pointers; not read back in this routine
+ mov QWORD PTR [rsp+8], r8
+ mov r9, r8
+ add r9, 96 ; r8 = p, r9 = p+96, rcx = r
+ ; Multiply: [rcx] = [r8] * [r9] as 4x64-bit limbs (A = [r8], B = [r9]), folded modulo 2^255 - 19
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8]
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+8]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+8]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+16]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+24]
+ add rsi, rax
+ adc rbx, rdx
+ mov rax, 38 ; reduce: 2^256 mod (2^255 - 19) = 38, so product limb 7 * 38 folds into limb 3
+ mul rbx
+ add r14, rax
+ adc rdx, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shld rdx, r14, 1 ; rdx = bits 255 and above
+ imul rdx, rdx, 19 ; 2^255 mod (2^255 - 19) = 19
+ and r14, r10 ; limb 3 keeps bits 192..254
+ mov r10, rdx
+ mov rax, 38
+ mul r15 ; product limbs 4, 5, 6 times 38 fold into limbs 0, 1, 2
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ adc rsi, rdx
+ add r11, r10 ; add the folded terms and their carries into limbs 0..3
+ adc r12, r15
+ adc r13, rdi
+ adc r14, rsi
+ ; Store the four result limbs at [rcx]
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ mov r9, r8
+ add r9, 32 ; r9 = p+32, r8 still p
+ add rcx, 96 ; rcx = r+96
+ ; Multiply: [rcx] = [r8] * [r9] as 4x64-bit limbs (A = [r8], B = [r9]), folded modulo 2^255 - 19
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8]
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+8]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+8]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+16]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+24]
+ add rsi, rax
+ adc rbx, rdx
+ mov rax, 38 ; reduce: 2^256 mod (2^255 - 19) = 38, so product limb 7 * 38 folds into limb 3
+ mul rbx
+ add r14, rax
+ adc rdx, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shld rdx, r14, 1 ; rdx = bits 255 and above
+ imul rdx, rdx, 19 ; 2^255 mod (2^255 - 19) = 19
+ and r14, r10 ; limb 3 keeps bits 192..254
+ mov r10, rdx
+ mov rax, 38
+ mul r15 ; product limbs 4, 5, 6 times 38 fold into limbs 0, 1, 2
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ adc rsi, rdx
+ add r11, r10 ; add the folded terms and their carries into limbs 0..3
+ adc r12, r15
+ adc r13, rdi
+ adc r14, rsi
+ ; Store the four result limbs at [rcx]
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ add r8, 64 ; r8 = p+64, r9 still p+32
+ sub rcx, 64 ; rcx = r+32
+ ; Multiply: [rcx] = [r8] * [r9] as 4x64-bit limbs (A = [r8], B = [r9]), folded modulo 2^255 - 19
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8]
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+8]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+8]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+16]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+24]
+ add rsi, rax
+ adc rbx, rdx
+ mov rax, 38 ; reduce: 2^256 mod (2^255 - 19) = 38, so product limb 7 * 38 folds into limb 3
+ mul rbx
+ add r14, rax
+ adc rdx, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shld rdx, r14, 1 ; rdx = bits 255 and above
+ imul rdx, rdx, 19 ; 2^255 mod (2^255 - 19) = 19
+ and r14, r10 ; limb 3 keeps bits 192..254
+ mov r10, rdx
+ mov rax, 38
+ mul r15 ; product limbs 4, 5, 6 times 38 fold into limbs 0, 1, 2
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ adc rsi, rdx
+ add r11, r10 ; add the folded terms and their carries into limbs 0..3
+ adc r12, r15
+ adc r13, rdi
+ adc r14, rsi
+ ; Store the four result limbs at [rcx]
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ mov r9, r8
+ add r9, 32 ; r9 = p+96, r8 still p+64
+ add rcx, 32 ; rcx = r+64
+ ; Multiply: [rcx] = [r8] * [r9] as 4x64-bit limbs (A = [r8], B = [r9]), folded modulo 2^255 - 19
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8]
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+8]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+8]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+16]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+24]
+ add rsi, rax
+ adc rbx, rdx
+ mov rax, 38 ; reduce: 2^256 mod (2^255 - 19) = 38, so product limb 7 * 38 folds into limb 3
+ mul rbx
+ add r14, rax
+ adc rdx, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shld rdx, r14, 1 ; rdx = bits 255 and above
+ imul rdx, rdx, 19 ; 2^255 mod (2^255 - 19) = 19
+ and r14, r10 ; limb 3 keeps bits 192..254
+ mov r10, rdx
+ mov rax, 38
+ mul r15 ; product limbs 4, 5, 6 times 38 fold into limbs 0, 1, 2
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ adc rsi, rdx
+ add r11, r10 ; add the folded terms and their carries into limbs 0..3
+ adc r12, r15
+ adc r13, rdi
+ adc r14, rsi
+ ; Store the four result limbs at [rcx]
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ add rsp, 16 ; drop the spill slots, then restore registers in reverse push order
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_p1p1_to_p3_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_p2_dbl_x64 PROC ; in: rcx = r (output, four 32-byte elements at r+0/32/64/96), rdx = p (input, elements at p+0/32/64 are read)
+ push r12 ; save the non-volatile registers this routine overwrites
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov r8, rdx ; free rdx: every mul below overwrites rdx:rax
+ sub rsp, 16
+ mov QWORD PTR [rsp], rcx ; spilled r; not read back in this routine
+ mov QWORD PTR [rsp+8], r8 ; spilled p; reloaded into r9 after the Add-Sub step
+ add rcx, 64 ; rcx = r+64, r8 = p
+ ; Square: [r8]^2 as 4x64-bit limbs - cross products A[i]*A[j] (i < j) doubled, plus the diagonal A[i]^2, folded modulo 2^255 - 19
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+8]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+16]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+24]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+16]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r8+24]
+ xor rsi, rsi
+ add rdi, rax
+ adc rsi, rdx
+ ; Double the cross products (limbs 1..6); rbx catches the carry out as limb 7
+ xor rbx, rbx
+ add r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, rsi
+ adc rbx, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r8]
+ mul rax
+ mov r11, rax
+ mov r10, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r8+8]
+ mul rax
+ add r12, r10
+ adc r13, rax
+ adc rdx, 0
+ mov r10, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r8+16]
+ mul rax
+ add r14, r10
+ adc r15, rax
+ adc rdx, 0
+ mov r10, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r8+24]
+ mul rax
+ add rsi, rax
+ adc rbx, rdx
+ add rdi, r10
+ adc rsi, 0
+ adc rbx, 0
+ mov rax, 38 ; reduce: 2^256 mod (2^255 - 19) = 38, so product limb 7 * 38 folds into limb 3
+ mul rbx
+ add r14, rax
+ adc rdx, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shld rdx, r14, 1 ; rdx = bits 255 and above
+ imul rdx, rdx, 19 ; 2^255 mod (2^255 - 19) = 19
+ and r14, r10 ; limb 3 keeps bits 192..254
+ mov r10, rdx
+ mov rax, 38
+ mul r15 ; product limbs 4, 5, 6 times 38 fold into limbs 0, 1, 2
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ adc rsi, rdx
+ add r11, r10 ; add the folded terms and their carries into limbs 0..3
+ adc r12, r15
+ adc r13, rdi
+ adc r14, rsi
+ ; Store: r+64 = (element at p+0)^2
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ add r8, 32 ; r8 = p+32
+ ; Square: [r8]^2 as 4x64-bit limbs - cross products A[i]*A[j] (i < j) doubled, plus the diagonal A[i]^2, folded modulo 2^255 - 19
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+8]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+16]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+24]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+16]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r8+24]
+ xor rsi, rsi
+ add rdi, rax
+ adc rsi, rdx
+ ; Double the cross products (limbs 1..6); rbx catches the carry out as limb 7
+ xor rbx, rbx
+ add r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, rsi
+ adc rbx, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r8]
+ mul rax
+ mov r11, rax
+ mov r10, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r8+8]
+ mul rax
+ add r12, r10
+ adc r13, rax
+ adc rdx, 0
+ mov r10, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r8+16]
+ mul rax
+ add r14, r10
+ adc r15, rax
+ adc rdx, 0
+ mov r10, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r8+24]
+ mul rax
+ add rsi, rax
+ adc rbx, rdx
+ add rdi, r10
+ adc rsi, 0
+ adc rbx, 0
+ mov rax, 38 ; reduce: 2^256 mod (2^255 - 19) = 38, so product limb 7 * 38 folds into limb 3
+ mul rbx
+ add r14, rax
+ adc rdx, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shld rdx, r14, 1 ; rdx = bits 255 and above
+ imul rdx, rdx, 19 ; 2^255 mod (2^255 - 19) = 19
+ and r14, r10 ; limb 3 keeps bits 192..254
+ mov r10, rdx
+ mov rax, 38
+ mul r15 ; product limbs 4, 5, 6 times 38 fold into limbs 0, 1, 2
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ adc rsi, rdx
+ add r11, r10 ; add the folded terms and their carries into limbs 0..3
+ adc r12, r15
+ adc r13, rdi
+ adc r14, rsi
+ ; Store (nothing is written here: the square of p+32 stays in r11..r14 for the Add-Sub below)
+ mov r8, rcx ; r8 = r+64
+ sub rcx, 32 ; rcx = r+32
+ ; Add-Sub: r+32 = (p+32)^2 + [r+64] and r+64 = (p+32)^2 - [r+64], both from the same register copy
+ ; Add
+ mov r15, r11 ; keep an untouched copy of each limb for the Sub half
+ add r11, QWORD PTR [r8]
+ mov rdi, r12
+ adc r12, QWORD PTR [r8+8]
+ mov rsi, r13
+ adc r13, QWORD PTR [r8+16]
+ mov rbx, r14
+ adc r14, QWORD PTR [r8+24]
+ mov r10, 0 ; mov leaves the carry flag intact for the adc on the next line
+ adc r10, 0
+ shld r10, r14, 1 ; r10 = carry-out and bit 255 of the sum
+ imul r10, 19
+ btr r14, 63 ; clear bit 255; it is re-added as a multiple of 19 below
+ ; Sub modulus (if overflow)
+ add r11, r10
+ adc r12, 0
+ adc r13, 0
+ adc r14, 0
+ ; Sub
+ sub r15, QWORD PTR [r8]
+ sbb rdi, QWORD PTR [r8+8]
+ sbb rsi, QWORD PTR [r8+16]
+ sbb rbx, QWORD PTR [r8+24]
+ sbb r10, r10 ; r10 = 0, or -1 if the subtraction borrowed
+ shld r10, rbx, 1 ; shift bit 255 of the difference in under the borrow mask
+ imul r10, -19
+ btr rbx, 63
+ ; Add modulus (if underflow)
+ sub r15, r10
+ sbb rdi, 0
+ sbb rsi, 0
+ sbb rbx, 0
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ mov QWORD PTR [r8], r15
+ mov QWORD PTR [r8+8], rdi
+ mov QWORD PTR [r8+16], rsi
+ mov QWORD PTR [r8+24], rbx
+ mov r9, QWORD PTR [rsp+8] ; r9 = p (reloaded from the spill slot)
+ mov r8, r9
+ add r8, 32 ; r8 = p+32
+ sub rcx, 32 ; rcx = r+0
+ ; Add: r+0 = [p+32] + [p+0]
+ mov r11, QWORD PTR [r8]
+ mov r12, QWORD PTR [r8+8]
+ add r11, QWORD PTR [r9]
+ mov r13, QWORD PTR [r8+16]
+ adc r12, QWORD PTR [r9+8]
+ mov r14, QWORD PTR [r8+24]
+ adc r13, QWORD PTR [r9+16]
+ adc r14, QWORD PTR [r9+24]
+ mov r10, 0 ; mov leaves the carry flag intact for the adc on the next line
+ adc r10, 0
+ shld r10, r14, 1 ; r10 = carry-out and bit 255 of the sum
+ imul r10, 19
+ btr r14, 63
+ ; Sub modulus (if overflow)
+ add r11, r10
+ adc r12, 0
+ adc r13, 0
+ adc r14, 0
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ ; Square: [rcx]^2 (the sum just stored at r+0) - cross products doubled plus the diagonal, folded modulo 2^255 - 19
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [rcx]
+ mul QWORD PTR [rcx+8]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [rcx]
+ mul QWORD PTR [rcx+16]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [rcx]
+ mul QWORD PTR [rcx+24]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [rcx+8]
+ mul QWORD PTR [rcx+16]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [rcx+8]
+ mul QWORD PTR [rcx+24]
+ add r15, rax
+ adc rdi, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [rcx+16]
+ mul QWORD PTR [rcx+24]
+ xor rsi, rsi
+ add rdi, rax
+ adc rsi, rdx
+ ; Double the cross products (limbs 1..6); rbx catches the carry out as limb 7
+ xor rbx, rbx
+ add r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, rsi
+ adc rbx, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [rcx]
+ mul rax
+ mov r11, rax
+ mov r10, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [rcx+8]
+ mul rax
+ add r12, r10
+ adc r13, rax
+ adc rdx, 0
+ mov r10, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [rcx+16]
+ mul rax
+ add r14, r10
+ adc r15, rax
+ adc rdx, 0
+ mov r10, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [rcx+24]
+ mul rax
+ add rsi, rax
+ adc rbx, rdx
+ add rdi, r10
+ adc rsi, 0
+ adc rbx, 0
+ mov rax, 38 ; reduce: 2^256 mod (2^255 - 19) = 38, so product limb 7 * 38 folds into limb 3
+ mul rbx
+ add r14, rax
+ adc rdx, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shld rdx, r14, 1 ; rdx = bits 255 and above
+ imul rdx, rdx, 19 ; 2^255 mod (2^255 - 19) = 19
+ and r14, r10 ; limb 3 keeps bits 192..254
+ mov r10, rdx
+ mov rax, 38
+ mul r15 ; product limbs 4, 5, 6 times 38 fold into limbs 0, 1, 2
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ adc rsi, rdx
+ add r11, r10 ; add the folded terms and their carries into limbs 0..3
+ adc r12, r15
+ adc r13, rdi
+ adc r14, rsi
+ ; Store (nothing is written here: the square stays in r11..r14 for the Sub below)
+ mov r8, rcx
+ add r8, 32 ; r8 = r+32
+ ; Sub: r+0 = (square in r11..r14) - [r+32]
+ sub r11, QWORD PTR [r8]
+ sbb r12, QWORD PTR [r8+8]
+ sbb r13, QWORD PTR [r8+16]
+ sbb r14, QWORD PTR [r8+24]
+ sbb r10, r10 ; r10 = 0, or -1 if the subtraction borrowed
+ shld r10, r14, 1 ; shift bit 255 of the difference in under the borrow mask
+ imul r10, -19
+ btr r14, 63
+ ; Add modulus (if underflow)
+ sub r11, r10
+ sbb r12, 0
+ sbb r13, 0
+ sbb r14, 0
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ add r9, 64 ; r9 = p+64
+ ; Square * 2: 2 * [r9]^2 - square and fold as above, then double the folded result
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+8]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+16]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+24]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r9+16]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r9+24]
+ add r15, rax
+ adc rdi, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r9+24]
+ xor rsi, rsi
+ add rdi, rax
+ adc rsi, rdx
+ ; Double the cross products (limbs 1..6); rbx catches the carry out as limb 7
+ xor rbx, rbx
+ add r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, rsi
+ adc rbx, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r9]
+ mul rax
+ mov r11, rax
+ mov r10, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r9+8]
+ mul rax
+ add r12, r10
+ adc r13, rax
+ adc rdx, 0
+ mov r10, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r9+16]
+ mul rax
+ add r14, r10
+ adc r15, rax
+ adc rdx, 0
+ mov r10, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r9+24]
+ mul rax
+ add rsi, rax
+ adc rbx, rdx
+ add rdi, r10
+ adc rsi, 0
+ adc rbx, 0
+ mov rax, 38 ; reduce: 2^256 mod (2^255 - 19) = 38, so product limb 7 * 38 folds into limb 3
+ mul rbx
+ add r14, rax
+ adc rdx, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shld rdx, r14, 1 ; rdx = bits 255 and above
+ imul rdx, rdx, 19 ; 2^255 mod (2^255 - 19) = 19
+ and r14, r10 ; limb 3 keeps bits 192..254
+ mov r10, rdx
+ mov rax, 38
+ mul r15 ; product limbs 4, 5, 6 times 38 fold into limbs 0, 1, 2
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ adc rsi, rdx
+ add r11, r10 ; add the folded terms and their carries into limbs 0..3
+ adc r12, r15
+ adc r13, rdi
+ adc r14, rsi
+ mov rax, r14 ; keep the old top limb: its two high bits leave the 255-bit range once doubled
+ shld r14, r13, 1 ; shift the 4-limb value left by one bit (times 2)
+ shld r13, r12, 1
+ shld r12, r11, 1
+ shl r11, 1
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF
+ shr rax, 62 ; rax = the two bits that moved to positions 255 and 256
+ and r14, r10
+ imul rax, rax, 19 ; fold them back in as multiples of 19
+ add r11, rax
+ adc r12, 0
+ adc r13, 0
+ adc r14, 0
+ ; Store (nothing is written here: the doubled square stays in r11..r14 for the Sub below)
+ mov r8, rcx
+ add r8, 64 ; r8 = r+64
+ add rcx, 96 ; rcx = r+96
+ ; Sub: r+96 = (doubled square in r11..r14) - [r+64]
+ sub r11, QWORD PTR [r8]
+ sbb r12, QWORD PTR [r8+8]
+ sbb r13, QWORD PTR [r8+16]
+ sbb r14, QWORD PTR [r8+24]
+ sbb r10, r10 ; r10 = 0, or -1 if the subtraction borrowed
+ shld r10, r14, 1 ; shift bit 255 of the difference in under the borrow mask
+ imul r10, -19
+ btr r14, 63
+ ; Add modulus (if underflow)
+ sub r11, r10
+ sbb r12, 0
+ sbb r13, 0
+ sbb r14, 0
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ add rsp, 16 ; drop the spill slots, then restore registers in reverse push order
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_p2_dbl_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_madd_x64 PROC
+ ; Combines a point p with a second operand q and writes four field
+ ; elements to r. Every field element is four 64-bit limbs (32 bytes).
+ ; In: rcx = r (output), rdx = p, r8 = q - the first three integer
+ ; argument registers of the Microsoft x64 calling convention.
+ ; r and p are accessed at byte offsets 0/32/64/96, q at 0/32/64.
+ ; NOTE(review): presumably these are X/Y/Z/T of an extended point and
+ ; yplusx/yminusx/xy2d of a precomputed point - confirm against the caller.
+ ; The 19/38 folding constants are consistent with arithmetic modulo
+ ; 2^255 - 19.
+ ; Result, written in terms of byte offsets:
+ ;   r[0]  = (p[32]+p[0])*q[0] - (p[32]-p[0])*q[32]
+ ;   r[32] = (p[32]+p[0])*q[0] + (p[32]-p[0])*q[32]
+ ;   r[64] = 2*p[64] + p[96]*q[64]
+ ;   r[96] = 2*p[64] - p[96]*q[64]
+ ; Preserve the registers that are used as scratch below.
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ ; Rearrange arguments: r9 = third argument (q), r8 = second argument (p).
+ mov r9, r8
+ mov r8, rdx
+ ; Reserve 24 bytes and spill r, p, q; only q ([rsp+16]) is reloaded later.
+ sub rsp, 24
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], r8
+ mov QWORD PTR [rsp+16], r9
+ ; r10 = p, r9 = p + 32, r8 = r + 32
+ mov r10, r8
+ mov r9, r8
+ add r9, 32
+ mov r8, rcx
+ add r8, 32
+ ; Add-Sub
+ ; r[0] = p[32] + p[0] and r[32] = p[32] - p[0]
+ ; Add
+ mov r12, QWORD PTR [r9]
+ mov r13, QWORD PTR [r9+8]
+ mov r14, QWORD PTR [r9+16]
+ mov r15, QWORD PTR [r9+24]
+ mov rdi, r12
+ add r12, QWORD PTR [r10]
+ mov rsi, r13
+ adc r13, QWORD PTR [r10+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r10+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r10+24]
+ ; r11 = 19 * (carry-out : bit 255 of the sum); bit 255 is then cleared.
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r10]
+ sbb rsi, QWORD PTR [r10+8]
+ sbb rbx, QWORD PTR [r10+16]
+ sbb rbp, QWORD PTR [r10+24]
+ ; r11 = signed multiple of 19 that compensates for the borrow and for
+ ; clearing bit 255 of the difference.
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [r8], rdi
+ mov QWORD PTR [r8+8], rsi
+ mov QWORD PTR [r8+16], rbx
+ mov QWORD PTR [r8+24], rbp
+ ; r9 = q + 32 (q reloaded from its stack slot)
+ mov r9, QWORD PTR [rsp+16]
+ add r9, 32
+ ; Multiply
+ ; r[32] = q[32] * r[32]
+ ; Product limbs, low to high: r12, r13, r14, r15, rdi, rsi, rbx, rbp.
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r8+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r8+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r8+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce: the high limbs (rdi, rsi, rbx, rbp) are folded into the low
+ ; four with factor 38; everything at or above bit 255 is multiplied by
+ ; 19 (held in r11) and added to limb 0, and r15 is masked to 63 bits.
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ mov QWORD PTR [r8], r12
+ mov QWORD PTR [r8+8], r13
+ mov QWORD PTR [r8+16], r14
+ mov QWORD PTR [r8+24], r15
+ ; r10 = p + 96, r9 = q + 64, rcx = r + 96
+ add r10, 96
+ add r9, 32
+ add rcx, 96
+ ; Multiply
+ ; r[96] = q[64] * p[96]
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce: fold the high four limbs with factor 38 and the bits at or
+ ; above 255 with factor 19, as in the first multiply.
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ ; r9 = q, rcx = r
+ sub r9, 64
+ sub rcx, 96
+ ; Multiply
+ ; r15:r14:r13:r12 = q[0] * r[0] (kept in registers)
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce: fold the high four limbs with factor 38 and the bits at or
+ ; above 255 with factor 19, as in the first multiply.
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ ; (nothing is written here: the product stays in r12-r15 and feeds the
+ ; Add-Sub below)
+ ; Add-Sub
+ ; r[32] = product + r[32] and r[0] = product - r[32]
+ ; Add
+ mov rdi, r12
+ add r12, QWORD PTR [r8]
+ mov rsi, r13
+ adc r13, QWORD PTR [r8+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r8+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r8+24]
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r8]
+ sbb rsi, QWORD PTR [r8+8]
+ sbb rbx, QWORD PTR [r8+16]
+ sbb rbp, QWORD PTR [r8+24]
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [r8], r12
+ mov QWORD PTR [r8+8], r13
+ mov QWORD PTR [r8+16], r14
+ mov QWORD PTR [r8+24], r15
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+8], rsi
+ mov QWORD PTR [rcx+16], rbx
+ mov QWORD PTR [rcx+24], rbp
+ ; r10 = p + 64
+ sub r10, 32
+ ; Double
+ ; r15:r14:r13:r12 = 2 * p[64]
+ mov r12, QWORD PTR [r10]
+ mov r13, QWORD PTR [r10+8]
+ add r12, r12
+ mov r14, QWORD PTR [r10+16]
+ adc r13, r13
+ mov r15, QWORD PTR [r10+24]
+ adc r14, r14
+ adc r15, r15
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; r8 = r + 96, rcx = r + 64
+ mov r8, rcx
+ add r8, 96
+ add rcx, 64
+ ; Add-Sub
+ ; r[64] = 2*p[64] + r[96] and r[96] = 2*p[64] - r[96]
+ ; Add
+ mov rdi, r12
+ add r12, QWORD PTR [r8]
+ mov rsi, r13
+ adc r13, QWORD PTR [r8+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r8+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r8+24]
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r8]
+ sbb rsi, QWORD PTR [r8+8]
+ sbb rbx, QWORD PTR [r8+16]
+ sbb rbp, QWORD PTR [r8+24]
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [r8], rdi
+ mov QWORD PTR [r8+8], rsi
+ mov QWORD PTR [r8+16], rbx
+ mov QWORD PTR [r8+24], rbp
+ ; Release the 24-byte local area and restore registers in reverse order.
+ add rsp, 24
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_madd_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_msub_x64 PROC
+ ; Counterpart of ge_madd_x64 with the roles of q[0] and q[32] swapped and
+ ; the sign of the p[96]*q[64] term flipped. Every field element is four
+ ; 64-bit limbs (32 bytes).
+ ; In: rcx = r (output), rdx = p, r8 = q - the first three integer
+ ; argument registers of the Microsoft x64 calling convention.
+ ; r and p are accessed at byte offsets 0/32/64/96, q at 0/32/64.
+ ; NOTE(review): presumably these are X/Y/Z/T of an extended point and
+ ; yplusx/yminusx/xy2d of a precomputed point - confirm against the caller.
+ ; The 19/38 folding constants are consistent with arithmetic modulo
+ ; 2^255 - 19.
+ ; Result, written in terms of byte offsets:
+ ;   r[0]  = (p[32]+p[0])*q[32] - (p[32]-p[0])*q[0]
+ ;   r[32] = (p[32]+p[0])*q[32] + (p[32]-p[0])*q[0]
+ ;   r[64] = 2*p[64] - p[96]*q[64]
+ ;   r[96] = 2*p[64] + p[96]*q[64]
+ ; Preserve the registers that are used as scratch below.
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ ; Rearrange arguments: r9 = third argument (q), r8 = second argument (p).
+ mov r9, r8
+ mov r8, rdx
+ ; Reserve 24 bytes and spill r, p, q; only q ([rsp+16]) is reloaded later.
+ sub rsp, 24
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], r8
+ mov QWORD PTR [rsp+16], r9
+ ; r10 = p, r9 = p + 32, r8 = r + 32
+ mov r10, r8
+ mov r9, r8
+ add r9, 32
+ mov r8, rcx
+ add r8, 32
+ ; Add-Sub
+ ; r[0] = p[32] + p[0] and r[32] = p[32] - p[0]
+ ; Add
+ mov r12, QWORD PTR [r9]
+ mov r13, QWORD PTR [r9+8]
+ mov r14, QWORD PTR [r9+16]
+ mov r15, QWORD PTR [r9+24]
+ mov rdi, r12
+ add r12, QWORD PTR [r10]
+ mov rsi, r13
+ adc r13, QWORD PTR [r10+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r10+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r10+24]
+ ; r11 = 19 * (carry-out : bit 255 of the sum); bit 255 is then cleared.
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r10]
+ sbb rsi, QWORD PTR [r10+8]
+ sbb rbx, QWORD PTR [r10+16]
+ sbb rbp, QWORD PTR [r10+24]
+ ; r11 = signed multiple of 19 that compensates for the borrow and for
+ ; clearing bit 255 of the difference.
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [r8], rdi
+ mov QWORD PTR [r8+8], rsi
+ mov QWORD PTR [r8+16], rbx
+ mov QWORD PTR [r8+24], rbp
+ ; r9 = q (reloaded from its stack slot), rcx = r + 32
+ mov r9, QWORD PTR [rsp+16]
+ add rcx, 32
+ ; Multiply
+ ; r[32] = q[0] * r[32]
+ ; Product limbs, low to high: r12, r13, r14, r15, rdi, rsi, rbx, rbp.
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce: the high limbs (rdi, rsi, rbx, rbp) are folded into the low
+ ; four with factor 38; everything at or above bit 255 is multiplied by
+ ; 19 (held in r11) and added to limb 0, and r15 is masked to 63 bits.
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ ; r10 = p + 96, r9 = q + 64, rcx = r + 96
+ add r10, 96
+ add r9, 64
+ add rcx, 64
+ ; Multiply
+ ; r[96] = q[64] * p[96]
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce: fold the high four limbs with factor 38 and the bits at or
+ ; above 255 with factor 19, as in the first multiply.
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ ; r9 = q + 32, rcx = r
+ sub r9, 32
+ sub rcx, 96
+ ; Multiply
+ ; r15:r14:r13:r12 = q[32] * r[0] (kept in registers)
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce: fold the high four limbs with factor 38 and the bits at or
+ ; above 255 with factor 19, as in the first multiply.
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ ; (nothing is written here: the product stays in r12-r15 and feeds the
+ ; Add-Sub below)
+ ; Add-Sub
+ ; r[32] = product + r[32] and r[0] = product - r[32]
+ ; Add
+ mov rdi, r12
+ add r12, QWORD PTR [r8]
+ mov rsi, r13
+ adc r13, QWORD PTR [r8+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r8+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r8+24]
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r8]
+ sbb rsi, QWORD PTR [r8+8]
+ sbb rbx, QWORD PTR [r8+16]
+ sbb rbp, QWORD PTR [r8+24]
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [r8], r12
+ mov QWORD PTR [r8+8], r13
+ mov QWORD PTR [r8+16], r14
+ mov QWORD PTR [r8+24], r15
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+8], rsi
+ mov QWORD PTR [rcx+16], rbx
+ mov QWORD PTR [rcx+24], rbp
+ ; r10 = p + 64, rcx = r + 64
+ sub r10, 32
+ add rcx, 64
+ ; Double
+ ; r15:r14:r13:r12 = 2 * p[64]
+ mov r12, QWORD PTR [r10]
+ mov r13, QWORD PTR [r10+8]
+ add r12, r12
+ mov r14, QWORD PTR [r10+16]
+ adc r13, r13
+ mov r15, QWORD PTR [r10+24]
+ adc r14, r14
+ adc r15, r15
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; r8 = r + 96
+ mov r8, rcx
+ add r8, 32
+ ; Add-Sub
+ ; r[96] = 2*p[64] + r[96] and r[64] = 2*p[64] - r[96]
+ ; Add
+ mov rdi, r12
+ add r12, QWORD PTR [r8]
+ mov rsi, r13
+ adc r13, QWORD PTR [r8+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r8+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r8+24]
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r8]
+ sbb rsi, QWORD PTR [r8+8]
+ sbb rbx, QWORD PTR [r8+16]
+ sbb rbp, QWORD PTR [r8+24]
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [r8], r12
+ mov QWORD PTR [r8+8], r13
+ mov QWORD PTR [r8+16], r14
+ mov QWORD PTR [r8+24], r15
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+8], rsi
+ mov QWORD PTR [rcx+16], rbx
+ mov QWORD PTR [rcx+24], rbp
+ ; Release the 24-byte local area and restore registers in reverse order.
+ add rsp, 24
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_msub_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_add_x64 PROC
+ ; Combines a point p with a four-element operand q and writes four field
+ ; elements to r. Every field element is four 64-bit limbs (32 bytes).
+ ; In: rcx = r (output), rdx = p, r8 = q - the first three integer
+ ; argument registers of the Microsoft x64 calling convention.
+ ; r, p and q are all accessed at byte offsets 0/32/64/96.
+ ; NOTE(review): presumably these are X/Y/Z/T of an extended point and
+ ; YplusX/YminusX/Z/T2d of a cached point - confirm against the caller.
+ ; The 19/38 folding constants are consistent with arithmetic modulo
+ ; 2^255 - 19.
+ ; Result, written in terms of byte offsets:
+ ;   r[0]  = (p[32]+p[0])*q[0] - (p[32]-p[0])*q[32]
+ ;   r[32] = (p[32]+p[0])*q[0] + (p[32]-p[0])*q[32]
+ ;   r[64] = 2*p[64]*q[64] + p[96]*q[96]
+ ;   r[96] = 2*p[64]*q[64] - p[96]*q[96]
+ ; Preserve the registers that are used as scratch below.
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ ; Rearrange arguments: r9 = third argument (q), r8 = second argument (p).
+ mov r9, r8
+ mov r8, rdx
+ ; Reserve 24 bytes and spill r, p, q; only q ([rsp+16]) is reloaded later.
+ sub rsp, 24
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], r8
+ mov QWORD PTR [rsp+16], r9
+ ; r10 = p, r9 = p + 32, r8 = r + 32
+ mov r10, r8
+ mov r9, r8
+ add r9, 32
+ mov r8, rcx
+ add r8, 32
+ ; Add-Sub
+ ; r[0] = p[32] + p[0] and r[32] = p[32] - p[0]
+ ; Add
+ mov r12, QWORD PTR [r9]
+ mov r13, QWORD PTR [r9+8]
+ mov r14, QWORD PTR [r9+16]
+ mov r15, QWORD PTR [r9+24]
+ mov rdi, r12
+ add r12, QWORD PTR [r10]
+ mov rsi, r13
+ adc r13, QWORD PTR [r10+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r10+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r10+24]
+ ; r11 = 19 * (carry-out : bit 255 of the sum); bit 255 is then cleared.
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r10]
+ sbb rsi, QWORD PTR [r10+8]
+ sbb rbx, QWORD PTR [r10+16]
+ sbb rbp, QWORD PTR [r10+24]
+ ; r11 = signed multiple of 19 that compensates for the borrow and for
+ ; clearing bit 255 of the difference.
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [r8], rdi
+ mov QWORD PTR [r8+8], rsi
+ mov QWORD PTR [r8+16], rbx
+ mov QWORD PTR [r8+24], rbp
+ ; r9 = q + 32 (q reloaded from its stack slot), rcx = r + 32
+ mov r9, QWORD PTR [rsp+16]
+ add r9, 32
+ add rcx, 32
+ ; Multiply
+ ; r[32] = q[32] * r[32]
+ ; Product limbs, low to high: r12, r13, r14, r15, rdi, rsi, rbx, rbp.
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce: the high limbs (rdi, rsi, rbx, rbp) are folded into the low
+ ; four with factor 38; everything at or above bit 255 is multiplied by
+ ; 19 (held in r11) and added to limb 0, and r15 is masked to 63 bits.
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ ; r10 = p + 96, r9 = q + 96, rcx = r + 96
+ add r10, 96
+ add r9, 64
+ add rcx, 64
+ ; Multiply
+ ; r[96] = q[96] * p[96]
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce: fold the high four limbs with factor 38 and the bits at or
+ ; above 255 with factor 19, as in the first multiply.
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ ; r9 = q, rcx = r
+ sub r9, 96
+ sub rcx, 96
+ ; Multiply
+ ; r15:r14:r13:r12 = q[0] * r[0] (kept in registers)
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce: fold the high four limbs with factor 38 and the bits at or
+ ; above 255 with factor 19, as in the first multiply.
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ ; (nothing is written here: the product stays in r12-r15 and feeds the
+ ; Add-Sub below)
+ ; Add-Sub
+ ; r[32] = product + r[32] and r[0] = product - r[32]
+ ; Add
+ mov rdi, r12
+ add r12, QWORD PTR [r8]
+ mov rsi, r13
+ adc r13, QWORD PTR [r8+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r8+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r8+24]
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r8]
+ sbb rsi, QWORD PTR [r8+8]
+ sbb rbx, QWORD PTR [r8+16]
+ sbb rbp, QWORD PTR [r8+24]
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [r8], r12
+ mov QWORD PTR [r8+8], r13
+ mov QWORD PTR [r8+16], r14
+ mov QWORD PTR [r8+24], r15
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+8], rsi
+ mov QWORD PTR [rcx+16], rbx
+ mov QWORD PTR [rcx+24], rbp
+ ; r10 = p + 64, r9 = q + 64
+ sub r10, 32
+ add r9, 64
+ ; Multiply
+ ; r15:r14:r13:r12 = q[64] * p[64] (kept in registers)
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce: fold the high four limbs with factor 38 and the bits at or
+ ; above 255 with factor 19, as in the first multiply.
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ ; (nothing is written here: the product stays in r12-r15 and is doubled
+ ; in place below)
+ ; rcx = r + 64
+ add rcx, 64
+ ; Double
+ ; r15:r14:r13:r12 = 2 * (q[64] * p[64])
+ add r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; r8 = r + 96
+ mov r8, rcx
+ add r8, 32
+ ; Add-Sub
+ ; r[64] = doubled product + r[96] and r[96] = doubled product - r[96]
+ ; Add
+ mov rdi, r12
+ add r12, QWORD PTR [r8]
+ mov rsi, r13
+ adc r13, QWORD PTR [r8+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r8+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r8+24]
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r8]
+ sbb rsi, QWORD PTR [r8+8]
+ sbb rbx, QWORD PTR [r8+16]
+ sbb rbp, QWORD PTR [r8+24]
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [r8], rdi
+ mov QWORD PTR [r8+8], rsi
+ mov QWORD PTR [r8+16], rbx
+ mov QWORD PTR [r8+24], rbp
+ ; Release the 24-byte local area and restore registers in reverse order.
+ add rsp, 24
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_add_x64 ENDP
+_TEXT ENDS
+; /* Combines the four field elements at p with the four at q and writes four
+; * field elements to r (presumably the Ed25519 point-subtraction core -
+; * confirm against the C caller).
+; *
+; * Every field element is four 64-bit little-endian words. Sums, differences
+; * and products are folded modulo 2^255 - 19 (overflow above bit 255 is
+; * multiplied by 19, the upper 256 bits of a product by 38).
+; *
+; * Windows x64 arguments (offsets below are in bytes):
+; * @param [out] rcx r: elements written at offsets 0, 32, 64 and 96.
+; * @param [in] rdx p: elements read at offsets 0, 32, 64 and 96.
+; * @param [in] r8 q: elements read at offsets 0, 32, 64 and 96.
+; *
+; * Steps, in execution order:
+; * r[0] = p[32] + p[0]
+; * r[32] = p[32] - p[0]
+; * r[32] = q[0] * r[32]
+; * r[96] = q[96] * p[96]
+; * t = q[32] * r[0]
+; * r[32] = t + r[32], r[0] = t - r[32] (both use the previous r[32])
+; * u = 2 * (q[64] * p[64])
+; * r[96] = u + r[96], r[64] = u - r[96] (both use the previous r[96])
+; */
+_TEXT SEGMENT READONLY PARA
+ge_sub_x64 PROC
+ ; Save the non-volatile registers used as scratch below
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ ; r8 = p (2nd argument), r9 = q (3rd argument)
+ mov r9, r8
+ mov r8, rdx
+ ; Keep r, p and q on the stack; q is reloaded after the first Add-Sub
+ sub rsp, 24
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], r8
+ mov QWORD PTR [rsp+16], r9
+ ; r10 = p, r9 = p + 32, r8 = r + 32
+ mov r10, r8
+ mov r9, r8
+ add r9, 32
+ mov r8, rcx
+ add r8, 32
+ ; Add-Sub
+ ; Add
+ mov r12, QWORD PTR [r9]
+ mov r13, QWORD PTR [r9+8]
+ mov r14, QWORD PTR [r9+16]
+ mov r15, QWORD PTR [r9+24]
+ mov rdi, r12
+ add r12, QWORD PTR [r10]
+ mov rsi, r13
+ adc r13, QWORD PTR [r10+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r10+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r10+24]
+ ; r11 = (carry out : bit 255) * 19, then clear bit 255
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r10]
+ sbb rsi, QWORD PTR [r10+8]
+ sbb rbx, QWORD PTR [r10+16]
+ sbb rbp, QWORD PTR [r10+24]
+ ; r11 = -1 on borrow, else 0; with bit 255 shifted in and times -19 this
+ ; yields the multiple of 19 to take off the low word
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ ; r[0] = sum, r[32] = difference
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [r8], rdi
+ mov QWORD PTR [r8+8], rsi
+ mov QWORD PTR [r8+16], rbx
+ mov QWORD PTR [r8+24], rbp
+ ; r9 = q, rcx = r + 32: r[32] = q[0] * r[32]
+ mov r9, QWORD PTR [rsp+16]
+ add rcx, 32
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce: product is r12..r15 (low) and rdi, rsi, rbx, rbp (high).
+ ; Top word * 38 goes into word 3; bits 255 and up of that are folded * 19
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ ; Remaining high words * 38 are added onto words 0..2
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ ; r10 = p + 96, r9 = q + 96, rcx = r + 96: r[96] = q[96] * p[96]
+ add r10, 96
+ add r9, 96
+ add rcx, 64
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce (same folding as the first multiply)
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ ; r9 = q + 32, rcx = r: t = q[32] * r[0] (kept in r12..r15)
+ sub r9, 64
+ sub rcx, 96
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [rcx+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [rcx+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [rcx+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [rcx+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce (same folding as the first multiply)
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ ; (nothing is written here: t stays in r12..r15 for the Add-Sub below)
+ ; Add-Sub
+ ; r8 still points at r + 32: sum -> r[32], difference -> r[0]
+ ; Add
+ mov rdi, r12
+ add r12, QWORD PTR [r8]
+ mov rsi, r13
+ adc r13, QWORD PTR [r8+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r8+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r8+24]
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r8]
+ sbb rsi, QWORD PTR [r8+8]
+ sbb rbx, QWORD PTR [r8+16]
+ sbb rbp, QWORD PTR [r8+24]
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [r8], r12
+ mov QWORD PTR [r8+8], r13
+ mov QWORD PTR [r8+16], r14
+ mov QWORD PTR [r8+24], r15
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+8], rsi
+ mov QWORD PTR [rcx+16], rbx
+ mov QWORD PTR [rcx+24], rbp
+ ; r10 = p + 64, r9 = q + 64: u = q[64] * p[64]
+ sub r10, 32
+ add r9, 32
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10]
+ mov r12, rax
+ mov r13, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10]
+ add r14, rax
+ adc r15, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+8]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+16]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r10+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+8]
+ xor rbx, rbx
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+16]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r10+24]
+ add rdi, rax
+ adc rsi, rdx
+ adc rbx, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+16]
+ xor rbp, rbp
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r10+24]
+ add rsi, rax
+ adc rbx, rdx
+ adc rbp, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r9+24]
+ mul QWORD PTR [r10+24]
+ add rbx, rax
+ adc rbp, rdx
+ ; Reduce (same folding as the first multiply)
+ mov rax, 38
+ mul rbp
+ add r15, rax
+ adc rdx, 0
+ mov r11, 9223372036854775807
+ shld rdx, r15, 1
+ imul rdx, rdx, 19
+ and r15, r11
+ mov r11, rdx
+ mov rax, 38
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ mov rax, 38
+ adc rdi, rdx
+ mul rsi
+ xor rsi, rsi
+ add r13, rax
+ mov rax, 38
+ adc rsi, rdx
+ mul rbx
+ xor rbx, rbx
+ add r14, rax
+ adc rbx, rdx
+ add r12, r11
+ adc r13, rdi
+ adc r14, rsi
+ adc r15, rbx
+ ; Store
+ ; (nothing is written here: the product is doubled in registers first)
+ ; Double
+ add r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; r8 = r + 64, rcx = r + 96
+ mov r8, rcx
+ add r8, 64
+ add rcx, 96
+ ; Add-Sub
+ ; sum -> r[96], difference -> r[64]
+ ; Add
+ mov rdi, r12
+ add r12, QWORD PTR [rcx]
+ mov rsi, r13
+ adc r13, QWORD PTR [rcx+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [rcx+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [rcx+24]
+ mov r11, 0
+ adc r11, 0
+ shld r11, r15, 1
+ imul r11, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [rcx]
+ sbb rsi, QWORD PTR [rcx+8]
+ sbb rbx, QWORD PTR [rcx+16]
+ sbb rbp, QWORD PTR [rcx+24]
+ sbb r11, r11
+ shld r11, rbp, 1
+ imul r11, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, r11
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [r8], rdi
+ mov QWORD PTR [r8+8], rsi
+ mov QWORD PTR [r8+16], rbx
+ mov QWORD PTR [r8+24], rbp
+ ; Release the 24-byte scratch area and restore non-volatile registers
+ add rsp, 24
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_sub_x64 ENDP
+_TEXT ENDS
+IFDEF HAVE_ED25519
+; /* Computes r = 2 * a^2, folded modulo 2^255 - 19.
+; *
+; * Field elements are four 64-bit little-endian words.
+; *
+; * Windows x64 arguments:
+; * @param [out] rcx r: four words written at [rcx]..[rcx+24].
+; * @param [in] rdx a: four words read from [rdx]..[rdx+24].
+; */
+_TEXT SEGMENT READONLY PARA
+fe_sq2_x64 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ ; rdx is overwritten by every mul, so address a through r8
+ mov r8, rdx
+ ; Square * 2
+ ; Cross products A[i] * A[j] (i < j) accumulate into words 1..6
+ ; (r10, r11, r12, r13, r14, r15)
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+8]
+ mov r10, rax
+ mov r11, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+16]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+24]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+16]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+24]
+ add r13, rax
+ adc r14, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r8+24]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; Double
+ ; Each cross product occurs twice in the square; rdi takes the carry out
+ xor rdi, rdi
+ add r10, r10
+ adc r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, 0
+ ; Add the diagonal terms; rsi carries each square's high word (plus carry)
+ ; into the next step
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r8]
+ mul rax
+ mov r9, rax
+ mov rsi, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r8+8]
+ mul rax
+ add r10, rsi
+ adc r11, rax
+ adc rdx, 0
+ mov rsi, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r8+16]
+ mul rax
+ add r12, rsi
+ adc r13, rax
+ adc rdx, 0
+ mov rsi, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r8+24]
+ mul rax
+ add r15, rax
+ adc rdi, rdx
+ add r14, rsi
+ adc r15, 0
+ adc rdi, 0
+ ; Reduce: the square is r9..r12 (low) and r13, r14, r15, rdi (high).
+ ; Top word * 38 goes into word 3; bits 255 and up of that are folded * 19
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rsi, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rsi
+ mov rsi, rdx
+ ; Remaining high words * 38 are added onto words 0..2
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add r9, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add r9, rsi
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; Multiply by 2: shift the four words left by one bit. rax keeps the old
+ ; top word so the two bits that leave the 255-bit range (old bits 254 and
+ ; 255) can be folded back in as a multiple of 19
+ mov rax, r12
+ shld r12, r11, 1
+ shld r11, r10, 1
+ shld r10, r9, 1
+ shl r9, 1
+ mov rsi, 9223372036854775807
+ shr rax, 62
+ and r12, rsi
+ imul rax, rax, 19
+ add r9, rax
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ ; Store
+ mov QWORD PTR [rcx], r9
+ mov QWORD PTR [rcx+8], r10
+ mov QWORD PTR [rcx+16], r11
+ mov QWORD PTR [rcx+24], r12
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_sq2_x64 ENDP
+_TEXT ENDS
+; /* Reduces a 512-bit little-endian value in place.
+; *
+; * Reads eight 64-bit words from [rcx]..[rcx+56] and writes the four-word
+; * result back to [rcx]..[rcx+24]; [rcx+32]..[rcx+56] are not written.
+; * The section comments below refer to the modulus as "order"; presumably
+; * this is the Ed25519 group order 2^252 + 0x14def9dea2f79cd65812631a5cf5d3ed
+; * - confirm against the C caller.
+; *
+; * Windows x64 arguments:
+; * @param [in,out] rcx s: 64-byte input, 32-byte result.
+; */
+_TEXT SEGMENT READONLY PARA
+sc_reduce_x64 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ ; Load the eight input words into r9..r15, rdi (least significant first)
+ mov r9, QWORD PTR [rcx]
+ mov r10, QWORD PTR [rcx+8]
+ mov r11, QWORD PTR [rcx+16]
+ mov r12, QWORD PTR [rcx+24]
+ mov r13, QWORD PTR [rcx+32]
+ mov r14, QWORD PTR [rcx+40]
+ mov r15, QWORD PTR [rcx+48]
+ mov rdi, QWORD PTR [rcx+56]
+ ; r8 = top byte of the input; rsi = 2^60 - 1.
+ ; Shift the upper words left by 4 so r13..rdi hold the bits from 252 up
+ ; and r9..r12 keep the low 252 bits
+ mov r8, rdi
+ mov rsi, 1152921504606846975
+ shr r8, 56
+ shld rdi, r15, 4
+ shld r15, r14, 4
+ shld r14, r13, 4
+ shld r13, r12, 4
+ and r12, rsi
+ and rdi, rsi
+ ; Add order times bits 504..511
+ sub r15, r8
+ sbb rdi, 0
+ mov rax, 16942830013509034793
+ mul r8
+ mov rsi, 0
+ add r14, rax
+ mov rax, 12100500283911187475
+ adc rsi, rdx
+ mul r8
+ add r13, rax
+ adc r14, rdx
+ adc r15, rsi
+ adc rdi, 0
+ ; Sub product of top 4 words and order
+ ; First by the constant the later comment labels -5812631a5cf5d3ed ...
+ mov r8, 12100500283911187475
+ mov rax, r13
+ mul r8
+ mov rbp, 0
+ add r9, rax
+ adc rbp, rdx
+ mov rax, r14
+ mul r8
+ mov rsi, 0
+ add r10, rax
+ adc rsi, rdx
+ mov rax, r15
+ mul r8
+ add r10, rbp
+ adc r11, rax
+ adc r12, rdx
+ mov rbx, 0
+ adc rbx, 0
+ mov rax, rdi
+ mul r8
+ add r11, rsi
+ adc r12, rax
+ adc rbx, rdx
+ ; ... then by the constant labelled -14def9dea2f79cd7, one word higher
+ mov r8, 16942830013509034793
+ mov rax, r13
+ mul r8
+ mov rbp, 0
+ add r10, rax
+ adc rbp, rdx
+ mov rax, r14
+ mul r8
+ mov rsi, 0
+ add r11, rax
+ adc rsi, rdx
+ mov rax, r15
+ mul r8
+ add r11, rbp
+ adc r12, rax
+ adc rbx, rdx
+ mov rbp, 0
+ adc rbp, 0
+ mov rax, rdi
+ mul r8
+ add r12, rsi
+ adc rbx, rax
+ adc rbp, rdx
+ ; Subtract the top four words starting at word 2; the running value now
+ ; lives in r9..r14
+ sub r11, r13
+ mov r13, rbx
+ sbb r12, r14
+ mov r14, rbp
+ sbb r13, r15
+ sbb r14, rdi
+ ; r8 = all ones when the top word is negative (sign mask), else zero
+ mov r8, r14
+ sar r8, 57
+ ; Conditionally subtract order starting at bit 125
+ mov rax, 11529215046068469760
+ mov rdx, 14628338529006959229
+ mov rbx, 187989257525064602
+ mov rbp, 144115188075855872
+ and rax, r8
+ and rdx, r8
+ and rbx, r8
+ and rbp, r8
+ add r10, rax
+ adc r11, rdx
+ adc r12, rbx
+ adc r13, 0
+ adc r14, rbp
+ ; Move bits 252-376 to own registers
+ mov r8, 1152921504606846975
+ shld r14, r13, 4
+ shld r13, r12, 4
+ and r12, r8
+ ; Sub product of top 2 words and order
+ ; * -5812631a5cf5d3ed
+ mov r8, 12100500283911187475
+ mov rax, r13
+ mul r8
+ mov rbx, 0
+ add r9, rax
+ adc r10, rdx
+ adc rbx, 0
+ mov rax, r14
+ mul r8
+ add r10, rax
+ adc rbx, rdx
+ ; * -14def9dea2f79cd7
+ mov r8, 16942830013509034793
+ mov rax, r13
+ mul r8
+ mov rbp, 0
+ add r10, rax
+ adc r11, rdx
+ adc rbp, 0
+ mov rax, r14
+ mul r8
+ add r11, rax
+ adc rbp, rdx
+ ; Add overflows at 2 * 64
+ mov rsi, 1152921504606846975
+ and r12, rsi
+ add r11, rbx
+ adc r12, rbp
+ ; Subtract top at 2 * 64
+ sub r11, r13
+ sbb r12, r14
+ ; rsi = all ones if the subtraction borrowed, else zero
+ sbb rsi, rsi
+ ; Conditional sub order
+ ; (the masked constants are added back when the borrow mask is set)
+ mov rax, 6346243789798364141
+ mov rdx, 1503914060200516822
+ mov rbx, 1152921504606846976
+ and rax, rsi
+ and rdx, rsi
+ and rbx, rsi
+ add r9, rax
+ mov rax, 1152921504606846975
+ adc r10, rdx
+ adc r11, 0
+ adc r12, rbx
+ ; Keep only the low 60 bits of the top word
+ and r12, rax
+ ; Store result
+ mov QWORD PTR [rcx], r9
+ mov QWORD PTR [rcx+8], r10
+ mov QWORD PTR [rcx+16], r11
+ mov QWORD PTR [rcx+24], r12
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+sc_reduce_x64 ENDP
+_TEXT ENDS
+; /* Computes s = a * b + c and reduces the 512-bit result to four words.
+; *
+; * a, b and c are four 64-bit little-endian words each. The reduction uses
+; * the same constants and steps as sc_reduce_x64; the section comments refer
+; * to the modulus as "order" (presumably the Ed25519 group order - confirm
+; * against the C caller).
+; *
+; * Windows x64 arguments:
+; * @param [out] rcx s: four words written at [rcx]..[rcx+24].
+; * @param [in] rdx a: first factor.
+; * @param [in] r8 b: second factor.
+; * @param [in] r9 c: addend.
+; */
+_TEXT SEGMENT READONLY PARA
+sc_muladd_x64 PROC
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ ; rdx is overwritten by every mul: rbp = 3rd argument, r8 = 2nd argument
+ mov rbp, r8
+ mov r8, rdx
+ ; Multiply
+ ; Product words accumulate in r10..r15, rdi, rsi (least significant first)
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [rbp]
+ mul QWORD PTR [r8]
+ mov r10, rax
+ mov r11, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [rbp+8]
+ mul QWORD PTR [r8]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [rbp]
+ mul QWORD PTR [r8+8]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [rbp+16]
+ mul QWORD PTR [r8]
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [rbp+8]
+ mul QWORD PTR [r8+8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [rbp]
+ mul QWORD PTR [r8+16]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [rbp+24]
+ mul QWORD PTR [r8]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [rbp+16]
+ mul QWORD PTR [r8+8]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [rbp+8]
+ mul QWORD PTR [r8+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [rbp]
+ mul QWORD PTR [r8+24]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [rbp+24]
+ mul QWORD PTR [r8+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [rbp+16]
+ mul QWORD PTR [r8+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [rbp+8]
+ mul QWORD PTR [r8+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [rbp+24]
+ mul QWORD PTR [r8+16]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [rbp+16]
+ mul QWORD PTR [r8+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [rbp+24]
+ mul QWORD PTR [r8+24]
+ add rdi, rax
+ adc rsi, rdx
+ ; Add c to a * b
+ add r10, QWORD PTR [r9]
+ adc r11, QWORD PTR [r9+8]
+ adc r12, QWORD PTR [r9+16]
+ adc r13, QWORD PTR [r9+24]
+ adc r14, 0
+ adc r15, 0
+ adc rdi, 0
+ adc rsi, 0
+ ; rbx = top byte of the 512-bit sum; r9 = 2^60 - 1.
+ ; Shift the upper words left by 4 so r14..rsi hold the bits from 252 up
+ ; and r10..r13 keep the low 252 bits
+ mov rbx, rsi
+ mov r9, 1152921504606846975
+ shr rbx, 56
+ shld rsi, rdi, 4
+ shld rdi, r15, 4
+ shld r15, r14, 4
+ shld r14, r13, 4
+ and r13, r9
+ and rsi, r9
+ ; Add order times bits 504..507
+ ; NOTE(review): the shift by 56 above extracts bits 504..511, as in
+ ; sc_reduce_x64; "504..507" presumably reflects the expected input range
+ sub rdi, rbx
+ sbb rsi, 0
+ mov rax, 16942830013509034793
+ mul rbx
+ mov r9, 0
+ add r15, rax
+ mov rax, 12100500283911187475
+ adc r9, rdx
+ mul rbx
+ add r14, rax
+ adc r15, rdx
+ adc rdi, r9
+ adc rsi, 0
+ ; Sub product of top 4 words and order
+ ; First by the constant the later comment labels -5812631a5cf5d3ed ...
+ mov rbx, 12100500283911187475
+ mov rax, r14
+ mul rbx
+ mov rbp, 0
+ add r10, rax
+ adc rbp, rdx
+ mov rax, r15
+ mul rbx
+ mov r9, 0
+ add r11, rax
+ adc r9, rdx
+ mov rax, rdi
+ mul rbx
+ add r11, rbp
+ adc r12, rax
+ adc r13, rdx
+ mov r8, 0
+ adc r8, 0
+ mov rax, rsi
+ mul rbx
+ add r12, r9
+ adc r13, rax
+ adc r8, rdx
+ ; ... then by the constant labelled -14def9dea2f79cd7, one word higher
+ mov rbx, 16942830013509034793
+ mov rax, r14
+ mul rbx
+ mov rbp, 0
+ add r11, rax
+ adc rbp, rdx
+ mov rax, r15
+ mul rbx
+ mov r9, 0
+ add r12, rax
+ adc r9, rdx
+ mov rax, rdi
+ mul rbx
+ add r12, rbp
+ adc r13, rax
+ adc r8, rdx
+ mov rbp, 0
+ adc rbp, 0
+ mov rax, rsi
+ mul rbx
+ add r13, r9
+ adc r8, rax
+ adc rbp, rdx
+ ; Subtract the top four words starting at word 2; the running value now
+ ; lives in r10..r15
+ sub r12, r14
+ mov r14, r8
+ sbb r13, r15
+ mov r15, rbp
+ sbb r14, rdi
+ sbb r15, rsi
+ ; rbx = all ones when the top word is negative (sign mask), else zero
+ mov rbx, r15
+ sar rbx, 57
+ ; Conditionally subtract order starting at bit 125
+ mov rax, 11529215046068469760
+ mov rdx, 14628338529006959229
+ mov r8, 187989257525064602
+ mov rbp, 144115188075855872
+ and rax, rbx
+ and rdx, rbx
+ and r8, rbx
+ and rbp, rbx
+ add r11, rax
+ adc r12, rdx
+ adc r13, r8
+ adc r14, 0
+ adc r15, rbp
+ ; Move bits 252-376 to own registers
+ mov rbx, 1152921504606846975
+ shld r15, r14, 4
+ shld r14, r13, 4
+ and r13, rbx
+ ; Sub product of top 2 words and order
+ ; * -5812631a5cf5d3ed
+ mov rbx, 12100500283911187475
+ mov rax, r14
+ mul rbx
+ mov r8, 0
+ add r10, rax
+ adc r11, rdx
+ adc r8, 0
+ mov rax, r15
+ mul rbx
+ add r11, rax
+ adc r8, rdx
+ ; * -14def9dea2f79cd7
+ mov rbx, 16942830013509034793
+ mov rax, r14
+ mul rbx
+ mov rbp, 0
+ add r11, rax
+ adc r12, rdx
+ adc rbp, 0
+ mov rax, r15
+ mul rbx
+ add r12, rax
+ adc rbp, rdx
+ ; Add overflows at 2 * 64
+ mov r9, 1152921504606846975
+ and r13, r9
+ add r12, r8
+ adc r13, rbp
+ ; Subtract top at 2 * 64
+ sub r12, r14
+ sbb r13, r15
+ ; r9 = all ones if the subtraction borrowed, else zero
+ sbb r9, r9
+ ; Conditional sub order
+ ; (the masked constants are added back when the borrow mask is set)
+ mov rax, 6346243789798364141
+ mov rdx, 1503914060200516822
+ mov r8, 1152921504606846976
+ and rax, r9
+ and rdx, r9
+ and r8, r9
+ add r10, rax
+ mov rax, 1152921504606846975
+ adc r11, rdx
+ adc r12, 0
+ adc r13, r8
+ ; Keep only the low 60 bits of the top word
+ and r13, rax
+ ; Store result
+ mov QWORD PTR [rcx], r10
+ mov QWORD PTR [rcx+8], r11
+ mov QWORD PTR [rcx+16], r12
+ mov QWORD PTR [rcx+24], r13
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ ret
+sc_muladd_x64 ENDP
+_TEXT ENDS
+; /* Non-constant time modular inversion modulo 2^255 - 19.
+; *
+; * Phase 1 runs a binary GCD on u = 2^255 - 19 and v = a and logs each step
+; * as one byte in a buffer on the stack:
+; * 0 = u halved, 1 = v halved,
+; * 2 = u = (u - v) / 2, 3 = v = (v - u) / 2, 7 = end of log.
+; * Phase 2 replays the log on b = 2^255 - 19 (i.e. 0 mod p) and d = 1 with the
+; * same steps done modulo 2^255 - 19; b or d is the result depending on
+; * whether u or v reached 1 in phase 1.
+; *
+; * Running time and memory accesses depend on the value of a, so this must
+; * only be used on values that are not secret.
+; *
+; * If a is 0, or the GCD reaches u == v (a is a multiple of the modulus), no
+; * inverse exists; r is set to 0 instead of logging steps without bound.
+; *
+; * @param [out] r Resulting number (rcx, four 64-bit words).
+; * @param [in] a Number to invert (rdx, four 64-bit words).
+; * No status is returned: rax is not set to a meaningful value before ret.
+; */
+_TEXT SEGMENT READONLY PARA
+fe_invert_nct_x64 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ ; Step log buffer. 520 rather than 513 keeps rsp a multiple of 8
+ sub rsp, 520
+ ; u = 2^255 - 19 in r9..r12, v = a in r13, r14, r15, rdi; rsi = log index
+ mov r9, -19
+ mov r10, -1
+ mov r11, -1
+ mov r12, 9223372036854775807
+ mov r13, QWORD PTR [rdx]
+ mov r14, QWORD PTR [rdx+8]
+ mov r15, QWORD PTR [rdx+16]
+ mov rdi, QWORD PTR [rdx+24]
+ mov rsi, 0
+ ; a == 0 would never leave the "v even" loop below
+ mov rax, r13
+ or rax, r14
+ or rax, r15
+ or rax, rdi
+ jz L_fe_invert_nct_no_inverse
+ ; Strip factors of two from v, logging a 1 for each
+ test r13b, 1
+ jnz fe_invert_nct_v_even_end
+fe_invert_nct_v_even_start:
+ shrd r13, r14, 1
+ shrd r14, r15, 1
+ shrd r15, rdi, 1
+ shr rdi, 1
+ mov BYTE PTR [rsp+rsi], 1
+ inc rsi
+ test r13b, 1
+ jz fe_invert_nct_v_even_start
+fe_invert_nct_v_even_end:
+L_fe_invert_nct_uv_start:
+ ; Compare u with v from the most significant word down
+ cmp r12, rdi
+ jb L_fe_invert_nct_uv_v
+ ja L_fe_invert_nct_uv_u
+ cmp r11, r15
+ jb L_fe_invert_nct_uv_v
+ ja L_fe_invert_nct_uv_u
+ cmp r10, r14
+ jb L_fe_invert_nct_uv_v
+ ja L_fe_invert_nct_uv_u
+ cmp r9, r13
+ jb L_fe_invert_nct_uv_v
+ ; u == v here means both are greater than 1 (the loop leaves as soon as
+ ; either reaches 1), so a shares a factor with the modulus
+ je L_fe_invert_nct_no_inverse
+L_fe_invert_nct_uv_u:
+ ; u > v: u = (u - v) / 2, logged as 2
+ mov BYTE PTR [rsp+rsi], 2
+ inc rsi
+ sub r9, r13
+ sbb r10, r14
+ sbb r11, r15
+ sbb r12, rdi
+ shrd r9, r10, 1
+ shrd r10, r11, 1
+ shrd r11, r12, 1
+ shr r12, 1
+ test r9b, 1
+ jnz fe_invert_nct_usubv_even_end
+fe_invert_nct_usubv_even_start:
+ ; Keep halving u while it is even, logging a 0 for each
+ shrd r9, r10, 1
+ shrd r10, r11, 1
+ shrd r11, r12, 1
+ shr r12, 1
+ mov BYTE PTR [rsp+rsi], 0
+ inc rsi
+ test r9b, 1
+ jz fe_invert_nct_usubv_even_start
+fe_invert_nct_usubv_even_end:
+ ; Finished when u == 1; r8b = 1 records that b holds the result
+ cmp r9, 1
+ jne L_fe_invert_nct_uv_start
+ mov rax, r10
+ or rax, r11
+ jne L_fe_invert_nct_uv_start
+ or rax, r12
+ jne L_fe_invert_nct_uv_start
+ mov r8b, 1
+ jmp L_fe_invert_nct_uv_end
+L_fe_invert_nct_uv_v:
+ ; u < v: v = (v - u) / 2, logged as 3
+ mov BYTE PTR [rsp+rsi], 3
+ inc rsi
+ sub r13, r9
+ sbb r14, r10
+ sbb r15, r11
+ sbb rdi, r12
+ shrd r13, r14, 1
+ shrd r14, r15, 1
+ shrd r15, rdi, 1
+ shr rdi, 1
+ test r13b, 1
+ jnz fe_invert_nct_vsubu_even_end
+fe_invert_nct_vsubu_even_start:
+ ; Keep halving v while it is even, logging a 1 for each
+ shrd r13, r14, 1
+ shrd r14, r15, 1
+ shrd r15, rdi, 1
+ shr rdi, 1
+ mov BYTE PTR [rsp+rsi], 1
+ inc rsi
+ test r13b, 1
+ jz fe_invert_nct_vsubu_even_start
+fe_invert_nct_vsubu_even_end:
+ ; Finished when v == 1; r8b = 0 records that d holds the result
+ cmp r13, 1
+ jne L_fe_invert_nct_uv_start
+ mov rax, r14
+ or rax, r15
+ jne L_fe_invert_nct_uv_start
+ or rax, rdi
+ jne L_fe_invert_nct_uv_start
+ mov r8b, 0
+L_fe_invert_nct_uv_end:
+ ; Phase 2: b = modulus in r9..r12, d = 1 in r13, r14, r15, rdi
+ mov r9, -19
+ mov r10, -1
+ mov r11, -1
+ mov r12, 9223372036854775807
+ mov r13, 1
+ xor r14, r14
+ xor r15, r15
+ xor rdi, rdi
+ ; Terminate the log, then dispatch on its first entry:
+ ; 0 -> div2_b, 1 -> div2_d, 2 -> b_sub_d, 3 -> d_sub_b, other -> end
+ mov BYTE PTR [rsp+rsi], 7
+ mov al, BYTE PTR [rsp]
+ mov rsi, 1
+ cmp al, 1
+ je L_fe_invert_nct_op_div2_d
+ jl L_fe_invert_nct_op_div2_b
+ cmp al, 3
+ je L_fe_invert_nct_op_d_sub_b
+ jl L_fe_invert_nct_op_b_sub_d
+ jmp L_fe_invert_nct_op_end
+L_fe_invert_nct_op_b_sub_d:
+ ; b -= d; add the modulus back if that borrowed
+ sub r9, r13
+ sbb r10, r14
+ sbb r11, r15
+ sbb r12, rdi
+ jnc L_fe_invert_nct_op_div2_b
+ mov rax, -1
+ add r9, -19
+ adc r10, rax
+ adc r11, rax
+ mov rax, 9223372036854775807
+ adc r12, rax
+L_fe_invert_nct_op_div2_b:
+ ; b /= 2 modulo the modulus: make b even by adding the modulus first
+ test r9b, 1
+ jz L_fe_invert_nct_op_div2_b_mod
+ add r9, -19
+ mov rax, -1
+ adc r10, rax
+ adc r11, rax
+ mov rax, 9223372036854775807
+ adc r12, rax
+L_fe_invert_nct_op_div2_b_mod:
+ shrd r9, r10, 1
+ shrd r10, r11, 1
+ shrd r11, r12, 1
+ shr r12, 1
+ ; Fetch and dispatch the next log entry
+ mov al, BYTE PTR [rsp+rsi]
+ inc rsi
+ cmp al, 1
+ je L_fe_invert_nct_op_div2_d
+ jl L_fe_invert_nct_op_div2_b
+ cmp al, 3
+ je L_fe_invert_nct_op_d_sub_b
+ jl L_fe_invert_nct_op_b_sub_d
+ jmp L_fe_invert_nct_op_end
+L_fe_invert_nct_op_d_sub_b:
+ ; d -= b; add the modulus back if that borrowed
+ sub r13, r9
+ sbb r14, r10
+ sbb r15, r11
+ sbb rdi, r12
+ jnc L_fe_invert_nct_op_div2_d
+ mov rax, -1
+ add r13, -19
+ adc r14, rax
+ adc r15, rax
+ mov rax, 9223372036854775807
+ adc rdi, rax
+L_fe_invert_nct_op_div2_d:
+ ; d /= 2 modulo the modulus: make d even by adding the modulus first
+ test r13b, 1
+ jz L_fe_invert_nct_op_div2_d_mod
+ add r13, -19
+ mov rax, -1
+ adc r14, rax
+ adc r15, rax
+ mov rax, 9223372036854775807
+ adc rdi, rax
+L_fe_invert_nct_op_div2_d_mod:
+ shrd r13, r14, 1
+ shrd r14, r15, 1
+ shrd r15, rdi, 1
+ shr rdi, 1
+ ; Fetch and dispatch the next log entry (anything above 3 falls through)
+ mov al, BYTE PTR [rsp+rsi]
+ inc rsi
+ cmp al, 1
+ je L_fe_invert_nct_op_div2_d
+ jl L_fe_invert_nct_op_div2_b
+ cmp al, 3
+ je L_fe_invert_nct_op_d_sub_b
+ jl L_fe_invert_nct_op_b_sub_d
+L_fe_invert_nct_op_end:
+ ; r8b == 1: u reached 1, result is b; otherwise v reached 1, result is d
+ cmp r8b, 1
+ jne L_fe_invert_nct_store_d
+ mov QWORD PTR [rcx], r9
+ mov QWORD PTR [rcx+8], r10
+ mov QWORD PTR [rcx+16], r11
+ mov QWORD PTR [rcx+24], r12
+ jmp L_fe_invert_nct_store_end
+L_fe_invert_nct_no_inverse:
+ ; a has no inverse modulo 2^255 - 19: return 0
+ xor rax, rax
+ mov QWORD PTR [rcx], rax
+ mov QWORD PTR [rcx+8], rax
+ mov QWORD PTR [rcx+16], rax
+ mov QWORD PTR [rcx+24], rax
+ jmp L_fe_invert_nct_store_end
+L_fe_invert_nct_store_d:
+ mov QWORD PTR [rcx], r13
+ mov QWORD PTR [rcx+8], r14
+ mov QWORD PTR [rcx+16], r15
+ mov QWORD PTR [rcx+24], rdi
+L_fe_invert_nct_store_end:
+ add rsp, 520
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_invert_nct_x64 ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX2
+; /* Table lookup by signed index with conditional negation (AVX2).
+; *
+; * The table at base holds eight 96-byte entries, each made of three 32-byte
+; * field elements. Every entry is loaded and combined through compare masks,
+; * so no branch or load address depends on b.
+; *
+; * Result for |b| == 0: (1, 1, 0).
+; * Result for |b| == k in 1..8: entry k - 1.
+; * If b < 0 the first two elements are swapped and the third is replaced by
+; * (2^255 - 19) minus itself.
+; *
+; * Windows x64 arguments:
+; * @param [out] rcx r: three field elements written at offsets 0, 32, 64.
+; * @param [in] rdx base: table of eight 96-byte entries.
+; * @param [in] r8b b: signed index.
+; */
+_TEXT SEGMENT READONLY PARA
+fe_cmov_table_avx2 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov r9, rdx
+ ; xmm6..xmm9 are written below; save their low 128 bits first
+ sub rsp, 64
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ ; bl = |b| : dl is the sign mask of b, (b ^ mask) - mask is the magnitude
+ xor rbx, rbx
+ movsx rax, r8b
+ cdq
+ xor al, dl
+ sub al, dl
+ mov bl, al
+ ; VEX-encoded moves zero bits 32..255 of the destination. The legacy
+ ; movd forms left bits 128..255 of ymm9 as they were on entry, and ymm9 is
+ ; copied in full into ymm3/ymm4 as the field element 1
+ vmovd xmm7, ebx
+ mov rbx, 1
+ vmovd xmm9, ebx
+ ; ymm3 = ymm4 = field element 1 (defaults for |b| == 0), ymm5 = 0
+ vmovdqa ymm3, ymm9
+ vmovdqa ymm4, ymm9
+ ; With an all-zero index vector vpermd broadcasts dword 0:
+ ; ymm7 = |b| in every lane, ymm9 = 1 in every lane
+ vpxor ymm8, ymm8, ymm8
+ vpermd ymm7, ymm8, ymm7
+ vpermd ymm9, ymm8, ymm9
+ vpxor ymm0, ymm0, ymm0
+ vpxor ymm1, ymm1, ymm1
+ vpxor ymm2, ymm2, ymm2
+ ; Keep the defaults only when |b| == 0
+ vpcmpeqd ymm6, ymm8, ymm7
+ vpxor ymm5, ymm5, ymm5
+ vpand ymm3, ymm3, ymm6
+ vpand ymm4, ymm4, ymm6
+ ; ymm8 counts 1, 2, ... 8; ymm6 = all ones when the counter equals |b|
+ vmovdqa ymm8, ymm9
+ ; Entry 0 (selected when |b| == 1)
+ vpcmpeqd ymm6, ymm8, ymm7
+ vpaddd ymm8, ymm8, ymm9
+ vmovupd ymm0, YMMWORD PTR [r9]
+ vmovupd ymm1, YMMWORD PTR [r9+32]
+ vmovupd ymm2, YMMWORD PTR [r9+64]
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpand ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm0
+ vpor ymm4, ymm4, ymm1
+ vpor ymm5, ymm5, ymm2
+ ; Entry 1 (|b| == 2)
+ vpcmpeqd ymm6, ymm8, ymm7
+ vpaddd ymm8, ymm8, ymm9
+ vmovupd ymm0, YMMWORD PTR [r9+96]
+ vmovupd ymm1, YMMWORD PTR [r9+128]
+ vmovupd ymm2, YMMWORD PTR [r9+160]
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpand ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm0
+ vpor ymm4, ymm4, ymm1
+ vpor ymm5, ymm5, ymm2
+ ; Entry 2 (|b| == 3)
+ vpcmpeqd ymm6, ymm8, ymm7
+ vpaddd ymm8, ymm8, ymm9
+ vmovupd ymm0, YMMWORD PTR [r9+192]
+ vmovupd ymm1, YMMWORD PTR [r9+224]
+ vmovupd ymm2, YMMWORD PTR [r9+256]
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpand ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm0
+ vpor ymm4, ymm4, ymm1
+ vpor ymm5, ymm5, ymm2
+ ; Entry 3 (|b| == 4)
+ vpcmpeqd ymm6, ymm8, ymm7
+ vpaddd ymm8, ymm8, ymm9
+ vmovupd ymm0, YMMWORD PTR [r9+288]
+ vmovupd ymm1, YMMWORD PTR [r9+320]
+ vmovupd ymm2, YMMWORD PTR [r9+352]
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpand ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm0
+ vpor ymm4, ymm4, ymm1
+ vpor ymm5, ymm5, ymm2
+ ; Entry 4 (|b| == 5)
+ vpcmpeqd ymm6, ymm8, ymm7
+ vpaddd ymm8, ymm8, ymm9
+ vmovupd ymm0, YMMWORD PTR [r9+384]
+ vmovupd ymm1, YMMWORD PTR [r9+416]
+ vmovupd ymm2, YMMWORD PTR [r9+448]
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpand ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm0
+ vpor ymm4, ymm4, ymm1
+ vpor ymm5, ymm5, ymm2
+ ; Entry 5 (|b| == 6)
+ vpcmpeqd ymm6, ymm8, ymm7
+ vpaddd ymm8, ymm8, ymm9
+ vmovupd ymm0, YMMWORD PTR [r9+480]
+ vmovupd ymm1, YMMWORD PTR [r9+512]
+ vmovupd ymm2, YMMWORD PTR [r9+544]
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpand ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm0
+ vpor ymm4, ymm4, ymm1
+ vpor ymm5, ymm5, ymm2
+ ; Entry 6 (|b| == 7)
+ vpcmpeqd ymm6, ymm8, ymm7
+ vpaddd ymm8, ymm8, ymm9
+ vmovupd ymm0, YMMWORD PTR [r9+576]
+ vmovupd ymm1, YMMWORD PTR [r9+608]
+ vmovupd ymm2, YMMWORD PTR [r9+640]
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpand ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm0
+ vpor ymm4, ymm4, ymm1
+ vpor ymm5, ymm5, ymm2
+ ; Entry 7 (|b| == 8)
+ vpcmpeqd ymm6, ymm8, ymm7
+ vpaddd ymm8, ymm8, ymm9
+ vmovupd ymm0, YMMWORD PTR [r9+672]
+ vmovupd ymm1, YMMWORD PTR [r9+704]
+ vmovupd ymm2, YMMWORD PTR [r9+736]
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpand ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm0
+ vpor ymm4, ymm4, ymm1
+ vpor ymm5, ymm5, ymm2
+ ; ymm6 = all ones in every lane when b < 0, else zero
+ movsx rax, r8b
+ sar rax, 63
+ vmovd xmm6, eax
+ vpxor ymm8, ymm8, ymm8
+ vpermd ymm6, ymm8, ymm6
+ ; Masked XOR swap of the first two elements when b < 0
+ vpxor ymm8, ymm3, ymm4
+ vpand ymm8, ymm8, ymm6
+ vpxor ymm3, ymm3, ymm8
+ vpxor ymm4, ymm4, ymm8
+ vmovupd YMMWORD PTR [rcx], ymm3
+ vmovupd YMMWORD PTR [rcx+32], ymm4
+ vmovupd YMMWORD PTR [rcx+64], ymm5
+ ; Third element: compute (2^255 - 19) - t and select it with cmovl when
+ ; b < 0
+ mov r10, QWORD PTR [rcx+64]
+ mov r11, QWORD PTR [rcx+72]
+ mov r12, QWORD PTR [rcx+80]
+ mov r13, QWORD PTR [rcx+88]
+ mov r14, -19
+ mov r15, -1
+ mov rdi, -1
+ mov rsi, 9223372036854775807
+ sub r14, r10
+ sbb r15, r11
+ sbb rdi, r12
+ sbb rsi, r13
+ cmp r8b, 0
+ cmovl r10, r14
+ cmovl r11, r15
+ cmovl r12, rdi
+ cmovl r13, rsi
+ mov QWORD PTR [rcx+64], r10
+ mov QWORD PTR [rcx+72], r11
+ mov QWORD PTR [rcx+80], r12
+ mov QWORD PTR [rcx+88], r13
+ ; Clear the upper YMM halves before returning to non-VEX code
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ add rsp, 64
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_cmov_table_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_mul_avx2 PROC
+ ; fe_mul_avx2: r = a * b for values held as four 64-bit limbs, least
+ ; significant limb first. The 512-bit product is folded back to four
+ ; limbs using the multipliers 38 (for 2^256) and 19 (for 2^255), i.e.
+ ; arithmetic modulo 2^255 - 19.
+ ; In: rcx = r (output pointer), rdx = a, r8 = b.
+ ; Bit 255 of the result is folded in once at the end; the value is not
+ ; compared against the modulus here.
+ ; Saves and restores rbx, r12-r15, rdi, rsi, rbp. Uses rax, rcx, rdx and
+ ; r8-r11 as scratch.
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ ; Move the arguments away: rdx is the implicit mulx source and rcx/r8
+ ; are reused as temporaries / result limbs below.
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rbp, r8
+ ; Keep A[0] in rbx; it is multiplied by every limb of B.
+ mov rbx, QWORD PTR [rsi]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rbp]
+ mulx r9, r8, rbx
+ ; A[2] * B[0]
+ mulx r11, r10, QWORD PTR [rsi+16]
+ ; A[1] * B[0]
+ mulx rcx, rax, QWORD PTR [rsi+8]
+ ; Zero r15 and clear CF/OF so the adcx (CF) and adox (OF) chains start
+ ; without a pending carry.
+ xor r15, r15
+ adcx r9, rax
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rbp+8]
+ mulx r13, r12, QWORD PTR [rsi+24]
+ adcx r10, rcx
+ ; A[0] * B[1]
+ mulx rcx, rax, rbx
+ adox r9, rax
+ ; A[2] * B[1]
+ mulx r14, rax, QWORD PTR [rsi+16]
+ adox r10, rcx
+ adcx r11, rax
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rbp+16]
+ mulx rcx, rax, QWORD PTR [rsi+8]
+ adcx r12, r14
+ adox r11, rax
+ adcx r13, r15
+ adox r12, rcx
+ ; A[0] * B[2]
+ mulx rcx, rax, rbx
+ adox r13, r15
+ ; Zero r14 and restart both carry chains.
+ xor r14, r14
+ adcx r10, rax
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rbp+8]
+ mulx rax, rdx, QWORD PTR [rsi+8]
+ adcx r11, rcx
+ adox r10, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rbp+24]
+ adox r11, rax
+ mulx rcx, rax, QWORD PTR [rsi+8]
+ adcx r12, rax
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rbp+16]
+ mulx rax, rdx, QWORD PTR [rsi+16]
+ adcx r13, rcx
+ adox r12, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rbp+24]
+ adox r13, rax
+ mulx rcx, rax, QWORD PTR [rsi+24]
+ adox r14, r15
+ adcx r14, rax
+ ; A[0] * B[3]
+ mulx rax, rdx, rbx
+ adcx r15, rcx
+ ; Zero rcx and restart both carry chains for the last partial products.
+ xor rcx, rcx
+ adcx r11, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsi+24]
+ adcx r12, rax
+ mulx rax, rdx, QWORD PTR [rbp]
+ adox r11, rdx
+ adox r12, rax
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsi+24]
+ mulx rax, rdx, QWORD PTR [rbp+16]
+ adcx r13, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rbp+24]
+ adcx r14, rax
+ mulx rdx, rax, QWORD PTR [rsi+16]
+ adcx r15, rcx
+ adox r13, rax
+ adox r14, rdx
+ adox r15, rcx
+ ; Reduce: r8..r11 hold the low four limbs of the product, r12..r15 the
+ ; high four. The top limb times 38 is added into limb 3 first, with the
+ ; overflow collected in rax.
+ mov rdx, 38
+ mulx rax, r15, r15
+ add r11, r15
+ adc rax, 0
+ ; rax = everything at bit 255 and above; multiply it by 19 and clear
+ ; bit 255 of limb 3 (mask is 0x7FFFFFFFFFFFFFFF).
+ mov rcx, 9223372036854775807
+ shld rax, r11, 1
+ imul rax, rax, 19
+ and r11, rcx
+ ; Zero rcx and clear CF/OF, then add 38 * (r12, r13, r14) into limbs
+ ; 0..3 using the two carry chains.
+ xor rcx, rcx
+ adox r8, rax
+ mulx r12, rax, r12
+ adcx r8, rax
+ adox r9, r12
+ mulx r13, rax, r13
+ adcx r9, rax
+ adox r10, r13
+ mulx r14, rax, r14
+ adcx r10, rax
+ adox r11, r14
+ adcx r11, rcx
+ ; Final fold: if bit 255 is set, clear it and add 19 (sar produces an
+ ; all-ones or all-zero mask from the top bit).
+ mov rcx, 9223372036854775807
+ mov rdx, r11
+ sar rdx, 63
+ and rdx, 19
+ and r11, rcx
+ add r8, rdx
+ adc r9, 0
+ adc r10, 0
+ adc r11, 0
+ ; Store
+ mov QWORD PTR [rdi], r8
+ mov QWORD PTR [rdi+8], r9
+ mov QWORD PTR [rdi+16], r10
+ mov QWORD PTR [rdi+24], r11
+ pop rbp
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+fe_mul_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_sq_avx2 PROC
+ ; fe_sq_avx2: r = a * a for a value held as four 64-bit limbs, least
+ ; significant limb first. The 512-bit square is folded back to four
+ ; limbs using the multipliers 38 (for 2^256) and 19 (for 2^255), i.e.
+ ; arithmetic modulo 2^255 - 19.
+ ; In: rcx = r (output pointer), rdx = a.
+ ; Bit 255 of the result is folded in once at the end; the value is not
+ ; compared against the modulus here.
+ ; Saves and restores rbx, r12-r15, rdi, rsi. Uses rax, rcx, rdx and
+ ; r8-r11 as scratch.
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ ; Move the arguments away: rdx is the implicit mulx source and rcx is
+ ; reused as a temporary below.
+ mov rdi, rcx
+ mov rsi, rdx
+ ; Square
+ ; First the off-diagonal products A[i] * A[j] (i < j) are summed into
+ ; r9..r14; they are doubled afterwards while the diagonal squares are
+ ; added.
+ mov rdx, QWORD PTR [rsi]
+ mov rax, QWORD PTR [rsi+8]
+ ; A[0] * A[1]
+ mov r15, rdx
+ mulx r10, r9, rax
+ ; A[0] * A[3]
+ mulx r12, r11, QWORD PTR [rsi+24]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rsi+16]
+ mulx rbx, rcx, rax
+ ; Zero r8 and clear CF/OF so the adcx (CF) and adox (OF) chains start
+ ; without a pending carry.
+ xor r8, r8
+ adox r11, rcx
+ ; A[2] * A[3]
+ mulx r14, r13, QWORD PTR [rsi+24]
+ adox r12, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, r15
+ adox r13, r8
+ adcx r10, rcx
+ adox r14, r8
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [rsi+24]
+ adcx r11, rbx
+ adcx r12, rcx
+ adcx r13, rdx
+ adcx r14, r8
+ ; A[0] * A[0]
+ mov rdx, r15
+ mulx rcx, r8, rdx
+ ; Zero r15 and restart both chains: adcx reg, reg doubles the
+ ; off-diagonal sum while adox adds the diagonal squares.
+ xor r15, r15
+ adcx r9, r9
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r9, rcx
+ mulx rbx, rcx, rdx
+ adcx r10, r10
+ adox r10, rcx
+ adcx r11, r11
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rsi+16]
+ adox r11, rbx
+ mulx rcx, rbx, rdx
+ adcx r12, r12
+ adox r12, rbx
+ adcx r13, r13
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rsi+24]
+ adox r13, rcx
+ mulx rbx, rcx, rdx
+ adcx r14, r14
+ adox r14, rcx
+ adcx r15, r15
+ adox r15, rbx
+ ; Reduce: r8..r11 hold the low four limbs of the square, r12..r15 the
+ ; high four. The top limb times 38 is added into limb 3 first, with the
+ ; overflow collected in rbx.
+ mov rdx, 38
+ mulx rbx, r15, r15
+ add r11, r15
+ adc rbx, 0
+ ; rbx = everything at bit 255 and above; multiply it by 19 and clear
+ ; bit 255 of limb 3 (mask is 0x7FFFFFFFFFFFFFFF).
+ mov rcx, 9223372036854775807
+ shld rbx, r11, 1
+ imul rbx, rbx, 19
+ and r11, rcx
+ ; Zero rcx and clear CF/OF, then add 38 * (r12, r13, r14) into limbs
+ ; 0..3 using the two carry chains.
+ xor rcx, rcx
+ adox r8, rbx
+ mulx r12, rbx, r12
+ adcx r8, rbx
+ adox r9, r12
+ mulx r13, rbx, r13
+ adcx r9, rbx
+ adox r10, r13
+ mulx r14, rbx, r14
+ adcx r10, rbx
+ adox r11, r14
+ adcx r11, rcx
+ ; Final fold: if bit 255 is set, clear it and add 19 (sar produces an
+ ; all-ones or all-zero mask from the top bit).
+ mov rcx, 9223372036854775807
+ mov rdx, r11
+ sar rdx, 63
+ and rdx, 19
+ and r11, rcx
+ add r8, rdx
+ adc r9, 0
+ adc r10, 0
+ adc r11, 0
+ ; Store
+ mov QWORD PTR [rdi], r8
+ mov QWORD PTR [rdi+8], r9
+ mov QWORD PTR [rdi+16], r10
+ mov QWORD PTR [rdi+24], r11
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+fe_sq_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_sq_n_avx2 PROC
+ ; fe_sq_n_avx2: r = a^(2^n), i.e. n successive squarings, for a value
+ ; held as four 64-bit limbs, least significant limb first. Each square
+ ; is folded back to four limbs using the multipliers 38 (for 2^256) and
+ ; 19 (for 2^255), i.e. arithmetic modulo 2^255 - 19.
+ ; In: rcx = r (output pointer), rdx = a, r8 = n.
+ ; Only the low byte of n is used as the counter and the loop tests it
+ ; after the first squaring, so n must be in 1..255 (n = 0 would run
+ ; 256 squarings).
+ ; Unlike fe_sq_avx2 there is no final bit-255 fold after the last
+ ; squaring; bit 63 of the top limb may be set in the stored result.
+ ; r and a may be the same buffer or distinct buffers.
+ ; Saves and restores rbx, r12-r15, rdi, rsi, rbp. Uses rax, rcx, rdx and
+ ; r8-r11 as scratch.
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ ; Move the arguments away: rdx is the implicit mulx source and rcx/r8
+ ; are reused as temporaries / result limbs below.
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rbp, r8
+L_fe_sq_n_avx2:
+ ; Square
+ ; First the off-diagonal products A[i] * A[j] (i < j) are summed into
+ ; r9..r14; they are doubled afterwards while the diagonal squares are
+ ; added.
+ mov rdx, QWORD PTR [rsi]
+ mov rax, QWORD PTR [rsi+8]
+ ; A[0] * A[1]
+ mov r15, rdx
+ mulx r10, r9, rax
+ ; A[0] * A[3]
+ mulx r12, r11, QWORD PTR [rsi+24]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rsi+16]
+ mulx rbx, rcx, rax
+ ; Zero r8 and clear CF/OF so the adcx (CF) and adox (OF) chains start
+ ; without a pending carry.
+ xor r8, r8
+ adox r11, rcx
+ ; A[2] * A[3]
+ mulx r14, r13, QWORD PTR [rsi+24]
+ adox r12, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, r15
+ adox r13, r8
+ adcx r10, rcx
+ adox r14, r8
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [rsi+24]
+ adcx r11, rbx
+ adcx r12, rcx
+ adcx r13, rdx
+ adcx r14, r8
+ ; A[0] * A[0]
+ mov rdx, r15
+ mulx rcx, r8, rdx
+ ; Zero r15 and restart both chains: adcx reg, reg doubles the
+ ; off-diagonal sum while adox adds the diagonal squares.
+ xor r15, r15
+ adcx r9, r9
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r9, rcx
+ mulx rbx, rcx, rdx
+ adcx r10, r10
+ adox r10, rcx
+ adcx r11, r11
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rsi+16]
+ adox r11, rbx
+ mulx rcx, rbx, rdx
+ adcx r12, r12
+ adox r12, rbx
+ adcx r13, r13
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rsi+24]
+ adox r13, rcx
+ mulx rbx, rcx, rdx
+ adcx r14, r14
+ adox r14, rcx
+ adcx r15, r15
+ adox r15, rbx
+ ; Reduce: r8..r11 hold the low four limbs of the square, r12..r15 the
+ ; high four. The top limb times 38 is added into limb 3 first, with the
+ ; overflow collected in rbx.
+ mov rdx, 38
+ mulx rbx, r15, r15
+ add r11, r15
+ adc rbx, 0
+ ; rbx = everything at bit 255 and above; multiply it by 19 and clear
+ ; bit 255 of limb 3 (mask is 0x7FFFFFFFFFFFFFFF).
+ mov rcx, 9223372036854775807
+ shld rbx, r11, 1
+ imul rbx, rbx, 19
+ and r11, rcx
+ ; Zero rcx and clear CF/OF, then add 38 * (r12, r13, r14) into limbs
+ ; 0..3 using the two carry chains.
+ xor rcx, rcx
+ adox r8, rbx
+ mulx r12, rbx, r12
+ adcx r8, rbx
+ adox r9, r12
+ mulx r13, rbx, r13
+ adcx r9, rbx
+ adox r10, r13
+ mulx r14, rbx, r14
+ adcx r10, rbx
+ adox r11, r14
+ adcx r11, rcx
+ ; Store
+ mov QWORD PTR [rdi], r8
+ mov QWORD PTR [rdi+8], r9
+ mov QWORD PTR [rdi+16], r10
+ mov QWORD PTR [rdi+24], r11
+ ; Later iterations must square the value just stored, not the original
+ ; input: read from the output buffer from now on. Without this, a call
+ ; with r != a would recompute a^2 on every pass. mov leaves the flags
+ ; untouched and is a no-op when r == a.
+ mov rsi, rdi
+ ; Count down in the low byte of n.
+ dec bpl
+ jnz L_fe_sq_n_avx2
+ pop rbp
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+fe_sq_n_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_mul121666_avx2 PROC
+ ; fe_mul121666_avx2: r = a * 121666 for a value held as four 64-bit
+ ; limbs, least significant limb first. The part of the product at bit
+ ; 255 and above is multiplied by 19 and added back into the low limbs.
+ ; In: rcx = r (output pointer), rdx = a.
+ ; Saves and restores r12-r15. Uses rax, rdx and r8-r11 as scratch.
+ push r12
+ push r13
+ push r14
+ push r15
+ ; rdx is the implicit mulx source, so address the input through rax.
+ mov rax, rdx
+ mov rdx, 121666
+ ; Four limb products: low halves in r8..r11, high halves in r15, r14,
+ ; r13, r12. mulx does not modify flags, so it can sit inside the
+ ; add/adc chain that sums each high half into the next limb.
+ mulx r15, r8, QWORD PTR [rax]
+ mulx r14, r9, QWORD PTR [rax+8]
+ mulx r13, r10, QWORD PTR [rax+16]
+ add r9, r15
+ mulx r12, r11, QWORD PTR [rax+24]
+ adc r10, r14
+ adc r11, r13
+ adc r12, 0
+ ; r12 = everything at bit 255 and above; clear bit 255 of limb 3 and
+ ; add 19 * r12 back into the low limbs.
+ shld r12, r11, 1
+ btr r11, 63
+ imul r12, r12, 19
+ add r8, r12
+ adc r9, 0
+ adc r10, 0
+ adc r11, 0
+ mov QWORD PTR [rcx], r8
+ mov QWORD PTR [rcx+8], r9
+ mov QWORD PTR [rcx+16], r10
+ mov QWORD PTR [rcx+24], r11
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_mul121666_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_invert_avx2 PROC
+ ; fe_invert_avx2: r = a^(2^255 - 21), built from a fixed sequence of
+ ; fe_sq_avx2 / fe_sq_n_avx2 / fe_mul_avx2 calls. For the modulus
+ ; 2^255 - 19 used by those routines this exponent is the modulus
+ ; minus 2.
+ ; In: rcx = r (output pointer), rdx = a.
+ ; Stack frame (144 bytes):
+ ;   rsp+0   t0 (32 bytes)
+ ;   rsp+32  t1 (32 bytes)
+ ;   rsp+64  t2 (32 bytes)
+ ;   rsp+96  t3 (32 bytes)
+ ;   rsp+128 saved r pointer
+ ;   rsp+136 saved a pointer
+ ; The exponents in the comments below are powers of a.
+ ; NOTE(review): no 32-byte shadow space is reserved and, if the caller
+ ; entered with standard Win64 alignment, rsp is 8 mod 16 at the calls.
+ ; The three callees only push/pop general registers, so this works as
+ ; written - confirm it is intentional.
+ sub rsp, 144
+ ; Invert
+ mov QWORD PTR [rsp+128], rcx
+ mov QWORD PTR [rsp+136], rdx
+ ; t0 = a^2
+ mov rcx, rsp
+ mov rdx, QWORD PTR [rsp+136]
+ call fe_sq_avx2
+ ; t1 = t0^2 = a^4
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_avx2
+ ; t1 = t1^2 = a^8
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ ; t1 = a * t1 = a^9
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, QWORD PTR [rsp+136]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ ; t0 = t0 * t1 = a^11
+ mov rcx, rsp
+ mov rdx, rsp
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ ; t2 = t0^2 = a^22
+ lea rcx, QWORD PTR [rsp+64]
+ mov rdx, rsp
+ call fe_sq_avx2
+ ; t1 = t1 * t2 = a^31 = a^(2^5 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ ; t2 = t1^(2^5) (1 + 4 squarings) = a^(2^10 - 2^5)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 4
+ call fe_sq_n_avx2
+ ; t1 = t2 * t1 = a^(2^10 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ ; t2 = t1^(2^10) (1 + 9 squarings) = a^(2^20 - 2^10)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 9
+ call fe_sq_n_avx2
+ ; t2 = t2 * t1 = a^(2^20 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ ; t3 = t2^(2^20) (1 + 19 squarings) = a^(2^40 - 2^20)
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 19
+ call fe_sq_n_avx2
+ ; t2 = t3 * t2 = a^(2^40 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ ; t2 = t2^(2^10) (1 + 9 squarings) = a^(2^50 - 2^10)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 9
+ call fe_sq_n_avx2
+ ; t1 = t2 * t1 = a^(2^50 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ ; t2 = t1^(2^50) (1 + 49 squarings) = a^(2^100 - 2^50)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 49
+ call fe_sq_n_avx2
+ ; t2 = t2 * t1 = a^(2^100 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ ; t3 = t2^(2^100) (1 + 99 squarings) = a^(2^200 - 2^100)
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 99
+ call fe_sq_n_avx2
+ ; t2 = t3 * t2 = a^(2^200 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ ; t2 = t2^(2^50) (1 + 49 squarings) = a^(2^250 - 2^50)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 49
+ call fe_sq_n_avx2
+ ; t1 = t2 * t1 = a^(2^250 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ ; t1 = t1^(2^5) (1 + 4 squarings) = a^(2^255 - 2^5)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 4
+ call fe_sq_n_avx2
+ ; r = t1 * t0 = a^(2^255 - 32 + 11) = a^(2^255 - 21)
+ mov rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_avx2
+ ; Reload the original argument registers before releasing the frame.
+ mov rdx, QWORD PTR [rsp+136]
+ mov rcx, QWORD PTR [rsp+128]
+ add rsp, 144
+ ret
+fe_invert_avx2 ENDP
+_TEXT ENDS
+IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519
+_DATA SEGMENT
+ALIGN 16
+L_curve25519_base_avx2_x2 QWORD 5cae469cdd684efbh, 8f3f5ced1e350b5ch
+ QWORD 0d9750c687d157114h, 20d342d51873f1b7h
+ptr_L_curve25519_base_avx2_x2 QWORD L_curve25519_base_avx2_x2
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+curve25519_base_avx2 PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ mov r8, rcx
+ mov r9, rdx
+ sub rsp, 176
+ mov QWORD PTR [rsp+168], 0
+ mov QWORD PTR [rsp+160], r8
+ ; Set base point x
+ mov QWORD PTR [r8], 9
+ mov QWORD PTR [r8+8], 0
+ mov QWORD PTR [r8+16], 0
+ mov QWORD PTR [r8+24], 0
+ ; Set one
+ mov QWORD PTR [rsp], 1
+ mov QWORD PTR [rsp+8], 0
+ mov QWORD PTR [rsp+16], 0
+ mov QWORD PTR [rsp+24], 0
+ mov r10, QWORD PTR [ptr_L_curve25519_base_avx2_x2]
+ mov r11, QWORD PTR [ptr_L_curve25519_base_avx2_x2+8]
+ mov r12, QWORD PTR [ptr_L_curve25519_base_avx2_x2+16]
+ mov r13, QWORD PTR [ptr_L_curve25519_base_avx2_x2+24]
+ ; Set one
+ mov QWORD PTR [rsp+32], 1
+ mov QWORD PTR [rsp+40], 0
+ mov QWORD PTR [rsp+48], 0
+ mov QWORD PTR [rsp+56], 0
+ mov QWORD PTR [rsp+64], r10
+ mov QWORD PTR [rsp+72], r11
+ mov QWORD PTR [rsp+80], r12
+ mov QWORD PTR [rsp+88], r13
+ mov rbp, 253
+L_curve25519_base_avx2_bits:
+ mov rax, QWORD PTR [rsp+168]
+ mov rbx, rbp
+ mov rcx, rbp
+ shr rbx, 6
+ and rcx, 63
+ mov rbx, QWORD PTR [r9+8*rbx]
+ shr rbx, cl
+ and rbx, 1
+ xor rax, rbx
+ neg rax
+ ; Conditional Swap
+ mov r10, QWORD PTR [r8]
+ mov r11, QWORD PTR [r8+8]
+ mov r12, QWORD PTR [r8+16]
+ mov r13, QWORD PTR [r8+24]
+ mov r14, QWORD PTR [rsp]
+ mov r15, QWORD PTR [rsp+8]
+ mov rdi, QWORD PTR [rsp+16]
+ mov rsi, QWORD PTR [rsp+24]
+ xor r10, QWORD PTR [rsp+64]
+ xor r11, QWORD PTR [rsp+72]
+ xor r12, QWORD PTR [rsp+80]
+ xor r13, QWORD PTR [rsp+88]
+ xor r14, QWORD PTR [rsp+32]
+ xor r15, QWORD PTR [rsp+40]
+ xor rdi, QWORD PTR [rsp+48]
+ xor rsi, QWORD PTR [rsp+56]
+ and r10, rax
+ and r11, rax
+ and r12, rax
+ and r13, rax
+ and r14, rax
+ and r15, rax
+ and rdi, rax
+ and rsi, rax
+ xor QWORD PTR [r8], r10
+ xor QWORD PTR [r8+8], r11
+ xor QWORD PTR [r8+16], r12
+ xor QWORD PTR [r8+24], r13
+ xor QWORD PTR [rsp], r14
+ xor QWORD PTR [rsp+8], r15
+ xor QWORD PTR [rsp+16], rdi
+ xor QWORD PTR [rsp+24], rsi
+ xor QWORD PTR [rsp+64], r10
+ xor QWORD PTR [rsp+72], r11
+ xor QWORD PTR [rsp+80], r12
+ xor QWORD PTR [rsp+88], r13
+ xor QWORD PTR [rsp+32], r14
+ xor QWORD PTR [rsp+40], r15
+ xor QWORD PTR [rsp+48], rdi
+ xor QWORD PTR [rsp+56], rsi
+ mov QWORD PTR [rsp+168], rbx
+ ; Add-Sub
+ ; Add
+ mov r10, QWORD PTR [r8]
+ mov r11, QWORD PTR [r8+8]
+ mov r12, QWORD PTR [r8+16]
+ mov r13, QWORD PTR [r8+24]
+ mov r14, r10
+ add r10, QWORD PTR [rsp]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+8]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+16]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+24]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r13, 1
+ imul rbx, 19
+ btr r13, 63
+ ; Sub modulus (if overflow)
+ add r10, rbx
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ ; Sub
+ sub r14, QWORD PTR [rsp]
+ sbb r15, QWORD PTR [rsp+8]
+ sbb rdi, QWORD PTR [rsp+16]
+ sbb rsi, QWORD PTR [rsp+24]
+ sbb rbx, rbx
+ shld rbx, rsi, 1
+ imul rbx, -19
+ btr rsi, 63
+ ; Add modulus (if underflow)
+ sub r14, rbx
+ sbb r15, 0
+ sbb rdi, 0
+ sbb rsi, 0
+ mov QWORD PTR [r8], r10
+ mov QWORD PTR [r8+8], r11
+ mov QWORD PTR [r8+16], r12
+ mov QWORD PTR [r8+24], r13
+ mov QWORD PTR [rsp+128], r14
+ mov QWORD PTR [rsp+136], r15
+ mov QWORD PTR [rsp+144], rdi
+ mov QWORD PTR [rsp+152], rsi
+ ; Add-Sub
+ ; Add
+ mov r10, QWORD PTR [rsp+64]
+ mov r11, QWORD PTR [rsp+72]
+ mov r12, QWORD PTR [rsp+80]
+ mov r13, QWORD PTR [rsp+88]
+ mov r14, r10
+ add r10, QWORD PTR [rsp+32]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+40]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+48]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+56]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r13, 1
+ imul rbx, 19
+ btr r13, 63
+ ; Sub modulus (if overflow)
+ add r10, rbx
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ ; Sub
+ sub r14, QWORD PTR [rsp+32]
+ sbb r15, QWORD PTR [rsp+40]
+ sbb rdi, QWORD PTR [rsp+48]
+ sbb rsi, QWORD PTR [rsp+56]
+ sbb rbx, rbx
+ shld rbx, rsi, 1
+ imul rbx, -19
+ btr rsi, 63
+ ; Add modulus (if underflow)
+ sub r14, rbx
+ sbb r15, 0
+ sbb rdi, 0
+ sbb rsi, 0
+ mov QWORD PTR [rsp+32], r10
+ mov QWORD PTR [rsp+40], r11
+ mov QWORD PTR [rsp+48], r12
+ mov QWORD PTR [rsp+56], r13
+ mov QWORD PTR [rsp+96], r14
+ mov QWORD PTR [rsp+104], r15
+ mov QWORD PTR [rsp+112], rdi
+ mov QWORD PTR [rsp+120], rsi
+ mov rax, QWORD PTR [rsp+32]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp+128]
+ mulx r11, r10, rax
+ ; A[2] * B[0]
+ mulx r13, r12, QWORD PTR [rsp+48]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+40]
+ xor rsi, rsi
+ adcx r11, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+136]
+ mulx r15, r14, QWORD PTR [rsp+56]
+ adcx r12, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r11, rcx
+ ; A[2] * B[1]
+ mulx rdi, rcx, QWORD PTR [rsp+48]
+ adox r12, rbx
+ adcx r13, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rbx, rcx, QWORD PTR [rsp+40]
+ adcx r14, rdi
+ adox r13, rcx
+ adcx r15, rsi
+ adox r14, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox r15, rsi
+ xor rdi, rdi
+ adcx r12, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+136]
+ mulx rcx, rdx, QWORD PTR [rsp+40]
+ adcx r13, rbx
+ adox r12, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox r13, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+40]
+ adcx r14, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rcx, rdx, QWORD PTR [rsp+48]
+ adcx r15, rbx
+ adox r14, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox r15, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+56]
+ adox rdi, rsi
+ adcx rdi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rsi, rbx
+ xor rbx, rbx
+ adcx r13, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+56]
+ adcx r14, rcx
+ mulx rcx, rdx, QWORD PTR [rsp+128]
+ adox r13, rdx
+ adox r14, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+56]
+ mulx rcx, rdx, QWORD PTR [rsp+144]
+ adcx r15, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adcx rdi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+48]
+ adcx rsi, rbx
+ adox r15, rcx
+ adox rdi, rdx
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rcx, rsi, rsi
+ add r13, rsi
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r13, 1
+ imul rcx, rcx, 19
+ and r13, rbx
+ xor rbx, rbx
+ adox r10, rcx
+ mulx r14, rcx, r14
+ adcx r10, rcx
+ adox r11, r14
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ adcx r13, rbx
+ ; Store
+ mov QWORD PTR [rsp+32], r10
+ mov QWORD PTR [rsp+40], r11
+ mov QWORD PTR [rsp+48], r12
+ mov QWORD PTR [rsp+56], r13
+ mov rax, QWORD PTR [rsp+96]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r11, r10, rax
+ ; A[2] * B[0]
+ mulx r13, r12, QWORD PTR [rsp+112]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ xor rsi, rsi
+ adcx r11, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r15, r14, QWORD PTR [rsp+120]
+ adcx r12, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r11, rcx
+ ; A[2] * B[1]
+ mulx rdi, rcx, QWORD PTR [rsp+112]
+ adox r12, rbx
+ adcx r13, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ adcx r14, rdi
+ adox r13, rcx
+ adcx r15, rsi
+ adox r14, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox r15, rsi
+ xor rdi, rdi
+ adcx r12, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rcx, rdx, QWORD PTR [rsp+104]
+ adcx r13, rbx
+ adox r12, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r13, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ adcx r14, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx rcx, rdx, QWORD PTR [rsp+112]
+ adcx r15, rbx
+ adox r14, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+120]
+ adox rdi, rsi
+ adcx rdi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rsi, rbx
+ xor rbx, rbx
+ adcx r13, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+120]
+ adcx r14, rcx
+ mulx rcx, rdx, QWORD PTR [r8]
+ adox r13, rdx
+ adox r14, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+120]
+ mulx rcx, rdx, QWORD PTR [r8+16]
+ adcx r15, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rdi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+112]
+ adcx rsi, rbx
+ adox r15, rcx
+ adox rdi, rdx
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rcx, rsi, rsi
+ add r13, rsi
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r13, 1
+ imul rcx, rcx, 19
+ and r13, rbx
+ xor rbx, rbx
+ adox r10, rcx
+ mulx r14, rcx, r14
+ adcx r10, rcx
+ adox r11, r14
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ adcx r13, rbx
+ ; Store
+ mov QWORD PTR [rsp], r10
+ mov QWORD PTR [rsp+8], r11
+ mov QWORD PTR [rsp+16], r12
+ mov QWORD PTR [rsp+24], r13
+ ; Square
+ mov rdx, QWORD PTR [rsp+128]
+ mov rax, QWORD PTR [rsp+136]
+ ; A[0] * A[1]
+ mov rsi, rdx
+ mulx r12, r11, rax
+ ; A[0] * A[3]
+ mulx r14, r13, QWORD PTR [rsp+152]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rbx, rcx, rax
+ xor r10, r10
+ adox r13, rcx
+ ; A[2] * A[3]
+ mulx rdi, r15, QWORD PTR [rsp+152]
+ adox r14, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rsi
+ adox r15, r10
+ adcx r12, rcx
+ adox rdi, r10
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [rsp+152]
+ adcx r13, rbx
+ adcx r14, rcx
+ adcx r15, rdx
+ adcx rdi, r10
+ ; A[0] * A[0]
+ mov rdx, rsi
+ mulx rcx, r10, rdx
+ xor rsi, rsi
+ adcx r11, r11
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r11, rcx
+ mulx rbx, rcx, rdx
+ adcx r12, r12
+ adox r12, rcx
+ adcx r13, r13
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rsp+144]
+ adox r13, rbx
+ mulx rcx, rbx, rdx
+ adcx r14, r14
+ adox r14, rbx
+ adcx r15, r15
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox r15, rcx
+ mulx rbx, rcx, rdx
+ adcx rdi, rdi
+ adox rdi, rcx
+ adcx rsi, rsi
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rbx, rsi, rsi
+ add r13, rsi
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r13, 1
+ imul rbx, rbx, 19
+ and r13, rcx
+ xor rcx, rcx
+ adox r10, rbx
+ mulx r14, rbx, r14
+ adcx r10, rbx
+ adox r11, r14
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ adcx r13, rcx
+ ; Store
+ mov QWORD PTR [rsp+96], r10
+ mov QWORD PTR [rsp+104], r11
+ mov QWORD PTR [rsp+112], r12
+ mov QWORD PTR [rsp+120], r13
+ ; Square
+ mov rdx, QWORD PTR [r8]
+ mov rax, QWORD PTR [r8+8]
+ ; A[0] * A[1]
+ mov rsi, rdx
+ mulx r12, r11, rax
+ ; A[0] * A[3]
+ mulx r14, r13, QWORD PTR [r8+24]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [r8+16]
+ mulx rbx, rcx, rax
+ xor r10, r10
+ adox r13, rcx
+ ; A[2] * A[3]
+ mulx rdi, r15, QWORD PTR [r8+24]
+ adox r14, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rsi
+ adox r15, r10
+ adcx r12, rcx
+ adox rdi, r10
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [r8+24]
+ adcx r13, rbx
+ adcx r14, rcx
+ adcx r15, rdx
+ adcx rdi, r10
+ ; A[0] * A[0]
+ mov rdx, rsi
+ mulx rcx, r10, rdx
+ xor rsi, rsi
+ adcx r11, r11
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r11, rcx
+ mulx rbx, rcx, rdx
+ adcx r12, r12
+ adox r12, rcx
+ adcx r13, r13
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [r8+16]
+ adox r13, rbx
+ mulx rcx, rbx, rdx
+ adcx r14, r14
+ adox r14, rbx
+ adcx r15, r15
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, rcx
+ mulx rbx, rcx, rdx
+ adcx rdi, rdi
+ adox rdi, rcx
+ adcx rsi, rsi
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rbx, rsi, rsi
+ add r13, rsi
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r13, 1
+ imul rbx, rbx, 19
+ and r13, rcx
+ xor rcx, rcx
+ adox r10, rbx
+ mulx r14, rbx, r14
+ adcx r10, rbx
+ adox r11, r14
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ adcx r13, rcx
+ ; Store
+ mov QWORD PTR [rsp+128], r10
+ mov QWORD PTR [rsp+136], r11
+ mov QWORD PTR [rsp+144], r12
+ mov QWORD PTR [rsp+152], r13
+ ; Add-Sub
+ ; Add
+ mov r10, QWORD PTR [rsp]
+ mov r11, QWORD PTR [rsp+8]
+ mov r12, QWORD PTR [rsp+16]
+ mov r13, QWORD PTR [rsp+24]
+ mov r14, r10
+ add r10, QWORD PTR [rsp+32]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+40]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+48]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+56]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r13, 1
+ imul rbx, 19
+ btr r13, 63
+ ; Sub modulus (if overflow)
+ add r10, rbx
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ ; Sub
+ sub r14, QWORD PTR [rsp+32]
+ sbb r15, QWORD PTR [rsp+40]
+ sbb rdi, QWORD PTR [rsp+48]
+ sbb rsi, QWORD PTR [rsp+56]
+ sbb rbx, rbx
+ shld rbx, rsi, 1
+ imul rbx, -19
+ btr rsi, 63
+ ; Add modulus (if underflow)
+ sub r14, rbx
+ sbb r15, 0
+ sbb rdi, 0
+ sbb rsi, 0
+ mov QWORD PTR [rsp+64], r10
+ mov QWORD PTR [rsp+72], r11
+ mov QWORD PTR [rsp+80], r12
+ mov QWORD PTR [rsp+88], r13
+ mov QWORD PTR [rsp+32], r14
+ mov QWORD PTR [rsp+40], r15
+ mov QWORD PTR [rsp+48], rdi
+ mov QWORD PTR [rsp+56], rsi
+ mov rax, QWORD PTR [rsp+128]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp+96]
+ mulx r11, r10, rax
+ ; A[2] * B[0]
+ mulx r13, r12, QWORD PTR [rsp+144]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ xor rsi, rsi
+ adcx r11, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx r15, r14, QWORD PTR [rsp+152]
+ adcx r12, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r11, rcx
+ ; A[2] * B[1]
+ mulx rdi, rcx, QWORD PTR [rsp+144]
+ adox r12, rbx
+ adcx r13, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r14, rdi
+ adox r13, rcx
+ adcx r15, rsi
+ adox r14, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox r15, rsi
+ xor rdi, rdi
+ adcx r12, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx rcx, rdx, QWORD PTR [rsp+136]
+ adcx r13, rbx
+ adox r12, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox r13, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r14, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rcx, rdx, QWORD PTR [rsp+144]
+ adcx r15, rbx
+ adox r14, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox r15, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+152]
+ adox rdi, rsi
+ adcx rdi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rsi, rbx
+ xor rbx, rbx
+ adcx r13, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+152]
+ adcx r14, rcx
+ mulx rcx, rdx, QWORD PTR [rsp+96]
+ adox r13, rdx
+ adox r14, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+152]
+ mulx rcx, rdx, QWORD PTR [rsp+112]
+ adcx r15, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adcx rdi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+144]
+ adcx rsi, rbx
+ adox r15, rcx
+ adox rdi, rdx
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rcx, rsi, rsi
+ add r13, rsi
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r13, 1
+ imul rcx, rcx, 19
+ and r13, rbx
+ xor rbx, rbx
+ adox r10, rcx
+ mulx r14, rcx, r14
+ adcx r10, rcx
+ adox r11, r14
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ adcx r13, rbx
+ ; Store
+ mov QWORD PTR [r8], r10
+ mov QWORD PTR [r8+8], r11
+ mov QWORD PTR [r8+16], r12
+ mov QWORD PTR [r8+24], r13
+ ; Sub
+ mov r10, QWORD PTR [rsp+128]
+ mov r11, QWORD PTR [rsp+136]
+ mov r12, QWORD PTR [rsp+144]
+ mov r13, QWORD PTR [rsp+152]
+ sub r10, QWORD PTR [rsp+96]
+ sbb r11, QWORD PTR [rsp+104]
+ sbb r12, QWORD PTR [rsp+112]
+ sbb r13, QWORD PTR [rsp+120]
+ sbb rbx, rbx
+ shld rbx, r13, 1
+ imul rbx, -19
+ btr r13, 63
+ ; Add modulus (if underflow)
+ sub r10, rbx
+ sbb r11, 0
+ sbb r12, 0
+ sbb r13, 0
+ mov QWORD PTR [rsp+128], r10
+ mov QWORD PTR [rsp+136], r11
+ mov QWORD PTR [rsp+144], r12
+ mov QWORD PTR [rsp+152], r13
+ ; Square
+ mov rdx, QWORD PTR [rsp+32]
+ mov rax, QWORD PTR [rsp+40]
+ ; A[0] * A[1]
+ mov rsi, rdx
+ mulx r12, r11, rax
+ ; A[0] * A[3]
+ mulx r14, r13, QWORD PTR [rsp+56]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rsp+48]
+ mulx rbx, rcx, rax
+ xor r10, r10
+ adox r13, rcx
+ ; A[2] * A[3]
+ mulx rdi, r15, QWORD PTR [rsp+56]
+ adox r14, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rsi
+ adox r15, r10
+ adcx r12, rcx
+ adox rdi, r10
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [rsp+56]
+ adcx r13, rbx
+ adcx r14, rcx
+ adcx r15, rdx
+ adcx rdi, r10
+ ; A[0] * A[0]
+ mov rdx, rsi
+ mulx rcx, r10, rdx
+ xor rsi, rsi
+ adcx r11, r11
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r11, rcx
+ mulx rbx, rcx, rdx
+ adcx r12, r12
+ adox r12, rcx
+ adcx r13, r13
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rsp+48]
+ adox r13, rbx
+ mulx rcx, rbx, rdx
+ adcx r14, r14
+ adox r14, rbx
+ adcx r15, r15
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rsp+56]
+ adox r15, rcx
+ mulx rbx, rcx, rdx
+ adcx rdi, rdi
+ adox rdi, rcx
+ adcx rsi, rsi
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rbx, rsi, rsi
+ add r13, rsi
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r13, 1
+ imul rbx, rbx, 19
+ and r13, rcx
+ xor rcx, rcx
+ adox r10, rbx
+ mulx r14, rbx, r14
+ adcx r10, rbx
+ adox r11, r14
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ adcx r13, rcx
+ ; Store
+ mov QWORD PTR [rsp+32], r10
+ mov QWORD PTR [rsp+40], r11
+ mov QWORD PTR [rsp+48], r12
+ mov QWORD PTR [rsp+56], r13
+ ; Square
+ mov rdx, QWORD PTR [rsp+64]
+ mov rax, QWORD PTR [rsp+72]
+ ; A[0] * A[1]
+ mov rsi, rdx
+ mulx r12, r11, rax
+ ; A[0] * A[3]
+ mulx r14, r13, QWORD PTR [rsp+88]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rsp+80]
+ mulx rbx, rcx, rax
+ xor r10, r10
+ adox r13, rcx
+ ; A[2] * A[3]
+ mulx rdi, r15, QWORD PTR [rsp+88]
+ adox r14, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rsi
+ adox r15, r10
+ adcx r12, rcx
+ adox rdi, r10
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [rsp+88]
+ adcx r13, rbx
+ adcx r14, rcx
+ adcx r15, rdx
+ adcx rdi, r10
+ ; A[0] * A[0]
+ mov rdx, rsi
+ mulx rcx, r10, rdx
+ xor rsi, rsi
+ adcx r11, r11
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r11, rcx
+ mulx rbx, rcx, rdx
+ adcx r12, r12
+ adox r12, rcx
+ adcx r13, r13
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rsp+80]
+ adox r13, rbx
+ mulx rcx, rbx, rdx
+ adcx r14, r14
+ adox r14, rbx
+ adcx r15, r15
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rsp+88]
+ adox r15, rcx
+ mulx rbx, rcx, rdx
+ adcx rdi, rdi
+ adox rdi, rcx
+ adcx rsi, rsi
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rbx, rsi, rsi
+ add r13, rsi
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r13, 1
+ imul rbx, rbx, 19
+ and r13, rcx
+ xor rcx, rcx
+ adox r10, rbx
+ mulx r14, rbx, r14
+ adcx r10, rbx
+ adox r11, r14
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ adcx r13, rcx
+ ; Store
+ mov QWORD PTR [rsp+64], r10
+ mov QWORD PTR [rsp+72], r11
+ mov QWORD PTR [rsp+80], r12
+ mov QWORD PTR [rsp+88], r13
+ mov rdx, 121666
+ mulx rsi, r10, QWORD PTR [rsp+128]
+ mulx rdi, r11, QWORD PTR [rsp+136]
+ mulx r15, r12, QWORD PTR [rsp+144]
+ add r11, rsi
+ mulx r14, r13, QWORD PTR [rsp+152]
+ adc r12, rdi
+ adc r13, r15
+ adc r14, 0
+ add r10, QWORD PTR [rsp+96]
+ adc r11, QWORD PTR [rsp+104]
+ adc r12, QWORD PTR [rsp+112]
+ adc r13, QWORD PTR [rsp+120]
+ adc r14, 0
+ shld r14, r13, 1
+ btr r13, 63
+ imul r14, r14, 19
+ add r10, r14
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ mov QWORD PTR [rsp+96], r10
+ mov QWORD PTR [rsp+104], r11
+ mov QWORD PTR [rsp+112], r12
+ mov QWORD PTR [rsp+120], r13
+ mov rdx, 9
+ mulx rsi, r10, QWORD PTR [rsp+32]
+ mulx rdi, r11, QWORD PTR [rsp+40]
+ mulx r15, r12, QWORD PTR [rsp+48]
+ add r11, rsi
+ mulx r14, r13, QWORD PTR [rsp+56]
+ adc r12, rdi
+ adc r13, r15
+ adc r14, 0
+ shld r14, r13, 1
+ btr r13, 63
+ imul r14, r14, 19
+ add r10, r14
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ mov QWORD PTR [rsp+32], r10
+ mov QWORD PTR [rsp+40], r11
+ mov QWORD PTR [rsp+48], r12
+ mov QWORD PTR [rsp+56], r13
+ mov rax, QWORD PTR [rsp+128]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp+96]
+ mulx r11, r10, rax
+ ; A[2] * B[0]
+ mulx r13, r12, QWORD PTR [rsp+144]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ xor rsi, rsi
+ adcx r11, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx r15, r14, QWORD PTR [rsp+152]
+ adcx r12, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r11, rcx
+ ; A[2] * B[1]
+ mulx rdi, rcx, QWORD PTR [rsp+144]
+ adox r12, rbx
+ adcx r13, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r14, rdi
+ adox r13, rcx
+ adcx r15, rsi
+ adox r14, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox r15, rsi
+ xor rdi, rdi
+ adcx r12, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx rcx, rdx, QWORD PTR [rsp+136]
+ adcx r13, rbx
+ adox r12, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox r13, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r14, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rcx, rdx, QWORD PTR [rsp+144]
+ adcx r15, rbx
+ adox r14, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox r15, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+152]
+ adox rdi, rsi
+ adcx rdi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rsi, rbx
+ xor rbx, rbx
+ adcx r13, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+152]
+ adcx r14, rcx
+ mulx rcx, rdx, QWORD PTR [rsp+96]
+ adox r13, rdx
+ adox r14, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+152]
+ mulx rcx, rdx, QWORD PTR [rsp+112]
+ adcx r15, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adcx rdi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+144]
+ adcx rsi, rbx
+ adox r15, rcx
+ adox rdi, rdx
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rcx, rsi, rsi
+ add r13, rsi
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r13, 1
+ imul rcx, rcx, 19
+ and r13, rbx
+ xor rbx, rbx
+ adox r10, rcx
+ mulx r14, rcx, r14
+ adcx r10, rcx
+ adox r11, r14
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ adcx r13, rbx
+ ; Store
+ mov QWORD PTR [rsp], r10
+ mov QWORD PTR [rsp+8], r11
+ mov QWORD PTR [rsp+16], r12
+ mov QWORD PTR [rsp+24], r13
+ dec rbp
+ cmp rbp, 3
+ jge L_curve25519_base_avx2_bits
+ mov rax, QWORD PTR [rsp+168]
+ neg rax
+ ; Conditional Swap
+ mov r10, QWORD PTR [r8]
+ mov r11, QWORD PTR [r8+8]
+ mov r12, QWORD PTR [r8+16]
+ mov r13, QWORD PTR [r8+24]
+ mov r14, QWORD PTR [rsp]
+ mov r15, QWORD PTR [rsp+8]
+ mov rdi, QWORD PTR [rsp+16]
+ mov rsi, QWORD PTR [rsp+24]
+ xor r10, QWORD PTR [rsp+64]
+ xor r11, QWORD PTR [rsp+72]
+ xor r12, QWORD PTR [rsp+80]
+ xor r13, QWORD PTR [rsp+88]
+ xor r14, QWORD PTR [rsp+32]
+ xor r15, QWORD PTR [rsp+40]
+ xor rdi, QWORD PTR [rsp+48]
+ xor rsi, QWORD PTR [rsp+56]
+ and r10, rax
+ and r11, rax
+ and r12, rax
+ and r13, rax
+ and r14, rax
+ and r15, rax
+ and rdi, rax
+ and rsi, rax
+ xor QWORD PTR [r8], r10
+ xor QWORD PTR [r8+8], r11
+ xor QWORD PTR [r8+16], r12
+ xor QWORD PTR [r8+24], r13
+ xor QWORD PTR [rsp], r14
+ xor QWORD PTR [rsp+8], r15
+ xor QWORD PTR [rsp+16], rdi
+ xor QWORD PTR [rsp+24], rsi
+ xor QWORD PTR [rsp+64], r10
+ xor QWORD PTR [rsp+72], r11
+ xor QWORD PTR [rsp+80], r12
+ xor QWORD PTR [rsp+88], r13
+ xor QWORD PTR [rsp+32], r14
+ xor QWORD PTR [rsp+40], r15
+ xor QWORD PTR [rsp+48], rdi
+ xor QWORD PTR [rsp+56], rsi
+L_curve25519_base_avx2_last_3:
+ ; Add-Sub
+ ; Add
+ mov r10, QWORD PTR [r8]
+ mov r11, QWORD PTR [r8+8]
+ mov r12, QWORD PTR [r8+16]
+ mov r13, QWORD PTR [r8+24]
+ mov r14, r10
+ add r10, QWORD PTR [rsp]
+ mov r15, r11
+ adc r11, QWORD PTR [rsp+8]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+16]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+24]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r13, 1
+ imul rbx, 19
+ btr r13, 63
+ ; Sub modulus (if overflow)
+ add r10, rbx
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ ; Sub
+ sub r14, QWORD PTR [rsp]
+ sbb r15, QWORD PTR [rsp+8]
+ sbb rdi, QWORD PTR [rsp+16]
+ sbb rsi, QWORD PTR [rsp+24]
+ sbb rbx, rbx
+ shld rbx, rsi, 1
+ imul rbx, -19
+ btr rsi, 63
+ ; Add modulus (if underflow)
+ sub r14, rbx
+ sbb r15, 0
+ sbb rdi, 0
+ sbb rsi, 0
+ mov QWORD PTR [r8], r10
+ mov QWORD PTR [r8+8], r11
+ mov QWORD PTR [r8+16], r12
+ mov QWORD PTR [r8+24], r13
+ mov QWORD PTR [rsp+128], r14
+ mov QWORD PTR [rsp+136], r15
+ mov QWORD PTR [rsp+144], rdi
+ mov QWORD PTR [rsp+152], rsi
+ ; Square
+ mov rdx, QWORD PTR [rsp+128]
+ mov rax, QWORD PTR [rsp+136]
+ ; A[0] * A[1]
+ mov rsi, rdx
+ mulx r12, r11, rax
+ ; A[0] * A[3]
+ mulx r14, r13, QWORD PTR [rsp+152]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rbx, rcx, rax
+ xor r10, r10
+ adox r13, rcx
+ ; A[2] * A[3]
+ mulx rdi, r15, QWORD PTR [rsp+152]
+ adox r14, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rsi
+ adox r15, r10
+ adcx r12, rcx
+ adox rdi, r10
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [rsp+152]
+ adcx r13, rbx
+ adcx r14, rcx
+ adcx r15, rdx
+ adcx rdi, r10
+ ; A[0] * A[0]
+ mov rdx, rsi
+ mulx rcx, r10, rdx
+ xor rsi, rsi
+ adcx r11, r11
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r11, rcx
+ mulx rbx, rcx, rdx
+ adcx r12, r12
+ adox r12, rcx
+ adcx r13, r13
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rsp+144]
+ adox r13, rbx
+ mulx rcx, rbx, rdx
+ adcx r14, r14
+ adox r14, rbx
+ adcx r15, r15
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox r15, rcx
+ mulx rbx, rcx, rdx
+ adcx rdi, rdi
+ adox rdi, rcx
+ adcx rsi, rsi
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rbx, rsi, rsi
+ add r13, rsi
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r13, 1
+ imul rbx, rbx, 19
+ and r13, rcx
+ xor rcx, rcx
+ adox r10, rbx
+ mulx r14, rbx, r14
+ adcx r10, rbx
+ adox r11, r14
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ adcx r13, rcx
+ ; Store
+ mov QWORD PTR [rsp+96], r10
+ mov QWORD PTR [rsp+104], r11
+ mov QWORD PTR [rsp+112], r12
+ mov QWORD PTR [rsp+120], r13
+ ; Square
+ mov rdx, QWORD PTR [r8]
+ mov rax, QWORD PTR [r8+8]
+ ; A[0] * A[1]
+ mov rsi, rdx
+ mulx r12, r11, rax
+ ; A[0] * A[3]
+ mulx r14, r13, QWORD PTR [r8+24]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [r8+16]
+ mulx rbx, rcx, rax
+ xor r10, r10
+ adox r13, rcx
+ ; A[2] * A[3]
+ mulx rdi, r15, QWORD PTR [r8+24]
+ adox r14, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rsi
+ adox r15, r10
+ adcx r12, rcx
+ adox rdi, r10
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [r8+24]
+ adcx r13, rbx
+ adcx r14, rcx
+ adcx r15, rdx
+ adcx rdi, r10
+ ; A[0] * A[0]
+ mov rdx, rsi
+ mulx rcx, r10, rdx
+ xor rsi, rsi
+ adcx r11, r11
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r11, rcx
+ mulx rbx, rcx, rdx
+ adcx r12, r12
+ adox r12, rcx
+ adcx r13, r13
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [r8+16]
+ adox r13, rbx
+ mulx rcx, rbx, rdx
+ adcx r14, r14
+ adox r14, rbx
+ adcx r15, r15
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, rcx
+ mulx rbx, rcx, rdx
+ adcx rdi, rdi
+ adox rdi, rcx
+ adcx rsi, rsi
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rbx, rsi, rsi
+ add r13, rsi
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r13, 1
+ imul rbx, rbx, 19
+ and r13, rcx
+ xor rcx, rcx
+ adox r10, rbx
+ mulx r14, rbx, r14
+ adcx r10, rbx
+ adox r11, r14
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ adcx r13, rcx
+ ; Store
+ mov QWORD PTR [rsp+128], r10
+ mov QWORD PTR [rsp+136], r11
+ mov QWORD PTR [rsp+144], r12
+ mov QWORD PTR [rsp+152], r13
+ mov rax, QWORD PTR [rsp+128]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp+96]
+ mulx r11, r10, rax
+ ; A[2] * B[0]
+ mulx r13, r12, QWORD PTR [rsp+144]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ xor rsi, rsi
+ adcx r11, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx r15, r14, QWORD PTR [rsp+152]
+ adcx r12, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r11, rcx
+ ; A[2] * B[1]
+ mulx rdi, rcx, QWORD PTR [rsp+144]
+ adox r12, rbx
+ adcx r13, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r14, rdi
+ adox r13, rcx
+ adcx r15, rsi
+ adox r14, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox r15, rsi
+ xor rdi, rdi
+ adcx r12, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx rcx, rdx, QWORD PTR [rsp+136]
+ adcx r13, rbx
+ adox r12, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox r13, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r14, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rcx, rdx, QWORD PTR [rsp+144]
+ adcx r15, rbx
+ adox r14, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox r15, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+152]
+ adox rdi, rsi
+ adcx rdi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rsi, rbx
+ xor rbx, rbx
+ adcx r13, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+152]
+ adcx r14, rcx
+ mulx rcx, rdx, QWORD PTR [rsp+96]
+ adox r13, rdx
+ adox r14, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+152]
+ mulx rcx, rdx, QWORD PTR [rsp+112]
+ adcx r15, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adcx rdi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+144]
+ adcx rsi, rbx
+ adox r15, rcx
+ adox rdi, rdx
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rcx, rsi, rsi
+ add r13, rsi
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r13, 1
+ imul rcx, rcx, 19
+ and r13, rbx
+ xor rbx, rbx
+ adox r10, rcx
+ mulx r14, rcx, r14
+ adcx r10, rcx
+ adox r11, r14
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ adcx r13, rbx
+ ; Store
+ mov QWORD PTR [r8], r10
+ mov QWORD PTR [r8+8], r11
+ mov QWORD PTR [r8+16], r12
+ mov QWORD PTR [r8+24], r13
+ ; Sub
+ mov r10, QWORD PTR [rsp+128]
+ mov r11, QWORD PTR [rsp+136]
+ mov r12, QWORD PTR [rsp+144]
+ mov r13, QWORD PTR [rsp+152]
+ sub r10, QWORD PTR [rsp+96]
+ sbb r11, QWORD PTR [rsp+104]
+ sbb r12, QWORD PTR [rsp+112]
+ sbb r13, QWORD PTR [rsp+120]
+ sbb rbx, rbx
+ shld rbx, r13, 1
+ imul rbx, -19
+ btr r13, 63
+ ; Add modulus (if underflow)
+ sub r10, rbx
+ sbb r11, 0
+ sbb r12, 0
+ sbb r13, 0
+ mov QWORD PTR [rsp+128], r10
+ mov QWORD PTR [rsp+136], r11
+ mov QWORD PTR [rsp+144], r12
+ mov QWORD PTR [rsp+152], r13
+ mov rdx, 121666
+ mulx rsi, r10, QWORD PTR [rsp+128]
+ mulx rdi, r11, QWORD PTR [rsp+136]
+ mulx r15, r12, QWORD PTR [rsp+144]
+ add r11, rsi
+ mulx r14, r13, QWORD PTR [rsp+152]
+ adc r12, rdi
+ adc r13, r15
+ adc r14, 0
+ add r10, QWORD PTR [rsp+96]
+ adc r11, QWORD PTR [rsp+104]
+ adc r12, QWORD PTR [rsp+112]
+ adc r13, QWORD PTR [rsp+120]
+ adc r14, 0
+ shld r14, r13, 1
+ btr r13, 63
+ imul r14, r14, 19
+ add r10, r14
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ mov QWORD PTR [rsp+96], r10
+ mov QWORD PTR [rsp+104], r11
+ mov QWORD PTR [rsp+112], r12
+ mov QWORD PTR [rsp+120], r13
+ mov rax, QWORD PTR [rsp+128]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp+96]
+ mulx r11, r10, rax
+ ; A[2] * B[0]
+ mulx r13, r12, QWORD PTR [rsp+144]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ xor rsi, rsi
+ adcx r11, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx r15, r14, QWORD PTR [rsp+152]
+ adcx r12, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r11, rcx
+ ; A[2] * B[1]
+ mulx rdi, rcx, QWORD PTR [rsp+144]
+ adox r12, rbx
+ adcx r13, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r14, rdi
+ adox r13, rcx
+ adcx r15, rsi
+ adox r14, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox r15, rsi
+ xor rdi, rdi
+ adcx r12, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx rcx, rdx, QWORD PTR [rsp+136]
+ adcx r13, rbx
+ adox r12, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox r13, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r14, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rcx, rdx, QWORD PTR [rsp+144]
+ adcx r15, rbx
+ adox r14, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox r15, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+152]
+ adox rdi, rsi
+ adcx rdi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rsi, rbx
+ xor rbx, rbx
+ adcx r13, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+152]
+ adcx r14, rcx
+ mulx rcx, rdx, QWORD PTR [rsp+96]
+ adox r13, rdx
+ adox r14, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+152]
+ mulx rcx, rdx, QWORD PTR [rsp+112]
+ adcx r15, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adcx rdi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+144]
+ adcx rsi, rbx
+ adox r15, rcx
+ adox rdi, rdx
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rcx, rsi, rsi
+ add r13, rsi
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r13, 1
+ imul rcx, rcx, 19
+ and r13, rbx
+ xor rbx, rbx
+ adox r10, rcx
+ mulx r14, rcx, r14
+ adcx r10, rcx
+ adox r11, r14
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ adcx r13, rbx
+ ; Store
+ mov QWORD PTR [rsp], r10
+ mov QWORD PTR [rsp+8], r11
+ mov QWORD PTR [rsp+16], r12
+ mov QWORD PTR [rsp+24], r13
+ dec rbp
+ jge L_curve25519_base_avx2_last_3
+ ; Invert
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ mov rdx, rsp
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 4
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 9
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+128]
+ mov r8, 19
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+128]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 9
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 49
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+128]
+ mov r8, 99
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+128]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 49
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 4
+ call fe_sq_n_avx2
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ mov r8, QWORD PTR [rsp+160]
+ mov rax, QWORD PTR [r8]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp]
+ mulx r11, r10, rax
+ ; A[2] * B[0]
+ mulx r13, r12, QWORD PTR [r8+16]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [r8+8]
+ xor rsi, rsi
+ adcx r11, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+8]
+ mulx r15, r14, QWORD PTR [r8+24]
+ adcx r12, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r11, rcx
+ ; A[2] * B[1]
+ mulx rdi, rcx, QWORD PTR [r8+16]
+ adox r12, rbx
+ adcx r13, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+16]
+ mulx rbx, rcx, QWORD PTR [r8+8]
+ adcx r14, rdi
+ adox r13, rcx
+ adcx r15, rsi
+ adox r14, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox r15, rsi
+ xor rdi, rdi
+ adcx r12, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+8]
+ mulx rcx, rdx, QWORD PTR [r8+8]
+ adcx r13, rbx
+ adox r12, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+24]
+ adox r13, rcx
+ mulx rbx, rcx, QWORD PTR [r8+8]
+ adcx r14, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+16]
+ mulx rcx, rdx, QWORD PTR [r8+16]
+ adcx r15, rbx
+ adox r14, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+24]
+ adox r15, rcx
+ mulx rbx, rcx, QWORD PTR [r8+24]
+ adox rdi, rsi
+ adcx rdi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rsi, rbx
+ xor rbx, rbx
+ adcx r13, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [r8+24]
+ adcx r14, rcx
+ mulx rcx, rdx, QWORD PTR [rsp]
+ adox r13, rdx
+ adox r14, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [r8+24]
+ mulx rcx, rdx, QWORD PTR [rsp+16]
+ adcx r15, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+24]
+ adcx rdi, rcx
+ mulx rdx, rcx, QWORD PTR [r8+16]
+ adcx rsi, rbx
+ adox r15, rcx
+ adox rdi, rdx
+ adox rsi, rbx
+ mov rdx, 38
+ mulx rcx, rsi, rsi
+ add r13, rsi
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r13, 1
+ imul rcx, rcx, 19
+ and r13, rbx
+ xor rbx, rbx
+ adox r10, rcx
+ mulx r14, rcx, r14
+ adcx r10, rcx
+ adox r11, r14
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ adcx r13, rbx
+ mov rbx, 9223372036854775807
+ mov rdx, r13
+ sar rdx, 63
+ and rdx, 19
+ and r13, rbx
+ add r10, rdx
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ mov rcx, 9223372036854775807
+ mov rdx, r10
+ add rdx, 19
+ mov rdx, r11
+ adc rdx, 0
+ mov rdx, r12
+ adc rdx, 0
+ mov rdx, r13
+ adc rdx, 0
+ sar rdx, 63
+ and rdx, 19
+ and r13, rcx
+ add r10, rdx
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ and r13, rcx
+ ; Store
+ mov QWORD PTR [r8], r10
+ mov QWORD PTR [r8+8], r11
+ mov QWORD PTR [r8+16], r12
+ mov QWORD PTR [r8+24], r13
+ xor rax, rax
+ add rsp, 176
+ pop rbp
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+curve25519_base_avx2 ENDP
+_TEXT ENDS
+ENDIF
+_TEXT SEGMENT READONLY PARA
+curve25519_avx2 PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ mov r9, rcx
+ mov r10, rdx
+ sub rsp, 184
+ mov QWORD PTR [rsp+176], 0
+ mov QWORD PTR [rsp+168], r9
+ ; Set one
+ mov QWORD PTR [r9], 1
+ mov QWORD PTR [r9+8], 0
+ mov QWORD PTR [r9+16], 0
+ mov QWORD PTR [r9+24], 0
+ ; Set zero
+ mov QWORD PTR [rsp], 0
+ mov QWORD PTR [rsp+8], 0
+ mov QWORD PTR [rsp+16], 0
+ mov QWORD PTR [rsp+24], 0
+ ; Set one
+ mov QWORD PTR [rsp+32], 1
+ mov QWORD PTR [rsp+40], 0
+ mov QWORD PTR [rsp+48], 0
+ mov QWORD PTR [rsp+56], 0
+ ; Copy
+ mov r11, QWORD PTR [r8]
+ mov r12, QWORD PTR [r8+8]
+ mov r13, QWORD PTR [r8+16]
+ mov r14, QWORD PTR [r8+24]
+ mov QWORD PTR [rsp+64], r11
+ mov QWORD PTR [rsp+72], r12
+ mov QWORD PTR [rsp+80], r13
+ mov QWORD PTR [rsp+88], r14
+ mov rbx, 254
+L_curve25519_avx2_bits:
+ mov QWORD PTR [rsp+160], rbx
+ mov rcx, rbx
+ mov rax, QWORD PTR [rsp+176]
+ and rcx, 63
+ shr rbx, 6
+ mov rbx, QWORD PTR [r10+8*rbx]
+ shr rbx, cl
+ and rbx, 1
+ xor rax, rbx
+ mov QWORD PTR [rsp+176], rbx
+ neg rax
+ ; Conditional Swap
+ mov r11, QWORD PTR [r9]
+ mov r12, QWORD PTR [r9+8]
+ mov r13, QWORD PTR [r9+16]
+ mov r14, QWORD PTR [r9+24]
+ mov r15, QWORD PTR [rsp]
+ mov rdi, QWORD PTR [rsp+8]
+ mov rsi, QWORD PTR [rsp+16]
+ mov rbp, QWORD PTR [rsp+24]
+ xor r11, QWORD PTR [rsp+64]
+ xor r12, QWORD PTR [rsp+72]
+ xor r13, QWORD PTR [rsp+80]
+ xor r14, QWORD PTR [rsp+88]
+ xor r15, QWORD PTR [rsp+32]
+ xor rdi, QWORD PTR [rsp+40]
+ xor rsi, QWORD PTR [rsp+48]
+ xor rbp, QWORD PTR [rsp+56]
+ and r11, rax
+ and r12, rax
+ and r13, rax
+ and r14, rax
+ and r15, rax
+ and rdi, rax
+ and rsi, rax
+ and rbp, rax
+ xor QWORD PTR [r9], r11
+ xor QWORD PTR [r9+8], r12
+ xor QWORD PTR [r9+16], r13
+ xor QWORD PTR [r9+24], r14
+ xor QWORD PTR [rsp], r15
+ xor QWORD PTR [rsp+8], rdi
+ xor QWORD PTR [rsp+16], rsi
+ xor QWORD PTR [rsp+24], rbp
+ xor QWORD PTR [rsp+64], r11
+ xor QWORD PTR [rsp+72], r12
+ xor QWORD PTR [rsp+80], r13
+ xor QWORD PTR [rsp+88], r14
+ xor QWORD PTR [rsp+32], r15
+ xor QWORD PTR [rsp+40], rdi
+ xor QWORD PTR [rsp+48], rsi
+ xor QWORD PTR [rsp+56], rbp
+ ; Add-Sub
+ ; Add
+ mov r11, QWORD PTR [r9]
+ mov r12, QWORD PTR [r9+8]
+ mov r13, QWORD PTR [r9+16]
+ mov r14, QWORD PTR [r9+24]
+ mov r15, r11
+ add r11, QWORD PTR [rsp]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+8]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+16]
+ mov rbp, r14
+ adc r14, QWORD PTR [rsp+24]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r14, 1
+ imul rbx, 19
+ btr r14, 63
+ ; Sub modulus (if overflow)
+ add r11, rbx
+ adc r12, 0
+ adc r13, 0
+ adc r14, 0
+ ; Sub
+ sub r15, QWORD PTR [rsp]
+ sbb rdi, QWORD PTR [rsp+8]
+ sbb rsi, QWORD PTR [rsp+16]
+ sbb rbp, QWORD PTR [rsp+24]
+ sbb rbx, rbx
+ shld rbx, rbp, 1
+ imul rbx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub r15, rbx
+ sbb rdi, 0
+ sbb rsi, 0
+ sbb rbp, 0
+ mov QWORD PTR [r9], r11
+ mov QWORD PTR [r9+8], r12
+ mov QWORD PTR [r9+16], r13
+ mov QWORD PTR [r9+24], r14
+ mov QWORD PTR [rsp+128], r15
+ mov QWORD PTR [rsp+136], rdi
+ mov QWORD PTR [rsp+144], rsi
+ mov QWORD PTR [rsp+152], rbp
+ ; Add-Sub
+ ; Add
+ mov r11, QWORD PTR [rsp+64]
+ mov r12, QWORD PTR [rsp+72]
+ mov r13, QWORD PTR [rsp+80]
+ mov r14, QWORD PTR [rsp+88]
+ mov r15, r11
+ add r11, QWORD PTR [rsp+32]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+40]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+48]
+ mov rbp, r14
+ adc r14, QWORD PTR [rsp+56]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r14, 1
+ imul rbx, 19
+ btr r14, 63
+ ; Sub modulus (if overflow)
+ add r11, rbx
+ adc r12, 0
+ adc r13, 0
+ adc r14, 0
+ ; Sub
+ sub r15, QWORD PTR [rsp+32]
+ sbb rdi, QWORD PTR [rsp+40]
+ sbb rsi, QWORD PTR [rsp+48]
+ sbb rbp, QWORD PTR [rsp+56]
+ sbb rbx, rbx
+ shld rbx, rbp, 1
+ imul rbx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub r15, rbx
+ sbb rdi, 0
+ sbb rsi, 0
+ sbb rbp, 0
+ mov QWORD PTR [rsp+32], r11
+ mov QWORD PTR [rsp+40], r12
+ mov QWORD PTR [rsp+48], r13
+ mov QWORD PTR [rsp+56], r14
+ mov QWORD PTR [rsp+96], r15
+ mov QWORD PTR [rsp+104], rdi
+ mov QWORD PTR [rsp+112], rsi
+ mov QWORD PTR [rsp+120], rbp
+ mov rax, QWORD PTR [rsp+32]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp+128]
+ mulx r12, r11, rax
+ ; A[2] * B[0]
+ mulx r14, r13, QWORD PTR [rsp+48]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+40]
+ xor rbp, rbp
+ adcx r12, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+136]
+ mulx rdi, r15, QWORD PTR [rsp+56]
+ adcx r13, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r12, rcx
+ ; A[2] * B[1]
+ mulx rsi, rcx, QWORD PTR [rsp+48]
+ adox r13, rbx
+ adcx r14, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rbx, rcx, QWORD PTR [rsp+40]
+ adcx r15, rsi
+ adox r14, rcx
+ adcx rdi, rbp
+ adox r15, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox rdi, rbp
+ xor rsi, rsi
+ adcx r13, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+136]
+ mulx rcx, rdx, QWORD PTR [rsp+40]
+ adcx r14, rbx
+ adox r13, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox r14, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+40]
+ adcx r15, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rcx, rdx, QWORD PTR [rsp+48]
+ adcx rdi, rbx
+ adox r15, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox rdi, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+56]
+ adox rsi, rbp
+ adcx rsi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rbp, rbx
+ xor rbx, rbx
+ adcx r14, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+56]
+ adcx r15, rcx
+ mulx rcx, rdx, QWORD PTR [rsp+128]
+ adox r14, rdx
+ adox r15, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+56]
+ mulx rcx, rdx, QWORD PTR [rsp+144]
+ adcx rdi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adcx rsi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+48]
+ adcx rbp, rbx
+ adox rdi, rcx
+ adox rsi, rdx
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rcx, rbp, rbp
+ add r14, rbp
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r14, 1
+ imul rcx, rcx, 19
+ and r14, rbx
+ xor rbx, rbx
+ adox r11, rcx
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ mulx rsi, rcx, rsi
+ adcx r13, rcx
+ adox r14, rsi
+ adcx r14, rbx
+ ; Store
+ mov QWORD PTR [rsp+32], r11
+ mov QWORD PTR [rsp+40], r12
+ mov QWORD PTR [rsp+48], r13
+ mov QWORD PTR [rsp+56], r14
+ mov rax, QWORD PTR [rsp+96]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r9]
+ mulx r12, r11, rax
+ ; A[2] * B[0]
+ mulx r14, r13, QWORD PTR [rsp+112]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ xor rbp, rbp
+ adcx r12, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r9+8]
+ mulx rdi, r15, QWORD PTR [rsp+120]
+ adcx r13, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r12, rcx
+ ; A[2] * B[1]
+ mulx rsi, rcx, QWORD PTR [rsp+112]
+ adox r13, rbx
+ adcx r14, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r9+16]
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ adcx r15, rsi
+ adox r14, rcx
+ adcx rdi, rbp
+ adox r15, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox rdi, rbp
+ xor rsi, rsi
+ adcx r13, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r9+8]
+ mulx rcx, rdx, QWORD PTR [rsp+104]
+ adcx r14, rbx
+ adox r13, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r9+24]
+ adox r14, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ adcx r15, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r9+16]
+ mulx rcx, rdx, QWORD PTR [rsp+112]
+ adcx rdi, rbx
+ adox r15, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r9+24]
+ adox rdi, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+120]
+ adox rsi, rbp
+ adcx rsi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rbp, rbx
+ xor rbx, rbx
+ adcx r14, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+120]
+ adcx r15, rcx
+ mulx rcx, rdx, QWORD PTR [r9]
+ adox r14, rdx
+ adox r15, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+120]
+ mulx rcx, rdx, QWORD PTR [r9+16]
+ adcx rdi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r9+24]
+ adcx rsi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+112]
+ adcx rbp, rbx
+ adox rdi, rcx
+ adox rsi, rdx
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rcx, rbp, rbp
+ add r14, rbp
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r14, 1
+ imul rcx, rcx, 19
+ and r14, rbx
+ xor rbx, rbx
+ adox r11, rcx
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ mulx rsi, rcx, rsi
+ adcx r13, rcx
+ adox r14, rsi
+ adcx r14, rbx
+ ; Store
+ mov QWORD PTR [rsp], r11
+ mov QWORD PTR [rsp+8], r12
+ mov QWORD PTR [rsp+16], r13
+ mov QWORD PTR [rsp+24], r14
+ ; Square
+ mov rdx, QWORD PTR [rsp+128]
+ mov rax, QWORD PTR [rsp+136]
+ ; A[0] * A[1]
+ mov rbp, rdx
+ mulx r13, r12, rax
+ ; A[0] * A[3]
+ mulx r15, r14, QWORD PTR [rsp+152]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rbx, rcx, rax
+ xor r11, r11
+ adox r14, rcx
+ ; A[2] * A[3]
+ mulx rsi, rdi, QWORD PTR [rsp+152]
+ adox r15, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rbp
+ adox rdi, r11
+ adcx r13, rcx
+ adox rsi, r11
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [rsp+152]
+ adcx r14, rbx
+ adcx r15, rcx
+ adcx rdi, rdx
+ adcx rsi, r11
+ ; A[0] * A[0]
+ mov rdx, rbp
+ mulx rcx, r11, rdx
+ xor rbp, rbp
+ adcx r12, r12
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r12, rcx
+ mulx rbx, rcx, rdx
+ adcx r13, r13
+ adox r13, rcx
+ adcx r14, r14
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rsp+144]
+ adox r14, rbx
+ mulx rcx, rbx, rdx
+ adcx r15, r15
+ adox r15, rbx
+ adcx rdi, rdi
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox rdi, rcx
+ mulx rbx, rcx, rdx
+ adcx rsi, rsi
+ adox rsi, rcx
+ adcx rbp, rbp
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rbx, rbp, rbp
+ add r14, rbp
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r14, 1
+ imul rbx, rbx, 19
+ and r14, rcx
+ xor rcx, rcx
+ adox r11, rbx
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ mulx rsi, rbx, rsi
+ adcx r13, rbx
+ adox r14, rsi
+ adcx r14, rcx
+ ; Store
+ mov QWORD PTR [rsp+96], r11
+ mov QWORD PTR [rsp+104], r12
+ mov QWORD PTR [rsp+112], r13
+ mov QWORD PTR [rsp+120], r14
+ ; Square
+ mov rdx, QWORD PTR [r9]
+ mov rax, QWORD PTR [r9+8]
+ ; A[0] * A[1]
+ mov rbp, rdx
+ mulx r13, r12, rax
+ ; A[0] * A[3]
+ mulx r15, r14, QWORD PTR [r9+24]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [r9+16]
+ mulx rbx, rcx, rax
+ xor r11, r11
+ adox r14, rcx
+ ; A[2] * A[3]
+ mulx rsi, rdi, QWORD PTR [r9+24]
+ adox r15, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rbp
+ adox rdi, r11
+ adcx r13, rcx
+ adox rsi, r11
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [r9+24]
+ adcx r14, rbx
+ adcx r15, rcx
+ adcx rdi, rdx
+ adcx rsi, r11
+ ; A[0] * A[0]
+ mov rdx, rbp
+ mulx rcx, r11, rdx
+ xor rbp, rbp
+ adcx r12, r12
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r12, rcx
+ mulx rbx, rcx, rdx
+ adcx r13, r13
+ adox r13, rcx
+ adcx r14, r14
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [r9+16]
+ adox r14, rbx
+ mulx rcx, rbx, rdx
+ adcx r15, r15
+ adox r15, rbx
+ adcx rdi, rdi
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [r9+24]
+ adox rdi, rcx
+ mulx rbx, rcx, rdx
+ adcx rsi, rsi
+ adox rsi, rcx
+ adcx rbp, rbp
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rbx, rbp, rbp
+ add r14, rbp
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r14, 1
+ imul rbx, rbx, 19
+ and r14, rcx
+ xor rcx, rcx
+ adox r11, rbx
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ mulx rsi, rbx, rsi
+ adcx r13, rbx
+ adox r14, rsi
+ adcx r14, rcx
+ ; Store
+ mov QWORD PTR [rsp+128], r11
+ mov QWORD PTR [rsp+136], r12
+ mov QWORD PTR [rsp+144], r13
+ mov QWORD PTR [rsp+152], r14
+ ; Add-Sub
+ ; Add
+ mov r11, QWORD PTR [rsp]
+ mov r12, QWORD PTR [rsp+8]
+ mov r13, QWORD PTR [rsp+16]
+ mov r14, QWORD PTR [rsp+24]
+ mov r15, r11
+ add r11, QWORD PTR [rsp+32]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+40]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+48]
+ mov rbp, r14
+ adc r14, QWORD PTR [rsp+56]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r14, 1
+ imul rbx, 19
+ btr r14, 63
+ ; Sub modulus (if overflow)
+ add r11, rbx
+ adc r12, 0
+ adc r13, 0
+ adc r14, 0
+ ; Sub
+ sub r15, QWORD PTR [rsp+32]
+ sbb rdi, QWORD PTR [rsp+40]
+ sbb rsi, QWORD PTR [rsp+48]
+ sbb rbp, QWORD PTR [rsp+56]
+ sbb rbx, rbx
+ shld rbx, rbp, 1
+ imul rbx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub r15, rbx
+ sbb rdi, 0
+ sbb rsi, 0
+ sbb rbp, 0
+ mov QWORD PTR [rsp+64], r11
+ mov QWORD PTR [rsp+72], r12
+ mov QWORD PTR [rsp+80], r13
+ mov QWORD PTR [rsp+88], r14
+ mov QWORD PTR [rsp+32], r15
+ mov QWORD PTR [rsp+40], rdi
+ mov QWORD PTR [rsp+48], rsi
+ mov QWORD PTR [rsp+56], rbp
+ mov rax, QWORD PTR [rsp+128]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp+96]
+ mulx r12, r11, rax
+ ; A[2] * B[0]
+ mulx r14, r13, QWORD PTR [rsp+144]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ xor rbp, rbp
+ adcx r12, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx rdi, r15, QWORD PTR [rsp+152]
+ adcx r13, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r12, rcx
+ ; A[2] * B[1]
+ mulx rsi, rcx, QWORD PTR [rsp+144]
+ adox r13, rbx
+ adcx r14, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r15, rsi
+ adox r14, rcx
+ adcx rdi, rbp
+ adox r15, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox rdi, rbp
+ xor rsi, rsi
+ adcx r13, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx rcx, rdx, QWORD PTR [rsp+136]
+ adcx r14, rbx
+ adox r13, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox r14, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r15, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rcx, rdx, QWORD PTR [rsp+144]
+ adcx rdi, rbx
+ adox r15, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox rdi, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+152]
+ adox rsi, rbp
+ adcx rsi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rbp, rbx
+ xor rbx, rbx
+ adcx r14, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+152]
+ adcx r15, rcx
+ mulx rcx, rdx, QWORD PTR [rsp+96]
+ adox r14, rdx
+ adox r15, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+152]
+ mulx rcx, rdx, QWORD PTR [rsp+112]
+ adcx rdi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adcx rsi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+144]
+ adcx rbp, rbx
+ adox rdi, rcx
+ adox rsi, rdx
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rcx, rbp, rbp
+ add r14, rbp
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r14, 1
+ imul rcx, rcx, 19
+ and r14, rbx
+ xor rbx, rbx
+ adox r11, rcx
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ mulx rsi, rcx, rsi
+ adcx r13, rcx
+ adox r14, rsi
+ adcx r14, rbx
+ ; Store
+ mov QWORD PTR [r9], r11
+ mov QWORD PTR [r9+8], r12
+ mov QWORD PTR [r9+16], r13
+ mov QWORD PTR [r9+24], r14
+ ; Sub
+ mov r11, QWORD PTR [rsp+128]
+ mov r12, QWORD PTR [rsp+136]
+ mov r13, QWORD PTR [rsp+144]
+ mov r14, QWORD PTR [rsp+152]
+ sub r11, QWORD PTR [rsp+96]
+ sbb r12, QWORD PTR [rsp+104]
+ sbb r13, QWORD PTR [rsp+112]
+ sbb r14, QWORD PTR [rsp+120]
+ sbb rbx, rbx
+ shld rbx, r14, 1
+ imul rbx, -19
+ btr r14, 63
+ ; Add modulus (if underflow)
+ sub r11, rbx
+ sbb r12, 0
+ sbb r13, 0
+ sbb r14, 0
+ mov QWORD PTR [rsp+128], r11
+ mov QWORD PTR [rsp+136], r12
+ mov QWORD PTR [rsp+144], r13
+ mov QWORD PTR [rsp+152], r14
+ ; Square
+ mov rdx, QWORD PTR [rsp+32]
+ mov rax, QWORD PTR [rsp+40]
+ ; A[0] * A[1]
+ mov rbp, rdx
+ mulx r13, r12, rax
+ ; A[0] * A[3]
+ mulx r15, r14, QWORD PTR [rsp+56]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rsp+48]
+ mulx rbx, rcx, rax
+ xor r11, r11
+ adox r14, rcx
+ ; A[2] * A[3]
+ mulx rsi, rdi, QWORD PTR [rsp+56]
+ adox r15, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rbp
+ adox rdi, r11
+ adcx r13, rcx
+ adox rsi, r11
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [rsp+56]
+ adcx r14, rbx
+ adcx r15, rcx
+ adcx rdi, rdx
+ adcx rsi, r11
+ ; A[0] * A[0]
+ mov rdx, rbp
+ mulx rcx, r11, rdx
+ xor rbp, rbp
+ adcx r12, r12
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r12, rcx
+ mulx rbx, rcx, rdx
+ adcx r13, r13
+ adox r13, rcx
+ adcx r14, r14
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rsp+48]
+ adox r14, rbx
+ mulx rcx, rbx, rdx
+ adcx r15, r15
+ adox r15, rbx
+ adcx rdi, rdi
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rsp+56]
+ adox rdi, rcx
+ mulx rbx, rcx, rdx
+ adcx rsi, rsi
+ adox rsi, rcx
+ adcx rbp, rbp
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rbx, rbp, rbp
+ add r14, rbp
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r14, 1
+ imul rbx, rbx, 19
+ and r14, rcx
+ xor rcx, rcx
+ adox r11, rbx
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ mulx rsi, rbx, rsi
+ adcx r13, rbx
+ adox r14, rsi
+ adcx r14, rcx
+ ; Store
+ mov QWORD PTR [rsp+32], r11
+ mov QWORD PTR [rsp+40], r12
+ mov QWORD PTR [rsp+48], r13
+ mov QWORD PTR [rsp+56], r14
+ ; Square
+ mov rdx, QWORD PTR [rsp+64]
+ mov rax, QWORD PTR [rsp+72]
+ ; A[0] * A[1]
+ mov rbp, rdx
+ mulx r13, r12, rax
+ ; A[0] * A[3]
+ mulx r15, r14, QWORD PTR [rsp+88]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rsp+80]
+ mulx rbx, rcx, rax
+ xor r11, r11
+ adox r14, rcx
+ ; A[2] * A[3]
+ mulx rsi, rdi, QWORD PTR [rsp+88]
+ adox r15, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rbp
+ adox rdi, r11
+ adcx r13, rcx
+ adox rsi, r11
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [rsp+88]
+ adcx r14, rbx
+ adcx r15, rcx
+ adcx rdi, rdx
+ adcx rsi, r11
+ ; A[0] * A[0]
+ mov rdx, rbp
+ mulx rcx, r11, rdx
+ xor rbp, rbp
+ adcx r12, r12
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r12, rcx
+ mulx rbx, rcx, rdx
+ adcx r13, r13
+ adox r13, rcx
+ adcx r14, r14
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rsp+80]
+ adox r14, rbx
+ mulx rcx, rbx, rdx
+ adcx r15, r15
+ adox r15, rbx
+ adcx rdi, rdi
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rsp+88]
+ adox rdi, rcx
+ mulx rbx, rcx, rdx
+ adcx rsi, rsi
+ adox rsi, rcx
+ adcx rbp, rbp
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rbx, rbp, rbp
+ add r14, rbp
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r14, 1
+ imul rbx, rbx, 19
+ and r14, rcx
+ xor rcx, rcx
+ adox r11, rbx
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ mulx rsi, rbx, rsi
+ adcx r13, rbx
+ adox r14, rsi
+ adcx r14, rcx
+ ; Store
+ mov QWORD PTR [rsp+64], r11
+ mov QWORD PTR [rsp+72], r12
+ mov QWORD PTR [rsp+80], r13
+ mov QWORD PTR [rsp+88], r14
+ mov rdx, 121666
+ mulx rbp, r11, QWORD PTR [rsp+128]
+ mulx rsi, r12, QWORD PTR [rsp+136]
+ mulx rdi, r13, QWORD PTR [rsp+144]
+ add r12, rbp
+ mulx r15, r14, QWORD PTR [rsp+152]
+ adc r13, rsi
+ adc r14, rdi
+ adc r15, 0
+ add r11, QWORD PTR [rsp+96]
+ adc r12, QWORD PTR [rsp+104]
+ adc r13, QWORD PTR [rsp+112]
+ adc r14, QWORD PTR [rsp+120]
+ adc r15, 0
+ shld r15, r14, 1
+ btr r14, 63
+ imul r15, r15, 19
+ add r11, r15
+ adc r12, 0
+ adc r13, 0
+ adc r14, 0
+ mov QWORD PTR [rsp+96], r11
+ mov QWORD PTR [rsp+104], r12
+ mov QWORD PTR [rsp+112], r13
+ mov QWORD PTR [rsp+120], r14
+ mov rax, QWORD PTR [r8]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp+32]
+ mulx r12, r11, rax
+ ; A[2] * B[0]
+ mulx r14, r13, QWORD PTR [r8+16]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [r8+8]
+ xor rbp, rbp
+ adcx r12, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+40]
+ mulx rdi, r15, QWORD PTR [r8+24]
+ adcx r13, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r12, rcx
+ ; A[2] * B[1]
+ mulx rsi, rcx, QWORD PTR [r8+16]
+ adox r13, rbx
+ adcx r14, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+48]
+ mulx rbx, rcx, QWORD PTR [r8+8]
+ adcx r15, rsi
+ adox r14, rcx
+ adcx rdi, rbp
+ adox r15, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox rdi, rbp
+ xor rsi, rsi
+ adcx r13, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+40]
+ mulx rcx, rdx, QWORD PTR [r8+8]
+ adcx r14, rbx
+ adox r13, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+56]
+ adox r14, rcx
+ mulx rbx, rcx, QWORD PTR [r8+8]
+ adcx r15, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+48]
+ mulx rcx, rdx, QWORD PTR [r8+16]
+ adcx rdi, rbx
+ adox r15, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+56]
+ adox rdi, rcx
+ mulx rbx, rcx, QWORD PTR [r8+24]
+ adox rsi, rbp
+ adcx rsi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rbp, rbx
+ xor rbx, rbx
+ adcx r14, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [r8+24]
+ adcx r15, rcx
+ mulx rcx, rdx, QWORD PTR [rsp+32]
+ adox r14, rdx
+ adox r15, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [r8+24]
+ mulx rcx, rdx, QWORD PTR [rsp+48]
+ adcx rdi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+56]
+ adcx rsi, rcx
+ mulx rdx, rcx, QWORD PTR [r8+16]
+ adcx rbp, rbx
+ adox rdi, rcx
+ adox rsi, rdx
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rcx, rbp, rbp
+ add r14, rbp
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r14, 1
+ imul rcx, rcx, 19
+ and r14, rbx
+ xor rbx, rbx
+ adox r11, rcx
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ mulx rsi, rcx, rsi
+ adcx r13, rcx
+ adox r14, rsi
+ adcx r14, rbx
+ ; Store
+ mov QWORD PTR [rsp+32], r11
+ mov QWORD PTR [rsp+40], r12
+ mov QWORD PTR [rsp+48], r13
+ mov QWORD PTR [rsp+56], r14
+ mov rax, QWORD PTR [rsp+96]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp+128]
+ mulx r12, r11, rax
+ ; A[2] * B[0]
+ mulx r14, r13, QWORD PTR [rsp+112]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ xor rbp, rbp
+ adcx r12, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+136]
+ mulx rdi, r15, QWORD PTR [rsp+120]
+ adcx r13, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r12, rcx
+ ; A[2] * B[1]
+ mulx rsi, rcx, QWORD PTR [rsp+112]
+ adox r13, rbx
+ adcx r14, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ adcx r15, rsi
+ adox r14, rcx
+ adcx rdi, rbp
+ adox r15, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox rdi, rbp
+ xor rsi, rsi
+ adcx r13, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+136]
+ mulx rcx, rdx, QWORD PTR [rsp+104]
+ adcx r14, rbx
+ adox r13, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox r14, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ adcx r15, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rcx, rdx, QWORD PTR [rsp+112]
+ adcx rdi, rbx
+ adox r15, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox rdi, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+120]
+ adox rsi, rbp
+ adcx rsi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rbp, rbx
+ xor rbx, rbx
+ adcx r14, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+120]
+ adcx r15, rcx
+ mulx rcx, rdx, QWORD PTR [rsp+128]
+ adox r14, rdx
+ adox r15, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+120]
+ mulx rcx, rdx, QWORD PTR [rsp+144]
+ adcx rdi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adcx rsi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+112]
+ adcx rbp, rbx
+ adox rdi, rcx
+ adox rsi, rdx
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rcx, rbp, rbp
+ add r14, rbp
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r14, 1
+ imul rcx, rcx, 19
+ and r14, rbx
+ xor rbx, rbx
+ adox r11, rcx
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ mulx rsi, rcx, rsi
+ adcx r13, rcx
+ adox r14, rsi
+ adcx r14, rbx
+ ; Store
+ mov QWORD PTR [rsp], r11
+ mov QWORD PTR [rsp+8], r12
+ mov QWORD PTR [rsp+16], r13
+ mov QWORD PTR [rsp+24], r14
+ mov rbx, QWORD PTR [rsp+160]
+ dec rbx
+ cmp rbx, 3
+ jge L_curve25519_avx2_bits
+ mov QWORD PTR [rsp+160], 2
+ mov rax, QWORD PTR [rsp+176]
+ neg rax
+ ; Conditional Swap
+ mov r11, QWORD PTR [r9]
+ mov r12, QWORD PTR [r9+8]
+ mov r13, QWORD PTR [r9+16]
+ mov r14, QWORD PTR [r9+24]
+ mov r15, QWORD PTR [rsp]
+ mov rdi, QWORD PTR [rsp+8]
+ mov rsi, QWORD PTR [rsp+16]
+ mov rbp, QWORD PTR [rsp+24]
+ xor r11, QWORD PTR [rsp+64]
+ xor r12, QWORD PTR [rsp+72]
+ xor r13, QWORD PTR [rsp+80]
+ xor r14, QWORD PTR [rsp+88]
+ xor r15, QWORD PTR [rsp+32]
+ xor rdi, QWORD PTR [rsp+40]
+ xor rsi, QWORD PTR [rsp+48]
+ xor rbp, QWORD PTR [rsp+56]
+ and r11, rax
+ and r12, rax
+ and r13, rax
+ and r14, rax
+ and r15, rax
+ and rdi, rax
+ and rsi, rax
+ and rbp, rax
+ xor QWORD PTR [r9], r11
+ xor QWORD PTR [r9+8], r12
+ xor QWORD PTR [r9+16], r13
+ xor QWORD PTR [r9+24], r14
+ xor QWORD PTR [rsp], r15
+ xor QWORD PTR [rsp+8], rdi
+ xor QWORD PTR [rsp+16], rsi
+ xor QWORD PTR [rsp+24], rbp
+ xor QWORD PTR [rsp+64], r11
+ xor QWORD PTR [rsp+72], r12
+ xor QWORD PTR [rsp+80], r13
+ xor QWORD PTR [rsp+88], r14
+ xor QWORD PTR [rsp+32], r15
+ xor QWORD PTR [rsp+40], rdi
+ xor QWORD PTR [rsp+48], rsi
+ xor QWORD PTR [rsp+56], rbp
+L_curve25519_avx2_last_3:
+ ; Add-Sub
+ ; Add
+ mov r11, QWORD PTR [r9]
+ mov r12, QWORD PTR [r9+8]
+ mov r13, QWORD PTR [r9+16]
+ mov r14, QWORD PTR [r9+24]
+ mov r15, r11
+ add r11, QWORD PTR [rsp]
+ mov rdi, r12
+ adc r12, QWORD PTR [rsp+8]
+ mov rsi, r13
+ adc r13, QWORD PTR [rsp+16]
+ mov rbp, r14
+ adc r14, QWORD PTR [rsp+24]
+ mov rbx, 0
+ adc rbx, 0
+ shld rbx, r14, 1
+ imul rbx, 19
+ btr r14, 63
+ ; Sub modulus (if overflow)
+ add r11, rbx
+ adc r12, 0
+ adc r13, 0
+ adc r14, 0
+ ; Sub
+ sub r15, QWORD PTR [rsp]
+ sbb rdi, QWORD PTR [rsp+8]
+ sbb rsi, QWORD PTR [rsp+16]
+ sbb rbp, QWORD PTR [rsp+24]
+ sbb rbx, rbx
+ shld rbx, rbp, 1
+ imul rbx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub r15, rbx
+ sbb rdi, 0
+ sbb rsi, 0
+ sbb rbp, 0
+ mov QWORD PTR [r9], r11
+ mov QWORD PTR [r9+8], r12
+ mov QWORD PTR [r9+16], r13
+ mov QWORD PTR [r9+24], r14
+ mov QWORD PTR [rsp+128], r15
+ mov QWORD PTR [rsp+136], rdi
+ mov QWORD PTR [rsp+144], rsi
+ mov QWORD PTR [rsp+152], rbp
+ ; Square
+ mov rdx, QWORD PTR [rsp+128]
+ mov rax, QWORD PTR [rsp+136]
+ ; A[0] * A[1]
+ mov rbp, rdx
+ mulx r13, r12, rax
+ ; A[0] * A[3]
+ mulx r15, r14, QWORD PTR [rsp+152]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rbx, rcx, rax
+ xor r11, r11
+ adox r14, rcx
+ ; A[2] * A[3]
+ mulx rsi, rdi, QWORD PTR [rsp+152]
+ adox r15, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rbp
+ adox rdi, r11
+ adcx r13, rcx
+ adox rsi, r11
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [rsp+152]
+ adcx r14, rbx
+ adcx r15, rcx
+ adcx rdi, rdx
+ adcx rsi, r11
+ ; A[0] * A[0]
+ mov rdx, rbp
+ mulx rcx, r11, rdx
+ xor rbp, rbp
+ adcx r12, r12
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r12, rcx
+ mulx rbx, rcx, rdx
+ adcx r13, r13
+ adox r13, rcx
+ adcx r14, r14
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rsp+144]
+ adox r14, rbx
+ mulx rcx, rbx, rdx
+ adcx r15, r15
+ adox r15, rbx
+ adcx rdi, rdi
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox rdi, rcx
+ mulx rbx, rcx, rdx
+ adcx rsi, rsi
+ adox rsi, rcx
+ adcx rbp, rbp
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rbx, rbp, rbp
+ add r14, rbp
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r14, 1
+ imul rbx, rbx, 19
+ and r14, rcx
+ xor rcx, rcx
+ adox r11, rbx
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ mulx rsi, rbx, rsi
+ adcx r13, rbx
+ adox r14, rsi
+ adcx r14, rcx
+ ; Store
+ mov QWORD PTR [rsp+96], r11
+ mov QWORD PTR [rsp+104], r12
+ mov QWORD PTR [rsp+112], r13
+ mov QWORD PTR [rsp+120], r14
+ ; Square
+ mov rdx, QWORD PTR [r9]
+ mov rax, QWORD PTR [r9+8]
+ ; A[0] * A[1]
+ mov rbp, rdx
+ mulx r13, r12, rax
+ ; A[0] * A[3]
+ mulx r15, r14, QWORD PTR [r9+24]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [r9+16]
+ mulx rbx, rcx, rax
+ xor r11, r11
+ adox r14, rcx
+ ; A[2] * A[3]
+ mulx rsi, rdi, QWORD PTR [r9+24]
+ adox r15, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, rbp
+ adox rdi, r11
+ adcx r13, rcx
+ adox rsi, r11
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [r9+24]
+ adcx r14, rbx
+ adcx r15, rcx
+ adcx rdi, rdx
+ adcx rsi, r11
+ ; A[0] * A[0]
+ mov rdx, rbp
+ mulx rcx, r11, rdx
+ xor rbp, rbp
+ adcx r12, r12
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r12, rcx
+ mulx rbx, rcx, rdx
+ adcx r13, r13
+ adox r13, rcx
+ adcx r14, r14
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [r9+16]
+ adox r14, rbx
+ mulx rcx, rbx, rdx
+ adcx r15, r15
+ adox r15, rbx
+ adcx rdi, rdi
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [r9+24]
+ adox rdi, rcx
+ mulx rbx, rcx, rdx
+ adcx rsi, rsi
+ adox rsi, rcx
+ adcx rbp, rbp
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rbx, rbp, rbp
+ add r14, rbp
+ adc rbx, 0
+ mov rcx, 9223372036854775807
+ shld rbx, r14, 1
+ imul rbx, rbx, 19
+ and r14, rcx
+ xor rcx, rcx
+ adox r11, rbx
+ mulx r15, rbx, r15
+ adcx r11, rbx
+ adox r12, r15
+ mulx rdi, rbx, rdi
+ adcx r12, rbx
+ adox r13, rdi
+ mulx rsi, rbx, rsi
+ adcx r13, rbx
+ adox r14, rsi
+ adcx r14, rcx
+ ; Store
+ mov QWORD PTR [rsp+128], r11
+ mov QWORD PTR [rsp+136], r12
+ mov QWORD PTR [rsp+144], r13
+ mov QWORD PTR [rsp+152], r14
+ mov rax, QWORD PTR [rsp+128]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp+96]
+ mulx r12, r11, rax
+ ; A[2] * B[0]
+ mulx r14, r13, QWORD PTR [rsp+144]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ xor rbp, rbp
+ adcx r12, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx rdi, r15, QWORD PTR [rsp+152]
+ adcx r13, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r12, rcx
+ ; A[2] * B[1]
+ mulx rsi, rcx, QWORD PTR [rsp+144]
+ adox r13, rbx
+ adcx r14, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r15, rsi
+ adox r14, rcx
+ adcx rdi, rbp
+ adox r15, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox rdi, rbp
+ xor rsi, rsi
+ adcx r13, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+104]
+ mulx rcx, rdx, QWORD PTR [rsp+136]
+ adcx r14, rbx
+ adox r13, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox r14, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+136]
+ adcx r15, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+112]
+ mulx rcx, rdx, QWORD PTR [rsp+144]
+ adcx rdi, rbx
+ adox r15, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adox rdi, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+152]
+ adox rsi, rbp
+ adcx rsi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rbp, rbx
+ xor rbx, rbx
+ adcx r14, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+152]
+ adcx r15, rcx
+ mulx rcx, rdx, QWORD PTR [rsp+96]
+ adox r14, rdx
+ adox r15, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+152]
+ mulx rcx, rdx, QWORD PTR [rsp+112]
+ adcx rdi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+120]
+ adcx rsi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+144]
+ adcx rbp, rbx
+ adox rdi, rcx
+ adox rsi, rdx
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rcx, rbp, rbp
+ add r14, rbp
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r14, 1
+ imul rcx, rcx, 19
+ and r14, rbx
+ xor rbx, rbx
+ adox r11, rcx
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ mulx rsi, rcx, rsi
+ adcx r13, rcx
+ adox r14, rsi
+ adcx r14, rbx
+ ; Store
+ mov QWORD PTR [r9], r11
+ mov QWORD PTR [r9+8], r12
+ mov QWORD PTR [r9+16], r13
+ mov QWORD PTR [r9+24], r14
+ ; Sub
+ mov r11, QWORD PTR [rsp+128]
+ mov r12, QWORD PTR [rsp+136]
+ mov r13, QWORD PTR [rsp+144]
+ mov r14, QWORD PTR [rsp+152]
+ sub r11, QWORD PTR [rsp+96]
+ sbb r12, QWORD PTR [rsp+104]
+ sbb r13, QWORD PTR [rsp+112]
+ sbb r14, QWORD PTR [rsp+120]
+ sbb rbx, rbx
+ shld rbx, r14, 1
+ imul rbx, -19
+ btr r14, 63
+ ; Add modulus (if underflow)
+ sub r11, rbx
+ sbb r12, 0
+ sbb r13, 0
+ sbb r14, 0
+ mov QWORD PTR [rsp+128], r11
+ mov QWORD PTR [rsp+136], r12
+ mov QWORD PTR [rsp+144], r13
+ mov QWORD PTR [rsp+152], r14
+ mov rdx, 121666
+ mulx rbp, r11, QWORD PTR [rsp+128]
+ mulx rsi, r12, QWORD PTR [rsp+136]
+ mulx rdi, r13, QWORD PTR [rsp+144]
+ add r12, rbp
+ mulx r15, r14, QWORD PTR [rsp+152]
+ adc r13, rsi
+ adc r14, rdi
+ adc r15, 0
+ add r11, QWORD PTR [rsp+96]
+ adc r12, QWORD PTR [rsp+104]
+ adc r13, QWORD PTR [rsp+112]
+ adc r14, QWORD PTR [rsp+120]
+ adc r15, 0
+ shld r15, r14, 1
+ btr r14, 63
+ imul r15, r15, 19
+ add r11, r15
+ adc r12, 0
+ adc r13, 0
+ adc r14, 0
+ mov QWORD PTR [rsp+96], r11
+ mov QWORD PTR [rsp+104], r12
+ mov QWORD PTR [rsp+112], r13
+ mov QWORD PTR [rsp+120], r14
+ mov rax, QWORD PTR [rsp+96]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp+128]
+ mulx r12, r11, rax
+ ; A[2] * B[0]
+ mulx r14, r13, QWORD PTR [rsp+112]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ xor rbp, rbp
+ adcx r12, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+136]
+ mulx rdi, r15, QWORD PTR [rsp+120]
+ adcx r13, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r12, rcx
+ ; A[2] * B[1]
+ mulx rsi, rcx, QWORD PTR [rsp+112]
+ adox r13, rbx
+ adcx r14, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ adcx r15, rsi
+ adox r14, rcx
+ adcx rdi, rbp
+ adox r15, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox rdi, rbp
+ xor rsi, rsi
+ adcx r13, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+136]
+ mulx rcx, rdx, QWORD PTR [rsp+104]
+ adcx r14, rbx
+ adox r13, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox r14, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+104]
+ adcx r15, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+144]
+ mulx rcx, rdx, QWORD PTR [rsp+112]
+ adcx rdi, rbx
+ adox r15, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adox rdi, rcx
+ mulx rbx, rcx, QWORD PTR [rsp+120]
+ adox rsi, rbp
+ adcx rsi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rbp, rbx
+ xor rbx, rbx
+ adcx r14, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rsp+120]
+ adcx r15, rcx
+ mulx rcx, rdx, QWORD PTR [rsp+128]
+ adox r14, rdx
+ adox r15, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rsp+120]
+ mulx rcx, rdx, QWORD PTR [rsp+144]
+ adcx rdi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+152]
+ adcx rsi, rcx
+ mulx rdx, rcx, QWORD PTR [rsp+112]
+ adcx rbp, rbx
+ adox rdi, rcx
+ adox rsi, rdx
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rcx, rbp, rbp
+ add r14, rbp
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r14, 1
+ imul rcx, rcx, 19
+ and r14, rbx
+ xor rbx, rbx
+ adox r11, rcx
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ mulx rsi, rcx, rsi
+ adcx r13, rcx
+ adox r14, rsi
+ adcx r14, rbx
+ ; Store
+ mov QWORD PTR [rsp], r11
+ mov QWORD PTR [rsp+8], r12
+ mov QWORD PTR [rsp+16], r13
+ mov QWORD PTR [rsp+24], r14
+ dec QWORD PTR [rsp+160]
+ jge L_curve25519_avx2_last_3
+ ; Invert
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ mov rdx, rsp
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 4
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 9
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+128]
+ mov r8, 19
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+128]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 9
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 49
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+128]
+ mov r8, 99
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+128]
+ lea r8, QWORD PTR [rsp+96]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 49
+ call fe_sq_n_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 4
+ call fe_sq_n_avx2
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ mov r9, QWORD PTR [rsp+168]
+ mov rax, QWORD PTR [r9]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [rsp]
+ mulx r12, r11, rax
+ ; A[2] * B[0]
+ mulx r14, r13, QWORD PTR [r9+16]
+ ; A[1] * B[0]
+ mulx rbx, rcx, QWORD PTR [r9+8]
+ xor rbp, rbp
+ adcx r12, rcx
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [rsp+8]
+ mulx rdi, r15, QWORD PTR [r9+24]
+ adcx r13, rbx
+ ; A[0] * B[1]
+ mulx rbx, rcx, rax
+ adox r12, rcx
+ ; A[2] * B[1]
+ mulx rsi, rcx, QWORD PTR [r9+16]
+ adox r13, rbx
+ adcx r14, rcx
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [rsp+16]
+ mulx rbx, rcx, QWORD PTR [r9+8]
+ adcx r15, rsi
+ adox r14, rcx
+ adcx rdi, rbp
+ adox r15, rbx
+ ; A[0] * B[2]
+ mulx rbx, rcx, rax
+ adox rdi, rbp
+ xor rsi, rsi
+ adcx r13, rcx
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [rsp+8]
+ mulx rcx, rdx, QWORD PTR [r9+8]
+ adcx r14, rbx
+ adox r13, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [rsp+24]
+ adox r14, rcx
+ mulx rbx, rcx, QWORD PTR [r9+8]
+ adcx r15, rcx
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [rsp+16]
+ mulx rcx, rdx, QWORD PTR [r9+16]
+ adcx rdi, rbx
+ adox r15, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [rsp+24]
+ adox rdi, rcx
+ mulx rbx, rcx, QWORD PTR [r9+24]
+ adox rsi, rbp
+ adcx rsi, rcx
+ ; A[0] * B[3]
+ mulx rcx, rdx, rax
+ adcx rbp, rbx
+ xor rbx, rbx
+ adcx r14, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [r9+24]
+ adcx r15, rcx
+ mulx rcx, rdx, QWORD PTR [rsp]
+ adox r14, rdx
+ adox r15, rcx
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [r9+24]
+ mulx rcx, rdx, QWORD PTR [rsp+16]
+ adcx rdi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [rsp+24]
+ adcx rsi, rcx
+ mulx rdx, rcx, QWORD PTR [r9+16]
+ adcx rbp, rbx
+ adox rdi, rcx
+ adox rsi, rdx
+ adox rbp, rbx
+ mov rdx, 38
+ mulx rcx, rbp, rbp
+ add r14, rbp
+ adc rcx, 0
+ mov rbx, 9223372036854775807
+ shld rcx, r14, 1
+ imul rcx, rcx, 19
+ and r14, rbx
+ xor rbx, rbx
+ adox r11, rcx
+ mulx r15, rcx, r15
+ adcx r11, rcx
+ adox r12, r15
+ mulx rdi, rcx, rdi
+ adcx r12, rcx
+ adox r13, rdi
+ mulx rsi, rcx, rsi
+ adcx r13, rcx
+ adox r14, rsi
+ adcx r14, rbx
+ mov rbx, 9223372036854775807
+ mov rdx, r14
+ sar rdx, 63
+ and rdx, 19
+ and r14, rbx
+ add r11, rdx
+ adc r12, 0
+ adc r13, 0
+ adc r14, 0
+ mov rcx, 9223372036854775807
+ mov rdx, r11
+ add rdx, 19
+ mov rdx, r12
+ adc rdx, 0
+ mov rdx, r13
+ adc rdx, 0
+ mov rdx, r14
+ adc rdx, 0
+ sar rdx, 63
+ and rdx, 19
+ and r14, rcx
+ add r11, rdx
+ adc r12, 0
+ adc r13, 0
+ adc r14, 0
+ and r14, rcx
+ ; Store
+ mov QWORD PTR [r9], r11
+ mov QWORD PTR [r9+8], r12
+ mov QWORD PTR [r9+16], r13
+ mov QWORD PTR [r9+24], r14
+ xor rax, rax
+ add rsp, 184
+ pop rbp
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+curve25519_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+fe_pow22523_avx2 PROC
+ ; Raises a field element to a fixed power using a chain of calls to
+ ; fe_sq_avx2, fe_sq_n_avx2 and fe_mul_avx2.
+ ; In: rcx = result pointer, rdx = input pointer (both are spilled to the
+ ; stack and reloaded from there, because rcx/rdx/r8 are reused as call
+ ; arguments).
+ ; Stack frame (112 bytes):
+ ; [rsp+0..31] temporary t0
+ ; [rsp+32..63] temporary t1
+ ; [rsp+64..95] temporary t2
+ ; [rsp+96] saved result pointer
+ ; [rsp+104] saved input pointer
+ ; NOTE(review): the exponent annotations below assume fe_sq_avx2(rcx=dst,
+ ; rdx=src) squares once, fe_sq_n_avx2(rcx=dst, rdx=src, r8=n) squares n
+ ; times and fe_mul_avx2(rcx=dst, rdx=a, r8=b) multiplies; those routines
+ ; are defined elsewhere in the file - confirm. Under that assumption the
+ ; result is a^(2^252 - 3), which matches the "pow22523" name.
+ sub rsp, 112
+ ; pow22523
+ mov QWORD PTR [rsp+96], rcx
+ mov QWORD PTR [rsp+104], rdx
+ ; t0 = a^2
+ mov rcx, rsp
+ mov rdx, QWORD PTR [rsp+104]
+ call fe_sq_avx2
+ ; t1 = t0^2 = a^4
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_avx2
+ ; t1 = t1^2 = a^8
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ ; t1 = a * t1 = a^9
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, QWORD PTR [rsp+104]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ ; t0 = t0 * t1 = a^11
+ mov rcx, rsp
+ mov rdx, rsp
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ ; t0 = t0^2 = a^22
+ mov rcx, rsp
+ mov rdx, rsp
+ call fe_sq_avx2
+ ; t0 = t1 * t0 = a^31 = a^(2^5 - 1)
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_avx2
+ ; t1 = t0^(2^5) (one squaring, then four more)
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 4
+ call fe_sq_n_avx2
+ ; t0 = t1 * t0 = a^(2^10 - 1)
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_avx2
+ ; t1 = t0^(2^10) (one squaring, then nine more)
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 9
+ call fe_sq_n_avx2
+ ; t1 = t1 * t0 = a^(2^20 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_avx2
+ ; t2 = t1^(2^20) (one squaring, then nineteen more)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 19
+ call fe_sq_n_avx2
+ ; t1 = t2 * t1 = a^(2^40 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ ; t1 = t1^(2^10) (one squaring, then nine more)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 9
+ call fe_sq_n_avx2
+ ; t0 = t1 * t0 = a^(2^50 - 1)
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_avx2
+ ; t1 = t0^(2^50) (one squaring, then forty-nine more)
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 49
+ call fe_sq_n_avx2
+ ; t1 = t1 * t0 = a^(2^100 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_avx2
+ ; t2 = t1^(2^100) (one squaring, then ninety-nine more)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 99
+ call fe_sq_n_avx2
+ ; t1 = t2 * t1 = a^(2^200 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_avx2
+ ; t1 = t1^(2^50) (one squaring, then forty-nine more)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_avx2
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 49
+ call fe_sq_n_avx2
+ ; t0 = t1 * t0 = a^(2^250 - 1)
+ mov rcx, rsp
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_avx2
+ ; t0 = t0^4 = a^(2^252 - 4) (two squarings)
+ mov rcx, rsp
+ mov rdx, rsp
+ call fe_sq_avx2
+ mov rcx, rsp
+ mov rdx, rsp
+ call fe_sq_avx2
+ ; result = t0 * a = a^(2^252 - 3)
+ mov rcx, QWORD PTR [rsp+96]
+ mov rdx, rsp
+ mov r8, QWORD PTR [rsp+104]
+ call fe_mul_avx2
+ ; Reload the original argument registers before releasing the frame.
+ mov rdx, QWORD PTR [rsp+104]
+ mov rcx, QWORD PTR [rsp+96]
+ add rsp, 112
+ ret
+fe_pow22523_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_p1p1_to_p2_avx2 PROC
+ ; Computes three products of 4-limb (4 x 64-bit) values read from the
+ ; structure at rdx and stores them into the structure at rcx:
+ ; [rcx+0] = [rdx+0] * [rdx+96]
+ ; [rcx+64] = [rdx+64] * [rdx+96]
+ ; [rcx+32] = [rdx+64] * [rdx+32]
+ ; Each 512-bit product is folded back to four limbs with the constants
+ ; 38 and 19, consistent with arithmetic modulo 2^255 - 19.
+ ; NOTE(review): per the procedure name, offsets 0/32/64/96 are presumably
+ ; the X/Y/Z/T field elements of a ge_p1p1 and the destination a ge_p2 -
+ ; confirm against the C declarations.
+ ; In: rcx = destination pointer, rdx = source pointer.
+ ; Saves and restores r12-r15, rdi, rsi, rbx and rbp. Overwrites rax, rcx,
+ ; rdx, r8-r11 and the flags.
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ ; Move the source pointer to rax: rdx is the implicit multiplicand of mulx.
+ mov rax, rdx
+ sub rsp, 16
+ ; Spill both pointers; these two slots are not read back in this procedure.
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], rax
+ ; First product: [rcx+0] = [rax+0] * [rax+96]. r8 = source + 96.
+ lea r8, QWORD PTR [rax+96]
+ mov r11, QWORD PTR [rax]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, r11
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rax+16]
+ ; A[1] * B[0]
+ mulx r10, r9, QWORD PTR [rax+8]
+ ; xor zeroes rbp and clears CF and OF, starting the adcx (CF) and adox (OF)
+ ; carry chains that are interleaved below.
+ xor rbp, rbp
+ adcx r13, r9
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rax+24]
+ adcx r14, r10
+ ; A[0] * B[1]
+ mulx r10, r9, r11
+ adox r13, r9
+ ; A[2] * B[1]
+ mulx rbx, r9, QWORD PTR [rax+16]
+ adox r14, r10
+ adcx r15, r9
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, rbx
+ adox r15, r9
+ adcx rsi, rbp
+ adox rdi, r10
+ ; A[0] * B[2]
+ mulx r10, r9, r11
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r9
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r9, rdx, QWORD PTR [rax+8]
+ adcx r15, r10
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r9
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, r9
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r9, rdx, QWORD PTR [rax+16]
+ adcx rsi, r10
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r9
+ mulx r10, r9, QWORD PTR [rax+24]
+ adox rbx, rbp
+ adcx rbx, r9
+ ; A[0] * B[3]
+ mulx r9, rdx, r11
+ adcx rbp, r10
+ xor r10, r10
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rax+24]
+ adcx rdi, r9
+ mulx r9, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r9
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rax+24]
+ mulx r9, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r9
+ mulx rdx, r9, QWORD PTR [rax+16]
+ adcx rbp, r10
+ adox rsi, r9
+ adox rbx, rdx
+ adox rbp, r10
+ ; Reduce: the product is r12,r13,r14,r15 (low) and rdi,rsi,rbx,rbp (high).
+ ; The top limb rbp is multiplied by 38 and added into r15 first.
+ mov rdx, 38
+ mulx r9, rbp, rbp
+ add r15, rbp
+ adc r9, 0
+ mov r10, 9223372036854775807
+ ; r9 = (r9 << 1) | bit 63 of r15, i.e. everything at or above bit 255;
+ ; that amount times 19 is added to the lowest limb, and bit 63 of r15 is
+ ; cleared.
+ shld r9, r15, 1
+ imul r9, r9, 19
+ and r15, r10
+ ; Clear CF/OF, then add 38 * (rdi, rsi, rbx) into r12..r15 on two carry chains.
+ xor r10, r10
+ adox r12, r9
+ mulx rdi, r9, rdi
+ adcx r12, r9
+ adox r13, rdi
+ mulx rsi, r9, rsi
+ adcx r13, r9
+ adox r14, rsi
+ mulx rbx, r9, rbx
+ adcx r14, r9
+ adox r15, rbx
+ adcx r15, r10
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ ; Second product: advance both pointers by 64 so that
+ ; [dest+64] = [source+64] * [source+96] (r8 still points at source + 96).
+ lea rax, QWORD PTR [rax+64]
+ lea rcx, QWORD PTR [rcx+64]
+ mov r11, QWORD PTR [rax]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, r11
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rax+16]
+ ; A[1] * B[0]
+ mulx r10, r9, QWORD PTR [rax+8]
+ xor rbp, rbp
+ adcx r13, r9
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rax+24]
+ adcx r14, r10
+ ; A[0] * B[1]
+ mulx r10, r9, r11
+ adox r13, r9
+ ; A[2] * B[1]
+ mulx rbx, r9, QWORD PTR [rax+16]
+ adox r14, r10
+ adcx r15, r9
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, rbx
+ adox r15, r9
+ adcx rsi, rbp
+ adox rdi, r10
+ ; A[0] * B[2]
+ mulx r10, r9, r11
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r9
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r9, rdx, QWORD PTR [rax+8]
+ adcx r15, r10
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r9
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, r9
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r9, rdx, QWORD PTR [rax+16]
+ adcx rsi, r10
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r9
+ mulx r10, r9, QWORD PTR [rax+24]
+ adox rbx, rbp
+ adcx rbx, r9
+ ; A[0] * B[3]
+ mulx r9, rdx, r11
+ adcx rbp, r10
+ xor r10, r10
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rax+24]
+ adcx rdi, r9
+ mulx r9, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r9
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rax+24]
+ mulx r9, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r9
+ mulx rdx, r9, QWORD PTR [rax+16]
+ adcx rbp, r10
+ adox rsi, r9
+ adox rbx, rdx
+ adox rbp, r10
+ ; Reduce: same 38/19 folding of rdi,rsi,rbx,rbp into r12..r15 as above.
+ mov rdx, 38
+ mulx r9, rbp, rbp
+ add r15, rbp
+ adc r9, 0
+ mov r10, 9223372036854775807
+ shld r9, r15, 1
+ imul r9, r9, 19
+ and r15, r10
+ xor r10, r10
+ adox r12, r9
+ mulx rdi, r9, rdi
+ adcx r12, r9
+ adox r13, rdi
+ mulx rsi, r9, rsi
+ adcx r13, r9
+ adox r14, rsi
+ mulx rbx, r9, rbx
+ adcx r14, r9
+ adox r15, rbx
+ adcx r15, r10
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ ; Third product: rax stays at source + 64, r8 moves to source + 32 and rcx
+ ; to dest + 32, so [dest+32] = [source+64] * [source+32].
+ lea r8, QWORD PTR [rax+-32]
+ lea rcx, QWORD PTR [rcx+-32]
+ mov r11, QWORD PTR [rax]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, r11
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rax+16]
+ ; A[1] * B[0]
+ mulx r10, r9, QWORD PTR [rax+8]
+ xor rbp, rbp
+ adcx r13, r9
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rax+24]
+ adcx r14, r10
+ ; A[0] * B[1]
+ mulx r10, r9, r11
+ adox r13, r9
+ ; A[2] * B[1]
+ mulx rbx, r9, QWORD PTR [rax+16]
+ adox r14, r10
+ adcx r15, r9
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, rbx
+ adox r15, r9
+ adcx rsi, rbp
+ adox rdi, r10
+ ; A[0] * B[2]
+ mulx r10, r9, r11
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r9
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r9, rdx, QWORD PTR [rax+8]
+ adcx r15, r10
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r9
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, r9
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r9, rdx, QWORD PTR [rax+16]
+ adcx rsi, r10
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r9
+ mulx r10, r9, QWORD PTR [rax+24]
+ adox rbx, rbp
+ adcx rbx, r9
+ ; A[0] * B[3]
+ mulx r9, rdx, r11
+ adcx rbp, r10
+ xor r10, r10
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rax+24]
+ adcx rdi, r9
+ mulx r9, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r9
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rax+24]
+ mulx r9, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r9
+ mulx rdx, r9, QWORD PTR [rax+16]
+ adcx rbp, r10
+ adox rsi, r9
+ adox rbx, rdx
+ adox rbp, r10
+ ; Reduce: same 38/19 folding of rdi,rsi,rbx,rbp into r12..r15 as above.
+ mov rdx, 38
+ mulx r9, rbp, rbp
+ add r15, rbp
+ adc r9, 0
+ mov r10, 9223372036854775807
+ shld r9, r15, 1
+ imul r9, r9, 19
+ and r15, r10
+ xor r10, r10
+ adox r12, r9
+ mulx rdi, r9, rdi
+ adcx r12, r9
+ adox r13, rdi
+ mulx rsi, r9, rsi
+ adcx r13, r9
+ adox r14, rsi
+ mulx rbx, r9, rbx
+ adcx r14, r9
+ adox r15, rbx
+ adcx r15, r10
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ ; Release the 16-byte spill area and restore registers in reverse push order.
+ add rsp, 16
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_p1p1_to_p2_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_p1p1_to_p3_avx2 PROC ; In: rcx = dst, rdx = src. Computes four products of 32-byte (4 x 64-bit limb) blocks of src and stores each, reduced to 4 limbs, into a 32-byte block of dst.
+ push r12 ; save every register this routine overwrites; popped in reverse order before ret
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov rax, rdx ; rax = src base, freeing rdx for use as the implicit mulx operand. NOTE(review): the blocks at +0/+32/+64/+96 are presumably the X/Y/Z/T field elements of the point structs - confirm against the C declarations
+ sub rsp, 16 ; two 8-byte scratch slots
+ mov QWORD PTR [rsp], rcx ; spill dst pointer (never reloaded in this routine)
+ mov QWORD PTR [rsp+8], rax ; spill src pointer (never reloaded in this routine)
+ lea r8, QWORD PTR [rax+96] ; B = src+96
+ mov r11, QWORD PTR [rax] ; r11 = A[0] with A = src+0, kept in a register for reuse
+ ; Multiply: dst+0 = (src+0) * (src+96). The 8-limb product is built in r12,r13,r14,r15,rdi,rsi,rbx,rbp (low to high), then reduced to 4 limbs with the constants 38 and 19 (consistent with arithmetic modulo 2^255 - 19)
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, r11
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rax+16]
+ ; A[1] * B[0]
+ mulx r10, r9, QWORD PTR [rax+8]
+ xor rbp, rbp ; rbp = 0 and CF = OF = 0: starts two independent carry chains (adcx uses CF, adox uses OF)
+ adcx r13, r9
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rax+24]
+ adcx r14, r10
+ ; A[0] * B[1]
+ mulx r10, r9, r11
+ adox r13, r9
+ ; A[2] * B[1]
+ mulx rbx, r9, QWORD PTR [rax+16]
+ adox r14, r10
+ adcx r15, r9
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, rbx
+ adox r15, r9
+ adcx rsi, rbp
+ adox rdi, r10
+ ; A[0] * B[2]
+ mulx r10, r9, r11
+ adox rsi, rbp
+ xor rbx, rbx ; rbx = 0 and both carry flags cleared for the next pair of chains
+ adcx r14, r9
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r9, rdx, QWORD PTR [rax+8]
+ adcx r15, r10
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r9
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, r9
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r9, rdx, QWORD PTR [rax+16]
+ adcx rsi, r10
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r9
+ mulx r10, r9, QWORD PTR [rax+24]
+ adox rbx, rbp
+ adcx rbx, r9
+ ; A[0] * B[3]
+ mulx r9, rdx, r11
+ adcx rbp, r10
+ xor r10, r10 ; r10 = 0 and both carry flags cleared for the last pair of chains
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rax+24]
+ adcx rdi, r9
+ mulx r9, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r9
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rax+24]
+ mulx r9, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r9
+ mulx rdx, r9, QWORD PTR [rax+16]
+ adcx rbp, r10
+ adox rsi, r9
+ adox rbx, rdx
+ adox rbp, r10
+ mov rdx, 38 ; multiplier used to fold limbs 4..7 (rdi,rsi,rbx,rbp) into limbs 0..3 (r12..r15)
+ mulx r9, rbp, rbp ; r9:rbp = limb 7 * 38
+ add r15, rbp
+ adc r9, 0
+ mov r10, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF: keeps bits 0..62 of the top limb
+ shld r9, r15, 1 ; r9 = (r9 << 1) | bit 63 of r15, i.e. everything at or above bit 255
+ imul r9, r9, 19 ; that overflow re-enters at limb 0 with weight 19
+ and r15, r10 ; clear bit 255
+ xor r10, r10 ; r10 = 0; reset CF/OF for the final two add chains
+ adox r12, r9
+ mulx rdi, r9, rdi ; rdi:r9 = limb 4 * 38
+ adcx r12, r9
+ adox r13, rdi
+ mulx rsi, r9, rsi ; rsi:r9 = limb 5 * 38
+ adcx r13, r9
+ adox r14, rsi
+ mulx rbx, r9, rbx ; rbx:r9 = limb 6 * 38
+ adcx r14, r9
+ adox r15, rbx
+ adcx r15, r10
+ ; Store the 4-limb result at dst+0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ lea r8, QWORD PTR [rax+32] ; B = src+32
+ lea rcx, QWORD PTR [rcx+96] ; output = dst+96
+ mov r11, QWORD PTR [rax] ; A is still src+0
+ ; Multiply: dst+96 = (src+0) * (src+32); same multiply/reduce sequence as the first product
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, r11
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rax+16]
+ ; A[1] * B[0]
+ mulx r10, r9, QWORD PTR [rax+8]
+ xor rbp, rbp
+ adcx r13, r9
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rax+24]
+ adcx r14, r10
+ ; A[0] * B[1]
+ mulx r10, r9, r11
+ adox r13, r9
+ ; A[2] * B[1]
+ mulx rbx, r9, QWORD PTR [rax+16]
+ adox r14, r10
+ adcx r15, r9
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, rbx
+ adox r15, r9
+ adcx rsi, rbp
+ adox rdi, r10
+ ; A[0] * B[2]
+ mulx r10, r9, r11
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r9
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r9, rdx, QWORD PTR [rax+8]
+ adcx r15, r10
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r9
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, r9
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r9, rdx, QWORD PTR [rax+16]
+ adcx rsi, r10
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r9
+ mulx r10, r9, QWORD PTR [rax+24]
+ adox rbx, rbp
+ adcx rbx, r9
+ ; A[0] * B[3]
+ mulx r9, rdx, r11
+ adcx rbp, r10
+ xor r10, r10
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rax+24]
+ adcx rdi, r9
+ mulx r9, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r9
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rax+24]
+ mulx r9, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r9
+ mulx rdx, r9, QWORD PTR [rax+16]
+ adcx rbp, r10
+ adox rsi, r9
+ adox rbx, rdx
+ adox rbp, r10
+ mov rdx, 38 ; fold limbs 4..7 into limbs 0..3 (see first product)
+ mulx r9, rbp, rbp
+ add r15, rbp
+ adc r9, 0
+ mov r10, 9223372036854775807
+ shld r9, r15, 1
+ imul r9, r9, 19
+ and r15, r10
+ xor r10, r10
+ adox r12, r9
+ mulx rdi, r9, rdi
+ adcx r12, r9
+ adox r13, rdi
+ mulx rsi, r9, rsi
+ adcx r13, r9
+ adox r14, rsi
+ mulx rbx, r9, rbx
+ adcx r14, r9
+ adox r15, rbx
+ adcx r15, r10
+ ; Store the 4-limb result at dst+96
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ lea rax, QWORD PTR [rax+64] ; A = src+64 (r8 still points at src+32)
+ lea rcx, QWORD PTR [rcx+-64] ; output = dst+32
+ mov r11, QWORD PTR [rax]
+ ; Multiply: dst+32 = (src+64) * (src+32); same multiply/reduce sequence as the first product
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, r11
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rax+16]
+ ; A[1] * B[0]
+ mulx r10, r9, QWORD PTR [rax+8]
+ xor rbp, rbp
+ adcx r13, r9
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rax+24]
+ adcx r14, r10
+ ; A[0] * B[1]
+ mulx r10, r9, r11
+ adox r13, r9
+ ; A[2] * B[1]
+ mulx rbx, r9, QWORD PTR [rax+16]
+ adox r14, r10
+ adcx r15, r9
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, rbx
+ adox r15, r9
+ adcx rsi, rbp
+ adox rdi, r10
+ ; A[0] * B[2]
+ mulx r10, r9, r11
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r9
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r9, rdx, QWORD PTR [rax+8]
+ adcx r15, r10
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r9
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, r9
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r9, rdx, QWORD PTR [rax+16]
+ adcx rsi, r10
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r9
+ mulx r10, r9, QWORD PTR [rax+24]
+ adox rbx, rbp
+ adcx rbx, r9
+ ; A[0] * B[3]
+ mulx r9, rdx, r11
+ adcx rbp, r10
+ xor r10, r10
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rax+24]
+ adcx rdi, r9
+ mulx r9, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r9
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rax+24]
+ mulx r9, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r9
+ mulx rdx, r9, QWORD PTR [rax+16]
+ adcx rbp, r10
+ adox rsi, r9
+ adox rbx, rdx
+ adox rbp, r10
+ mov rdx, 38 ; fold limbs 4..7 into limbs 0..3 (see first product)
+ mulx r9, rbp, rbp
+ add r15, rbp
+ adc r9, 0
+ mov r10, 9223372036854775807
+ shld r9, r15, 1
+ imul r9, r9, 19
+ and r15, r10
+ xor r10, r10
+ adox r12, r9
+ mulx rdi, r9, rdi
+ adcx r12, r9
+ adox r13, rdi
+ mulx rsi, r9, rsi
+ adcx r13, r9
+ adox r14, rsi
+ mulx rbx, r9, rbx
+ adcx r14, r9
+ adox r15, rbx
+ adcx r15, r10
+ ; Store the 4-limb result at dst+32
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ lea r8, QWORD PTR [rax+32] ; B = src+96 (rax is src+64 here)
+ lea rcx, QWORD PTR [rcx+32] ; output = dst+64
+ mov r11, QWORD PTR [rax]
+ ; Multiply: dst+64 = (src+64) * (src+96); same multiply/reduce sequence as the first product
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, r11
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rax+16]
+ ; A[1] * B[0]
+ mulx r10, r9, QWORD PTR [rax+8]
+ xor rbp, rbp
+ adcx r13, r9
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rax+24]
+ adcx r14, r10
+ ; A[0] * B[1]
+ mulx r10, r9, r11
+ adox r13, r9
+ ; A[2] * B[1]
+ mulx rbx, r9, QWORD PTR [rax+16]
+ adox r14, r10
+ adcx r15, r9
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, rbx
+ adox r15, r9
+ adcx rsi, rbp
+ adox rdi, r10
+ ; A[0] * B[2]
+ mulx r10, r9, r11
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r9
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r9, rdx, QWORD PTR [rax+8]
+ adcx r15, r10
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r9
+ mulx r10, r9, QWORD PTR [rax+8]
+ adcx rdi, r9
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r9, rdx, QWORD PTR [rax+16]
+ adcx rsi, r10
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r9
+ mulx r10, r9, QWORD PTR [rax+24]
+ adox rbx, rbp
+ adcx rbx, r9
+ ; A[0] * B[3]
+ mulx r9, rdx, r11
+ adcx rbp, r10
+ xor r10, r10
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rax+24]
+ adcx rdi, r9
+ mulx r9, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r9
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rax+24]
+ mulx r9, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r9
+ mulx rdx, r9, QWORD PTR [rax+16]
+ adcx rbp, r10
+ adox rsi, r9
+ adox rbx, rdx
+ adox rbp, r10
+ mov rdx, 38 ; fold limbs 4..7 into limbs 0..3 (see first product)
+ mulx r9, rbp, rbp
+ add r15, rbp
+ adc r9, 0
+ mov r10, 9223372036854775807
+ shld r9, r15, 1
+ imul r9, r9, 19
+ and r15, r10
+ xor r10, r10
+ adox r12, r9
+ mulx rdi, r9, rdi
+ adcx r12, r9
+ adox r13, rdi
+ mulx rsi, r9, rsi
+ adcx r13, r9
+ adox r14, rsi
+ mulx rbx, r9, rbx
+ adcx r14, r9
+ adox r15, rbx
+ adcx r15, r10
+ ; Store the 4-limb result at dst+64
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ add rsp, 16 ; release the scratch slots
+ pop rbp ; restore saved registers in reverse push order
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_p1p1_to_p3_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_p2_dbl_avx2 PROC ; In: rcx = dst, rdx = src. Fills the four 32-byte (4 x 64-bit limb) blocks of dst from squares, sums and differences of the blocks at src+0, src+32 and src+64.
+ push r12 ; save every register this routine overwrites; popped in reverse order before ret
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov rax, rdx ; rax = src base, freeing rdx for use as the implicit mulx operand. NOTE(review): the blocks are presumably X/Y/Z(/T) field elements of the point structs - confirm against the C declarations
+ sub rsp, 16 ; two 8-byte scratch slots
+ mov QWORD PTR [rsp], rcx ; spill dst pointer (never reloaded in this routine)
+ mov QWORD PTR [rsp+8], rax ; spill src pointer; reloaded into r8 before the Add step
+ lea rcx, QWORD PTR [rcx+64] ; output = dst+64
+ ; Square: dst+64 = (src+0)^2. The 8-limb square is built in r12,r13,r14,r15,rdi,rsi,rbx,rbp (low to high), then reduced to 4 limbs with the constants 38 and 19 (consistent with arithmetic modulo 2^255 - 19)
+ mov rdx, QWORD PTR [rax]
+ mov r11, QWORD PTR [rax+8]
+ ; A[0] * A[1]
+ mov rbp, rdx ; keep A[0]; rdx is about to be reloaded
+ mulx r14, r13, r11
+ ; A[0] * A[3]
+ mulx rdi, r15, QWORD PTR [rax+24]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rax+16]
+ mulx r10, r9, r11
+ xor r12, r12 ; r12 = 0 and CF = OF = 0: starts two independent carry chains (adcx uses CF, adox uses OF)
+ adox r15, r9
+ ; A[2] * A[3]
+ mulx rbx, rsi, QWORD PTR [rax+24]
+ adox rdi, r10
+ ; A[2] * A[0]
+ mulx r10, r9, rbp
+ adox rsi, r12
+ adcx r14, r9
+ adox rbx, r12
+ ; A[1] * A[3]
+ mov rdx, r11
+ mulx rdx, r9, QWORD PTR [rax+24]
+ adcx r15, r10
+ adcx rdi, r9
+ adcx rsi, rdx
+ adcx rbx, r12
+ ; A[0] * A[0]
+ mov rdx, rbp
+ mulx r9, r12, rdx
+ xor rbp, rbp ; rbp = 0, CF = OF = 0: the adcx chain doubles the cross products, the adox chain adds the squared terms
+ adcx r13, r13
+ ; A[1] * A[1]
+ mov rdx, r11
+ adox r13, r9
+ mulx r10, r9, rdx
+ adcx r14, r14
+ adox r14, r9
+ adcx r15, r15
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rax+16]
+ adox r15, r10
+ mulx r9, r10, rdx
+ adcx rdi, rdi
+ adox rdi, r10
+ adcx rsi, rsi
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rax+24]
+ adox rsi, r9
+ mulx r10, r9, rdx
+ adcx rbx, rbx
+ adox rbx, r9
+ adcx rbp, rbp
+ adox rbp, r10
+ mov rdx, 38 ; multiplier used to fold limbs 4..7 (rdi,rsi,rbx,rbp) into limbs 0..3 (r12..r15)
+ mulx r10, rbp, rbp ; r10:rbp = limb 7 * 38
+ add r15, rbp
+ adc r10, 0
+ mov r9, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF: keeps bits 0..62 of the top limb
+ shld r10, r15, 1 ; r10 = (r10 << 1) | bit 63 of r15, i.e. everything at or above bit 255
+ imul r10, r10, 19 ; that overflow re-enters at limb 0 with weight 19
+ and r15, r9 ; clear bit 255
+ xor r9, r9 ; r9 = 0; reset CF/OF for the final two add chains
+ adox r12, r10
+ mulx rdi, r10, rdi ; rdi:r10 = limb 4 * 38
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi ; rsi:r10 = limb 5 * 38
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx ; rbx:r10 = limb 6 * 38
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r9
+ ; Store the 4-limb result at dst+64
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ lea rax, QWORD PTR [rax+32] ; A = src+32
+ ; Square: r12..r15 = (src+32)^2; same square/reduce sequence as above, result kept in registers
+ mov rdx, QWORD PTR [rax]
+ mov r11, QWORD PTR [rax+8]
+ ; A[0] * A[1]
+ mov rbp, rdx
+ mulx r14, r13, r11
+ ; A[0] * A[3]
+ mulx rdi, r15, QWORD PTR [rax+24]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rax+16]
+ mulx r10, r9, r11
+ xor r12, r12
+ adox r15, r9
+ ; A[2] * A[3]
+ mulx rbx, rsi, QWORD PTR [rax+24]
+ adox rdi, r10
+ ; A[2] * A[0]
+ mulx r10, r9, rbp
+ adox rsi, r12
+ adcx r14, r9
+ adox rbx, r12
+ ; A[1] * A[3]
+ mov rdx, r11
+ mulx rdx, r9, QWORD PTR [rax+24]
+ adcx r15, r10
+ adcx rdi, r9
+ adcx rsi, rdx
+ adcx rbx, r12
+ ; A[0] * A[0]
+ mov rdx, rbp
+ mulx r9, r12, rdx
+ xor rbp, rbp
+ adcx r13, r13
+ ; A[1] * A[1]
+ mov rdx, r11
+ adox r13, r9
+ mulx r10, r9, rdx
+ adcx r14, r14
+ adox r14, r9
+ adcx r15, r15
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rax+16]
+ adox r15, r10
+ mulx r9, r10, rdx
+ adcx rdi, rdi
+ adox rdi, r10
+ adcx rsi, rsi
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rax+24]
+ adox rsi, r9
+ mulx r10, r9, rdx
+ adcx rbx, rbx
+ adox rbx, r9
+ adcx rbp, rbp
+ adox rbp, r10
+ mov rdx, 38 ; fold limbs 4..7 into limbs 0..3 (see first square)
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r9, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r9
+ xor r9, r9
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r9
+ ; Result stays in r12..r15; set up pointers for the Add-Sub below
+ mov rax, rcx ; rax = dst+64, which holds (src+0)^2
+ lea rcx, QWORD PTR [rcx+-32] ; rcx = dst+32
+ ; Add-Sub: dst+32 = (src+32)^2 + (src+0)^2 and dst+64 = (src+32)^2 - (src+0)^2
+ ; Add
+ mov rdi, r12 ; copy the square into rdi,rsi,rbx,rbp for the subtraction
+ add r12, QWORD PTR [rax]
+ mov rsi, r13
+ adc r13, QWORD PTR [rax+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [rax+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [rax+24]
+ mov rdx, 0 ; mov leaves the carry flag from the addition intact
+ adc rdx, 0 ; rdx = carry out of the 256-bit addition
+ shld rdx, r15, 1 ; rdx = (carry << 1) | bit 63 of r15, i.e. everything at or above bit 255
+ imul rdx, 19
+ btr r15, 63 ; clear bit 255
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [rax]
+ sbb rsi, QWORD PTR [rax+8]
+ sbb rbx, QWORD PTR [rax+16]
+ sbb rbp, QWORD PTR [rax+24]
+ sbb rdx, rdx ; rdx = 0 if no borrow, all ones if the subtraction borrowed
+ shld rdx, rbp, 1 ; rdx = (rdx << 1) | bit 63 of rbp
+ imul rdx, -19
+ btr rbp, 63 ; clear bit 255
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12 ; sum -> dst+32
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [rax], rdi ; difference -> dst+64
+ mov QWORD PTR [rax+8], rsi
+ mov QWORD PTR [rax+16], rbx
+ mov QWORD PTR [rax+24], rbp
+ mov r8, QWORD PTR [rsp+8] ; r8 = src base, reloaded from the spill slot
+ lea rax, QWORD PTR [r8+32] ; rax = src+32
+ lea rcx, QWORD PTR [rcx+-32] ; rcx = dst+0
+ ; Add: dst+0 = (src+32) + (src+0)
+ mov r12, QWORD PTR [rax]
+ mov r13, QWORD PTR [rax+8]
+ add r12, QWORD PTR [r8]
+ mov r14, QWORD PTR [rax+16]
+ adc r13, QWORD PTR [r8+8]
+ mov r15, QWORD PTR [rax+24]
+ adc r14, QWORD PTR [r8+16]
+ adc r15, QWORD PTR [r8+24]
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ ; Square: r12..r15 = (dst+0)^2, i.e. the square of the sum just stored; same square/reduce sequence as above
+ mov rdx, QWORD PTR [rcx]
+ mov r11, QWORD PTR [rcx+8]
+ ; A[0] * A[1]
+ mov rbp, rdx
+ mulx r14, r13, r11
+ ; A[0] * A[3]
+ mulx rdi, r15, QWORD PTR [rcx+24]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rcx+16]
+ mulx r10, r9, r11
+ xor r12, r12
+ adox r15, r9
+ ; A[2] * A[3]
+ mulx rbx, rsi, QWORD PTR [rcx+24]
+ adox rdi, r10
+ ; A[2] * A[0]
+ mulx r10, r9, rbp
+ adox rsi, r12
+ adcx r14, r9
+ adox rbx, r12
+ ; A[1] * A[3]
+ mov rdx, r11
+ mulx rdx, r9, QWORD PTR [rcx+24]
+ adcx r15, r10
+ adcx rdi, r9
+ adcx rsi, rdx
+ adcx rbx, r12
+ ; A[0] * A[0]
+ mov rdx, rbp
+ mulx r9, r12, rdx
+ xor rbp, rbp
+ adcx r13, r13
+ ; A[1] * A[1]
+ mov rdx, r11
+ adox r13, r9
+ mulx r10, r9, rdx
+ adcx r14, r14
+ adox r14, r9
+ adcx r15, r15
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rcx+16]
+ adox r15, r10
+ mulx r9, r10, rdx
+ adcx rdi, rdi
+ adox rdi, r10
+ adcx rsi, rsi
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rcx+24]
+ adox rsi, r9
+ mulx r10, r9, rdx
+ adcx rbx, rbx
+ adox rbx, r9
+ adcx rbp, rbp
+ adox rbp, r10
+ mov rdx, 38 ; fold limbs 4..7 into limbs 0..3 (see first square)
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r9, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r9
+ xor r9, r9
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r9
+ ; Result stays in r12..r15 for the subtraction below
+ lea rax, QWORD PTR [rcx+32] ; rax = dst+32
+ ; Sub: dst+0 = (dst+0)^2 - (dst+32)
+ sub r12, QWORD PTR [rax]
+ sbb r13, QWORD PTR [rax+8]
+ sbb r14, QWORD PTR [rax+16]
+ sbb r15, QWORD PTR [rax+24]
+ sbb rdx, rdx
+ shld rdx, r15, 1
+ imul rdx, -19
+ btr r15, 63
+ ; Add modulus (if underflow)
+ sub r12, rdx
+ sbb r13, 0
+ sbb r14, 0
+ sbb r15, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ lea r8, QWORD PTR [r8+64] ; r8 = src+64
+ ; Square * 2: r12..r15 = 2 * (src+64)^2; the square/reduce sequence is followed by a one-bit left shift with its own fold
+ mov rdx, QWORD PTR [r8]
+ mov r11, QWORD PTR [r8+8]
+ ; A[0] * A[1]
+ mov rbp, rdx
+ mulx r14, r13, r11
+ ; A[0] * A[3]
+ mulx rdi, r15, QWORD PTR [r8+24]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, r9, r11
+ xor r12, r12
+ adox r15, r9
+ ; A[2] * A[3]
+ mulx rbx, rsi, QWORD PTR [r8+24]
+ adox rdi, r10
+ ; A[2] * A[0]
+ mulx r10, r9, rbp
+ adox rsi, r12
+ adcx r14, r9
+ adox rbx, r12
+ ; A[1] * A[3]
+ mov rdx, r11
+ mulx rdx, r9, QWORD PTR [r8+24]
+ adcx r15, r10
+ adcx rdi, r9
+ adcx rsi, rdx
+ adcx rbx, r12
+ ; A[0] * A[0]
+ mov rdx, rbp
+ mulx r9, r12, rdx
+ xor rbp, rbp
+ adcx r13, r13
+ ; A[1] * A[1]
+ mov rdx, r11
+ adox r13, r9
+ mulx r10, r9, rdx
+ adcx r14, r14
+ adox r14, r9
+ adcx r15, r15
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [r8+16]
+ adox r15, r10
+ mulx r9, r10, rdx
+ adcx rdi, rdi
+ adox rdi, r10
+ adcx rsi, rsi
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r9
+ mulx r10, r9, rdx
+ adcx rbx, rbx
+ adox rbx, r9
+ adcx rbp, rbp
+ adox rbp, r10
+ mov rdx, 38 ; fold limbs 4..7 into limbs 0..3; this copy uses r11 as the temporary instead of r10
+ mulx r11, rbp, rbp
+ add r15, rbp
+ adc r11, 0
+ mov r9, 9223372036854775807
+ shld r11, r15, 1
+ imul r11, r11, 19
+ and r15, r9
+ xor r9, r9
+ adox r12, r11
+ mulx rdi, r11, rdi
+ adcx r12, r11
+ adox r13, rdi
+ mulx rsi, r11, rsi
+ adcx r13, r11
+ adox r14, rsi
+ mulx rbx, r11, rbx
+ adcx r14, r11
+ adox r15, rbx
+ adcx r15, r9
+ mov r11, r15 ; keep the undoubled top limb
+ shld r15, r14, 1 ; shift the 256-bit value r15:r14:r13:r12 left by one bit
+ shld r14, r13, 1
+ shld r13, r12, 1
+ shl r12, 1
+ mov r9, 9223372036854775807
+ shr r11, 62 ; r11 = top two bits of the undoubled top limb, i.e. the bits that land at or above bit 255 after the shift
+ and r15, r9 ; clear bit 255
+ imul r11, r11, 19 ; fold those bits back in with weight 19
+ add r12, r11
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Result stays in r12..r15 for the subtraction below
+ lea rax, QWORD PTR [rcx+64] ; rax = dst+64
+ lea rcx, QWORD PTR [rcx+96] ; rcx = dst+96
+ ; Sub: dst+96 = 2 * (src+64)^2 - (dst+64)
+ sub r12, QWORD PTR [rax]
+ sbb r13, QWORD PTR [rax+8]
+ sbb r14, QWORD PTR [rax+16]
+ sbb r15, QWORD PTR [rax+24]
+ sbb rdx, rdx
+ shld rdx, r15, 1
+ imul rdx, -19
+ btr r15, 63
+ ; Add modulus (if underflow)
+ sub r12, rdx
+ sbb r13, 0
+ sbb r14, 0
+ sbb r15, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ add rsp, 16 ; release the scratch slots
+ pop rbp ; restore saved registers in reverse push order
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_p2_dbl_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_madd_avx2 PROC ; In: rcx = dst, rdx = first source (p), r8 = second source (q). Fills the four 32-byte (4 x 64-bit limb) blocks of dst from products, sums and differences of blocks of p and q.
+ push r12 ; save every register this routine overwrites; popped in reverse order before ret
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov rax, rdx ; rax = p base, freeing rdx for use as the implicit mulx operand. NOTE(review): p's blocks at +0/+32/+64/+96 are presumably X/Y/Z/T and q's at +0/+32/+64 a precomputed triple - confirm against the C declarations
+ sub rsp, 24 ; three 8-byte scratch slots
+ mov QWORD PTR [rsp], rcx ; spill dst pointer (never reloaded in this routine)
+ mov QWORD PTR [rsp+8], rax ; spill p pointer (never reloaded in this routine)
+ mov QWORD PTR [rsp+16], r8 ; spill q pointer; reloaded before the second multiply
+ lea r9, QWORD PTR [rax+96] ; A = p+96
+ lea r8, QWORD PTR [r8+64] ; B = q+64
+ lea rcx, QWORD PTR [rcx+96] ; output = dst+96
+ ; Multiply: dst+96 = (p+96) * (q+64). The 8-limb product is built in r12,r13,r14,r15,rdi,rsi,rbx,rbp (low to high), then reduced to 4 limbs with the constants 38 and 19 (consistent with arithmetic modulo 2^255 - 19)
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [r9]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [r9+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [r9+8]
+ xor rbp, rbp ; rbp = 0 and CF = OF = 0: starts two independent carry chains (adcx uses CF, adox uses OF)
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [r9+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [r9]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [r9+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [r9]
+ adox rsi, rbp
+ xor rbx, rbx ; rbx = 0 and both carry flags cleared for the next pair of chains
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [r9+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [r9+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [r9+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [r9]
+ adcx rbp, r11
+ xor r11, r11 ; r11 = 0 and both carry flags cleared for the last pair of chains
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [r9+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [r9+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [r9+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38 ; multiplier used to fold limbs 4..7 (rdi,rsi,rbx,rbp) into limbs 0..3 (r12..r15)
+ mulx r10, rbp, rbp ; r10:rbp = limb 7 * 38
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF: keeps bits 0..62 of the top limb
+ shld r10, r15, 1 ; r10 = (r10 << 1) | bit 63 of r15, i.e. everything at or above bit 255
+ imul r10, r10, 19 ; that overflow re-enters at limb 0 with weight 19
+ and r15, r11 ; clear bit 255
+ xor r11, r11 ; r11 = 0; reset CF/OF for the final two add chains
+ adox r12, r10
+ mulx rdi, r10, rdi ; rdi:r10 = limb 4 * 38
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi ; rsi:r10 = limb 5 * 38
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx ; rbx:r10 = limb 6 * 38
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store the 4-limb result at dst+96
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov r9, rax ; r9 = p+0
+ lea r8, QWORD PTR [rax+32] ; r8 = p+32
+ lea rax, QWORD PTR [rcx+-64] ; rax = dst+32
+ lea rcx, QWORD PTR [rcx+-96] ; rcx = dst+0
+ ; Add-Sub: dst+0 = (p+32) + (p+0) and dst+32 = (p+32) - (p+0)
+ ; Add
+ mov r12, QWORD PTR [r8]
+ mov r13, QWORD PTR [r8+8]
+ mov r14, QWORD PTR [r8+16]
+ mov r15, QWORD PTR [r8+24]
+ mov rdi, r12 ; copy the first operand into rdi,rsi,rbx,rbp for the subtraction
+ add r12, QWORD PTR [r9]
+ mov rsi, r13
+ adc r13, QWORD PTR [r9+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r9+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r9+24]
+ mov rdx, 0 ; mov leaves the carry flag from the addition intact
+ adc rdx, 0 ; rdx = carry out of the 256-bit addition
+ shld rdx, r15, 1 ; rdx = (carry << 1) | bit 63 of r15, i.e. everything at or above bit 255
+ imul rdx, 19
+ btr r15, 63 ; clear bit 255
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r9]
+ sbb rsi, QWORD PTR [r9+8]
+ sbb rbx, QWORD PTR [r9+16]
+ sbb rbp, QWORD PTR [r9+24]
+ sbb rdx, rdx ; rdx = 0 if no borrow, all ones if the subtraction borrowed
+ shld rdx, rbp, 1 ; rdx = (rdx << 1) | bit 63 of rbp
+ imul rdx, -19
+ btr rbp, 63 ; clear bit 255
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12 ; sum -> dst+0
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [rax], rdi ; difference -> dst+32
+ mov QWORD PTR [rax+8], rsi
+ mov QWORD PTR [rax+16], rbx
+ mov QWORD PTR [rax+24], rbp
+ mov r8, QWORD PTR [rsp+16] ; r8 = q+0, reloaded from the spill slot
+ ; Multiply: dst+0 = (dst+0) * (q+0), in place; same multiply/reduce sequence as the first product
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [rcx]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rcx+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [rcx+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rcx+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [rcx]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [rcx+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [rcx+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [rcx]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [rcx+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [rcx+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [rcx+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [rcx+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [rcx]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rcx+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rcx+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [rcx+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38 ; fold limbs 4..7 into limbs 0..3 (see first product)
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store the 4-limb result at dst+0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ lea r8, QWORD PTR [r8+32] ; r8 = q+32
+ ; Multiply: dst+32 = (dst+32) * (q+32), in place; same multiply/reduce sequence as the first product
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [rax]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rax+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [rax+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rax+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [rax]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [rax+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [rax+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [rax]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [rax+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [rax+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [rax+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [rax+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [rax]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rax+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rax+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [rax+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38 ; fold limbs 4..7 into limbs 0..3 (see first product)
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store the 4-limb result at dst+32
+ mov QWORD PTR [rax], r12
+ mov QWORD PTR [rax+8], r13
+ mov QWORD PTR [rax+16], r14
+ mov QWORD PTR [rax+24], r15
+ ; Add-Sub: new dst+32 = (dst+0) + (dst+32) and new dst+0 = (dst+0) - (dst+32)
+ ; Add
+ mov r12, QWORD PTR [rcx]
+ mov r13, QWORD PTR [rcx+8]
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+24]
+ mov rdi, r12
+ add r12, QWORD PTR [rax]
+ mov rsi, r13
+ adc r13, QWORD PTR [rax+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [rax+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [rax+24]
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [rax]
+ sbb rsi, QWORD PTR [rax+8]
+ sbb rbx, QWORD PTR [rax+16]
+ sbb rbp, QWORD PTR [rax+24]
+ sbb rdx, rdx
+ shld rdx, rbp, 1
+ imul rdx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rax], r12 ; sum -> dst+32
+ mov QWORD PTR [rax+8], r13
+ mov QWORD PTR [rax+16], r14
+ mov QWORD PTR [rax+24], r15
+ mov QWORD PTR [rcx], rdi ; difference -> dst+0
+ mov QWORD PTR [rcx+8], rsi
+ mov QWORD PTR [rcx+16], rbx
+ mov QWORD PTR [rcx+24], rbp
+ lea r9, QWORD PTR [r9+64] ; r9 = p+64
+ ; Double: r12..r15 = 2 * (p+64), kept in registers
+ mov r12, QWORD PTR [r9]
+ mov r13, QWORD PTR [r9+8]
+ add r12, r12
+ mov r14, QWORD PTR [r9+16]
+ adc r13, r13
+ mov r15, QWORD PTR [r9+24]
+ adc r14, r14
+ adc r15, r15
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ lea rax, QWORD PTR [rcx+96] ; rax = dst+96 (the first product)
+ lea rcx, QWORD PTR [rcx+64] ; rcx = dst+64
+ ; Add-Sub: dst+64 = 2 * (p+64) + (dst+96) and new dst+96 = 2 * (p+64) - (dst+96)
+ ; Add
+ mov rdi, r12
+ add r12, QWORD PTR [rax]
+ mov rsi, r13
+ adc r13, QWORD PTR [rax+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [rax+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [rax+24]
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [rax]
+ sbb rsi, QWORD PTR [rax+8]
+ sbb rbx, QWORD PTR [rax+16]
+ sbb rbp, QWORD PTR [rax+24]
+ sbb rdx, rdx
+ shld rdx, rbp, 1
+ imul rdx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12 ; sum -> dst+64
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [rax], rdi ; difference -> dst+96
+ mov QWORD PTR [rax+8], rsi
+ mov QWORD PTR [rax+16], rbx
+ mov QWORD PTR [rax+24], rbp
+ add rsp, 24 ; release the scratch slots
+ pop rbp ; restore saved registers in reverse push order
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_madd_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_msub_avx2 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov rax, rdx
+ sub rsp, 24
+ mov QWORD PTR [rsp], rcx
+ mov QWORD PTR [rsp+8], rax
+ mov QWORD PTR [rsp+16], r8
+ lea r9, QWORD PTR [rax+96]
+ lea r8, QWORD PTR [r8+64]
+ lea rcx, QWORD PTR [rcx+96]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [r9]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [r9+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [r9+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [r9+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [r9]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [r9+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [r9]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [r9+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [r9+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [r9+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [r9]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [r9+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [r9+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [r9+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov r9, rax
+ lea r8, QWORD PTR [rax+32]
+ lea rax, QWORD PTR [rcx+-64]
+ lea rcx, QWORD PTR [rcx+-96]
+ ; Add-Sub
+ ; Add
+ mov r12, QWORD PTR [r8]
+ mov r13, QWORD PTR [r8+8]
+ mov r14, QWORD PTR [r8+16]
+ mov r15, QWORD PTR [r8+24]
+ mov rdi, r12
+ add r12, QWORD PTR [r9]
+ mov rsi, r13
+ adc r13, QWORD PTR [r9+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r9+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r9+24]
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r9]
+ sbb rsi, QWORD PTR [r9+8]
+ sbb rbx, QWORD PTR [r9+16]
+ sbb rbp, QWORD PTR [r9+24]
+ sbb rdx, rdx
+ shld rdx, rbp, 1
+ imul rdx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [rax], rdi
+ mov QWORD PTR [rax+8], rsi
+ mov QWORD PTR [rax+16], rbx
+ mov QWORD PTR [rax+24], rbp
+ mov r8, QWORD PTR [rsp+16]
+ lea r8, QWORD PTR [r8+32]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [rcx]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rcx+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [rcx+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rcx+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [rcx]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [rcx+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [rcx+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [rcx]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [rcx+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [rcx+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [rcx+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [rcx+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [rcx]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rcx+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rcx+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [rcx+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ lea r8, QWORD PTR [r8+-32]
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [rax]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rax+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [rax+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rax+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [rax]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [rax+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [rax+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [rax]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [rax+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [rax+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [rax+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [rax+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [rax]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rax+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rax+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [rax+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store
+ mov QWORD PTR [rax], r12
+ mov QWORD PTR [rax+8], r13
+ mov QWORD PTR [rax+16], r14
+ mov QWORD PTR [rax+24], r15
+ ; Add-Sub
+ ; Add
+ mov r12, QWORD PTR [rcx]
+ mov r13, QWORD PTR [rcx+8]
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+24]
+ mov rdi, r12
+ add r12, QWORD PTR [rax]
+ mov rsi, r13
+ adc r13, QWORD PTR [rax+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [rax+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [rax+24]
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [rax]
+ sbb rsi, QWORD PTR [rax+8]
+ sbb rbx, QWORD PTR [rax+16]
+ sbb rbp, QWORD PTR [rax+24]
+ sbb rdx, rdx
+ shld rdx, rbp, 1
+ imul rdx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rax], r12
+ mov QWORD PTR [rax+8], r13
+ mov QWORD PTR [rax+16], r14
+ mov QWORD PTR [rax+24], r15
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+8], rsi
+ mov QWORD PTR [rcx+16], rbx
+ mov QWORD PTR [rcx+24], rbp
+ lea r9, QWORD PTR [r9+64]
+ ; Double
+ mov r12, QWORD PTR [r9]
+ mov r13, QWORD PTR [r9+8]
+ add r12, r12
+ mov r14, QWORD PTR [r9+16]
+ adc r13, r13
+ mov r15, QWORD PTR [r9+24]
+ adc r14, r14
+ adc r15, r15
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ lea rax, QWORD PTR [rcx+96]
+ lea rcx, QWORD PTR [rcx+64]
+ ; Add-Sub
+ ; Add
+ mov rdi, r12
+ add r12, QWORD PTR [rax]
+ mov rsi, r13
+ adc r13, QWORD PTR [rax+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [rax+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [rax+24]
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [rax]
+ sbb rsi, QWORD PTR [rax+8]
+ sbb rbx, QWORD PTR [rax+16]
+ sbb rbp, QWORD PTR [rax+24]
+ sbb rdx, rdx
+ shld rdx, rbp, 1
+ imul rdx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rax], r12
+ mov QWORD PTR [rax+8], r13
+ mov QWORD PTR [rax+16], r14
+ mov QWORD PTR [rax+24], r15
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+8], rsi
+ mov QWORD PTR [rcx+16], rbx
+ mov QWORD PTR [rcx+24], rbp
+ add rsp, 24
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_msub_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_add_avx2 PROC ; Combines the field elements of p and q into r. Args: rcx = r (output), rdx = p, r8 = q (the first three integer argument registers of the Microsoft x64 convention)
+ push r12 ; r12-r15, rdi, rsi, rbx and rbp are all used as scratch below, so they are saved here and restored before ret
+ push r13 ; Notation: x+N is the 32-byte field element (four 64-bit limbs) at byte offset N from pointer x; r, p and q are each accessed at offsets 0, 32, 64, 96
+ push r14 ; Result: r+0 = A - B, r+32 = A + B, r+64 = D + C, r+96 = D - C, where
+ push r15 ;   A = ((p+32) + (p+0)) * (q+0) and B = ((p+32) - (p+0)) * (q+32),
+ push rdi ;   C = (p+96) * (q+96) and D = 2 * (p+64) * (q+64)
+ push rsi ; Every step folds overflow back in with the constants 19 and 38, i.e. works modulo 2^255 - 19
+ push rbx ; NOTE(review): this has the shape of extended-coordinate Edwards point addition with q in a cached form;
+ push rbp ;   the struct field names are not visible in this file - confirm against the C declarations
+ mov rax, rdx ; rax = p; rdx itself is needed as the implicit source operand of mulx
+ sub rsp, 24 ; reserve three 8-byte slots for the incoming pointers
+ mov QWORD PTR [rsp], rcx ; [rsp] = r (saved; not read back in this procedure)
+ mov QWORD PTR [rsp+8], rax ; [rsp+8] = p (saved; not read back in this procedure)
+ mov QWORD PTR [rsp+16], r8 ; [rsp+16] = q (reloaded before the second multiply)
+ lea r9, QWORD PTR [rax+96] ; r9 = p+96
+ lea r8, QWORD PTR [r8+96] ; r8 = q+96
+ lea rcx, QWORD PTR [rcx+96] ; rcx = r+96
+ ; Multiply: r+96 = (p+96) * (q+96); low limbs accumulate in r12-r15, high limbs in rdi, rsi, rbx, rbp
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [r9]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [r9+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [r9+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [r9+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [r9]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [r9+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [r9]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [r9+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [r9+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [r9+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [r9]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [r9+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [r9+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [r9+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38 ; reduction: each high limb is multiplied by 38 and added to the low half (2^256 = 38 modulo 2^255 - 19)
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF: mask that clears bit 63 of the top limb
+ shld r10, r15, 1 ; r10 = everything at or above bit 255 (overflow word shifted left, plus bit 63 of r15)
+ imul r10, r10, 19 ; each unit of 2^255 is worth 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store the reduced product at r+96
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov r9, rax ; r9 = p+0
+ lea r8, QWORD PTR [rax+32] ; r8 = p+32
+ lea rax, QWORD PTR [rcx+-64] ; rax = r+32
+ lea rcx, QWORD PTR [rcx+-96] ; rcx = r+0
+ ; Add-Sub: r+0 = (p+32) + (p+0) and r+32 = (p+32) - (p+0)
+ ; Add
+ mov r12, QWORD PTR [r8]
+ mov r13, QWORD PTR [r8+8]
+ mov r14, QWORD PTR [r8+16]
+ mov r15, QWORD PTR [r8+24]
+ mov rdi, r12
+ add r12, QWORD PTR [r9]
+ mov rsi, r13
+ adc r13, QWORD PTR [r9+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r9+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r9+24]
+ mov rdx, 0 ; mov does not modify flags, so the carry from the add chain survives
+ adc rdx, 0 ; rdx = carry out of the 256-bit add
+ shld rdx, r15, 1 ; rdx = 2 * carry + bit 63 of the top limb
+ imul rdx, 19 ; each unit of 2^255 is worth 19
+ btr r15, 63 ; clear bit 63 of the top limb (bit 255 overall)
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r9]
+ sbb rsi, QWORD PTR [r9+8]
+ sbb rbx, QWORD PTR [r9+16]
+ sbb rbp, QWORD PTR [r9+24]
+ sbb rdx, rdx ; rdx = -1 if the subtraction borrowed, otherwise 0
+ shld rdx, rbp, 1 ; shift in bit 63 of the top limb
+ imul rdx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [rax], rdi
+ mov QWORD PTR [rax+8], rsi
+ mov QWORD PTR [rax+16], rbx
+ mov QWORD PTR [rax+24], rbp
+ mov r8, QWORD PTR [rsp+16] ; r8 = q+0 (original q pointer reloaded from the stack)
+ ; Multiply: r+0 = (r+0) * (q+0)
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [rcx]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rcx+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [rcx+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rcx+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [rcx]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [rcx+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [rcx+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [rcx]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [rcx+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [rcx+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [rcx+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [rcx+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [rcx]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rcx+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rcx+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [rcx+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38 ; reduction: fold the high limbs (rdi, rsi, rbx, rbp) into r12-r15 with factor 38, bit 255 with factor 19
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store the reduced product back at r+0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ lea r8, QWORD PTR [r8+32] ; r8 = q+32
+ ; Multiply: r+32 = (r+32) * (q+32)
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [rax]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rax+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [rax+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rax+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [rax]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [rax+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [rax+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [rax]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [rax+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [rax+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [rax+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [rax+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [rax]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rax+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rax+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [rax+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38 ; reduction: fold the high limbs (rdi, rsi, rbx, rbp) into r12-r15 with factor 38, bit 255 with factor 19
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store the reduced product back at r+32
+ mov QWORD PTR [rax], r12
+ mov QWORD PTR [rax+8], r13
+ mov QWORD PTR [rax+16], r14
+ mov QWORD PTR [rax+24], r15
+ lea r9, QWORD PTR [r9+64] ; r9 = p+64
+ lea r8, QWORD PTR [r8+32] ; r8 = q+64
+ ; Multiply: (p+64) * (q+64), result left in r12-r15
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [r9]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [r9+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [r9+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [r9+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [r9]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [r9+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [r9]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [r9+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [r9+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [r9+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [r9]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [r9+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [r9+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [r9+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38 ; reduction: fold the high limbs (rdi, rsi, rbx, rbp) into r12-r15 with factor 38, bit 255 with factor 19
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store (deferred): the product is doubled in registers first and written to r+64 after the Double step
+ lea rcx, QWORD PTR [rcx+64] ; rcx = r+64
+ ; Double: r12-r15 = 2 * (p+64) * (q+64), with the overflow folded back in
+ add r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ mov QWORD PTR [rcx], r12 ; r+64 = doubled product
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ lea rcx, QWORD PTR [rcx+-64] ; rcx = r+0 (rax is still r+32)
+ ; Add-Sub: r+32 = (r+0) + (r+32) and r+0 = (r+0) - (r+32)
+ ; Add
+ mov r12, QWORD PTR [rcx]
+ mov r13, QWORD PTR [rcx+8]
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+24]
+ mov rdi, r12
+ add r12, QWORD PTR [rax]
+ mov rsi, r13
+ adc r13, QWORD PTR [rax+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [rax+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [rax+24]
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [rax]
+ sbb rsi, QWORD PTR [rax+8]
+ sbb rbx, QWORD PTR [rax+16]
+ sbb rbp, QWORD PTR [rax+24]
+ sbb rdx, rdx
+ shld rdx, rbp, 1
+ imul rdx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rax], r12
+ mov QWORD PTR [rax+8], r13
+ mov QWORD PTR [rax+16], r14
+ mov QWORD PTR [rax+24], r15
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+8], rsi
+ mov QWORD PTR [rcx+16], rbx
+ mov QWORD PTR [rcx+24], rbp
+ lea rax, QWORD PTR [rcx+96] ; rax = r+96
+ lea rcx, QWORD PTR [rcx+64] ; rcx = r+64
+ ; Add-Sub: r+64 = (r+64) + (r+96) and r+96 = (r+64) - (r+96)
+ ; Add
+ mov r12, QWORD PTR [rcx]
+ mov r13, QWORD PTR [rcx+8]
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+24]
+ mov rdi, r12
+ add r12, QWORD PTR [rax]
+ mov rsi, r13
+ adc r13, QWORD PTR [rax+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [rax+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [rax+24]
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [rax]
+ sbb rsi, QWORD PTR [rax+8]
+ sbb rbx, QWORD PTR [rax+16]
+ sbb rbp, QWORD PTR [rax+24]
+ sbb rdx, rdx
+ shld rdx, rbp, 1
+ imul rdx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [rax], rdi
+ mov QWORD PTR [rax+8], rsi
+ mov QWORD PTR [rax+16], rbx
+ mov QWORD PTR [rax+24], rbp
+ add rsp, 24 ; release the three pointer slots
+ pop rbp ; restore the saved registers in reverse push order
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_add_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ge_sub_avx2 PROC ; Combines the field elements of p and q into r (subtraction-shaped variant). Args: rcx = r (output), rdx = p, r8 = q (the first three integer argument registers of the Microsoft x64 convention)
+ push r12 ; r12-r15, rdi, rsi, rbx and rbp are all used as scratch below, so they are saved here and restored before ret
+ push r13 ; Notation: x+N is the 32-byte field element (four 64-bit limbs) at byte offset N from pointer x; r, p and q are each accessed at offsets 0, 32, 64, 96
+ push r14 ; Result: r+0 = A - B, r+32 = A + B, r+64 = D - C, r+96 = D + C, where
+ push r15 ;   A = ((p+32) + (p+0)) * (q+32) and B = ((p+32) - (p+0)) * (q+0),
+ push rdi ;   C = (p+96) * (q+96) and D = 2 * (p+64) * (q+64)
+ push rsi ; Every step folds overflow back in with the constants 19 and 38, i.e. works modulo 2^255 - 19
+ push rbx ; NOTE(review): this has the shape of extended-coordinate Edwards point subtraction with q in a cached form;
+ push rbp ;   the struct field names are not visible in this file - confirm against the C declarations
+ mov rax, rdx ; rax = p; rdx itself is needed as the implicit source operand of mulx
+ sub rsp, 24 ; reserve three 8-byte slots for the incoming pointers
+ mov QWORD PTR [rsp], rcx ; [rsp] = r (saved; not read back in this procedure)
+ mov QWORD PTR [rsp+8], rax ; [rsp+8] = p (saved; not read back in this procedure)
+ mov QWORD PTR [rsp+16], r8 ; [rsp+16] = q (reloaded before the second multiply)
+ lea r9, QWORD PTR [rax+96] ; r9 = p+96
+ lea r8, QWORD PTR [r8+96] ; r8 = q+96
+ lea rcx, QWORD PTR [rcx+96] ; rcx = r+96
+ ; Multiply: r+96 = (p+96) * (q+96); low limbs accumulate in r12-r15, high limbs in rdi, rsi, rbx, rbp
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [r9]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [r9+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [r9+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [r9+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [r9]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [r9+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [r9]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [r9+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [r9+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [r9+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [r9]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [r9+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [r9+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [r9+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38 ; reduction: each high limb is multiplied by 38 and added to the low half (2^256 = 38 modulo 2^255 - 19)
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807 ; 0x7FFFFFFFFFFFFFFF: mask that clears bit 63 of the top limb
+ shld r10, r15, 1 ; r10 = everything at or above bit 255 (overflow word shifted left, plus bit 63 of r15)
+ imul r10, r10, 19 ; each unit of 2^255 is worth 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store the reduced product at r+96
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov r9, rax ; r9 = p+0
+ lea r8, QWORD PTR [rax+32] ; r8 = p+32
+ lea rax, QWORD PTR [rcx+-64] ; rax = r+32
+ lea rcx, QWORD PTR [rcx+-96] ; rcx = r+0
+ ; Add-Sub: r+0 = (p+32) + (p+0) and r+32 = (p+32) - (p+0)
+ ; Add
+ mov r12, QWORD PTR [r8]
+ mov r13, QWORD PTR [r8+8]
+ mov r14, QWORD PTR [r8+16]
+ mov r15, QWORD PTR [r8+24]
+ mov rdi, r12
+ add r12, QWORD PTR [r9]
+ mov rsi, r13
+ adc r13, QWORD PTR [r9+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [r9+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [r9+24]
+ mov rdx, 0 ; mov does not modify flags, so the carry from the add chain survives
+ adc rdx, 0 ; rdx = carry out of the 256-bit add
+ shld rdx, r15, 1 ; rdx = 2 * carry + bit 63 of the top limb
+ imul rdx, 19 ; each unit of 2^255 is worth 19
+ btr r15, 63 ; clear bit 63 of the top limb (bit 255 overall)
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [r9]
+ sbb rsi, QWORD PTR [r9+8]
+ sbb rbx, QWORD PTR [r9+16]
+ sbb rbp, QWORD PTR [r9+24]
+ sbb rdx, rdx ; rdx = -1 if the subtraction borrowed, otherwise 0
+ shld rdx, rbp, 1 ; shift in bit 63 of the top limb
+ imul rdx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [rax], rdi
+ mov QWORD PTR [rax+8], rsi
+ mov QWORD PTR [rax+16], rbx
+ mov QWORD PTR [rax+24], rbp
+ mov r8, QWORD PTR [rsp+16] ; r8 = q+0 (original q pointer reloaded from the stack)
+ lea r8, QWORD PTR [r8+32] ; r8 = q+32
+ ; Multiply: r+0 = (r+0) * (q+32)
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [rcx]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rcx+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [rcx+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rcx+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [rcx]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [rcx+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [rcx+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [rcx]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [rcx+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [rcx+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [rcx+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [rcx+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [rcx]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rcx+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rcx+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [rcx+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38 ; reduction: fold the high limbs (rdi, rsi, rbx, rbp) into r12-r15 with factor 38, bit 255 with factor 19
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store the reduced product back at r+0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ lea r8, QWORD PTR [r8+-32] ; r8 = q+0
+ ; Multiply: r+32 = (r+32) * (q+0)
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [rax]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [rax+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [rax+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [rax+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [rax]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [rax+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [rax+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [rax]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [rax+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [rax+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [rax+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [rax+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [rax]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [rax+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [rax+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [rax+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38 ; reduction: fold the high limbs (rdi, rsi, rbx, rbp) into r12-r15 with factor 38, bit 255 with factor 19
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store the reduced product back at r+32
+ mov QWORD PTR [rax], r12
+ mov QWORD PTR [rax+8], r13
+ mov QWORD PTR [rax+16], r14
+ mov QWORD PTR [rax+24], r15
+ lea r9, QWORD PTR [r9+64] ; r9 = p+64
+ lea r8, QWORD PTR [r8+64] ; r8 = q+64
+ ; Multiply: (p+64) * (q+64), result left in r12-r15
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r8]
+ mulx r13, r12, QWORD PTR [r9]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [r9+16]
+ ; A[1] * B[0]
+ mulx r11, r10, QWORD PTR [r9+8]
+ xor rbp, rbp
+ adcx r13, r10
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx rsi, rdi, QWORD PTR [r9+24]
+ adcx r14, r11
+ ; A[0] * B[1]
+ mulx r11, r10, QWORD PTR [r9]
+ adox r13, r10
+ ; A[2] * B[1]
+ mulx rbx, r10, QWORD PTR [r9+16]
+ adox r14, r11
+ adcx r15, r10
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, rbx
+ adox r15, r10
+ adcx rsi, rbp
+ adox rdi, r11
+ ; A[0] * B[2]
+ mulx r11, r10, QWORD PTR [r9]
+ adox rsi, rbp
+ xor rbx, rbx
+ adcx r14, r10
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r8+8]
+ mulx r10, rdx, QWORD PTR [r9+8]
+ adcx r15, r11
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox r15, r10
+ mulx r11, r10, QWORD PTR [r9+8]
+ adcx rdi, r10
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r8+16]
+ mulx r10, rdx, QWORD PTR [r9+16]
+ adcx rsi, r11
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adox rsi, r10
+ mulx r11, r10, QWORD PTR [r9+24]
+ adox rbx, rbp
+ adcx rbx, r10
+ ; A[0] * B[3]
+ mulx r10, rdx, QWORD PTR [r9]
+ adcx rbp, r11
+ xor r11, r11
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [r9+24]
+ adcx rdi, r10
+ mulx r10, rdx, QWORD PTR [r8]
+ adox r15, rdx
+ adox rdi, r10
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [r9+24]
+ mulx r10, rdx, QWORD PTR [r8+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r8+24]
+ adcx rbx, r10
+ mulx rdx, r10, QWORD PTR [r9+16]
+ adcx rbp, r11
+ adox rsi, r10
+ adox rbx, rdx
+ adox rbp, r11
+ mov rdx, 38 ; reduction: fold the high limbs (rdi, rsi, rbx, rbp) into r12-r15 with factor 38, bit 255 with factor 19
+ mulx r10, rbp, rbp
+ add r15, rbp
+ adc r10, 0
+ mov r11, 9223372036854775807
+ shld r10, r15, 1
+ imul r10, r10, 19
+ and r15, r11
+ xor r11, r11
+ adox r12, r10
+ mulx rdi, r10, rdi
+ adcx r12, r10
+ adox r13, rdi
+ mulx rsi, r10, rsi
+ adcx r13, r10
+ adox r14, rsi
+ mulx rbx, r10, rbx
+ adcx r14, r10
+ adox r15, rbx
+ adcx r15, r11
+ ; Store (deferred): the product is doubled in registers first and written to r+64 after the Double step
+ lea rcx, QWORD PTR [rcx+64] ; rcx = r+64
+ ; Double: r12-r15 = 2 * (p+64) * (q+64), with the overflow folded back in
+ add r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ mov QWORD PTR [rcx], r12 ; r+64 = doubled product
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ lea rcx, QWORD PTR [rcx+-64] ; rcx = r+0 (rax is still r+32)
+ ; Add-Sub: r+32 = (r+0) + (r+32) and r+0 = (r+0) - (r+32)
+ ; Add
+ mov r12, QWORD PTR [rcx]
+ mov r13, QWORD PTR [rcx+8]
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+24]
+ mov rdi, r12
+ add r12, QWORD PTR [rax]
+ mov rsi, r13
+ adc r13, QWORD PTR [rax+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [rax+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [rax+24]
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [rax]
+ sbb rsi, QWORD PTR [rax+8]
+ sbb rbx, QWORD PTR [rax+16]
+ sbb rbp, QWORD PTR [rax+24]
+ sbb rdx, rdx
+ shld rdx, rbp, 1
+ imul rdx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rax], r12
+ mov QWORD PTR [rax+8], r13
+ mov QWORD PTR [rax+16], r14
+ mov QWORD PTR [rax+24], r15
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+8], rsi
+ mov QWORD PTR [rcx+16], rbx
+ mov QWORD PTR [rcx+24], rbp
+ lea rax, QWORD PTR [rcx+64] ; rax = r+64
+ lea rcx, QWORD PTR [rcx+96] ; rcx = r+96
+ ; Add-Sub: r+96 = (r+64) + (r+96) and r+64 = (r+64) - (r+96)
+ ; Add
+ mov r12, QWORD PTR [rax]
+ mov r13, QWORD PTR [rax+8]
+ mov r14, QWORD PTR [rax+16]
+ mov r15, QWORD PTR [rax+24]
+ mov rdi, r12
+ add r12, QWORD PTR [rcx]
+ mov rsi, r13
+ adc r13, QWORD PTR [rcx+8]
+ mov rbx, r14
+ adc r14, QWORD PTR [rcx+16]
+ mov rbp, r15
+ adc r15, QWORD PTR [rcx+24]
+ mov rdx, 0
+ adc rdx, 0
+ shld rdx, r15, 1
+ imul rdx, 19
+ btr r15, 63
+ ; Sub modulus (if overflow)
+ add r12, rdx
+ adc r13, 0
+ adc r14, 0
+ adc r15, 0
+ ; Sub
+ sub rdi, QWORD PTR [rcx]
+ sbb rsi, QWORD PTR [rcx+8]
+ sbb rbx, QWORD PTR [rcx+16]
+ sbb rbp, QWORD PTR [rcx+24]
+ sbb rdx, rdx
+ shld rdx, rbp, 1
+ imul rdx, -19
+ btr rbp, 63
+ ; Add modulus (if underflow)
+ sub rdi, rdx
+ sbb rsi, 0
+ sbb rbx, 0
+ sbb rbp, 0
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [rax], rdi
+ mov QWORD PTR [rax+8], rsi
+ mov QWORD PTR [rax+16], rbx
+ mov QWORD PTR [rax+24], rbp
+ add rsp, 24 ; release the three pointer slots
+ pop rbp ; restore the saved registers in reverse push order
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+ge_sub_avx2 ENDP
+_TEXT ENDS
+IFDEF HAVE_ED25519
+_TEXT SEGMENT READONLY PARA
+fe_sq2_avx2 PROC ; Win64 entry: rcx = result (4 qwords), rdx = input A (4 qwords); stores 2 * A^2 with overflow above bit 255 folded back in times 19
+ push rbx ; save the non-volatile registers this routine uses as scratch
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ mov rdi, rcx ; rdi = result pointer
+ mov rsi, rdx ; rsi = input pointer (rdx is needed as the implicit mulx multiplicand)
+ ; Square * 2
+ mov rdx, QWORD PTR [rsi]
+ mov rax, QWORD PTR [rsi+8] ; rax = A[1], kept for the later products
+ ; A[0] * A[1]
+ mov r15, rdx ; r15 = A[0], kept for the later products
+ mulx r10, r9, rax
+ ; A[0] * A[3]
+ mulx r12, r11, QWORD PTR [rsi+24]
+ ; A[2] * A[1]
+ mov rdx, QWORD PTR [rsi+16]
+ mulx rbx, rcx, rax
+ xor r8, r8 ; r8 = 0; also clears CF and OF before the adcx/adox chains
+ adox r11, rcx
+ ; A[2] * A[3]
+ mulx r14, r13, QWORD PTR [rsi+24]
+ adox r12, rbx
+ ; A[2] * A[0]
+ mulx rbx, rcx, r15
+ adox r13, r8
+ adcx r10, rcx
+ adox r14, r8
+ ; A[1] * A[3]
+ mov rdx, rax
+ mulx rdx, rcx, QWORD PTR [rsi+24]
+ adcx r11, rbx
+ adcx r12, rcx
+ adcx r13, rdx
+ adcx r14, r8 ; r9..r14 now hold the sum of the cross products A[i]*A[j], i < j
+ ; A[0] * A[0]
+ mov rdx, r15
+ mulx rcx, r8, rdx ; r8 = low word of A[0]^2 (word 0 of the square)
+ xor r15, r15 ; r15 = 0; clears CF and OF again
+ adcx r9, r9 ; CF chain doubles the cross products, OF chain adds the squares
+ ; A[1] * A[1]
+ mov rdx, rax
+ adox r9, rcx
+ mulx rbx, rcx, rdx
+ adcx r10, r10
+ adox r10, rcx
+ adcx r11, r11
+ ; A[2] * A[2]
+ mov rdx, QWORD PTR [rsi+16]
+ adox r11, rbx
+ mulx rcx, rbx, rdx
+ adcx r12, r12
+ adox r12, rbx
+ adcx r13, r13
+ ; A[3] * A[3]
+ mov rdx, QWORD PTR [rsi+24]
+ adox r13, rcx
+ mulx rbx, rcx, rdx
+ adcx r14, r14
+ adox r14, rcx
+ adcx r15, r15
+ adox r15, rbx ; r8..r15 = the 8-word square, r8 lowest
+ mov rdx, 38 ; multiplier used to fold words 4..7 onto words 0..3 (38 = 2 * 19)
+ mulx rax, r15, r15 ; rax:r15 = 38 * word 7
+ add r11, r15 ; fold word 7 onto word 3
+ adc rax, 0
+ mov rcx, 9223372036854775807 ; 2^63 - 1: mask that clears bit 63 of the top word
+ shld rax, r11, 1 ; rax = everything at and above bit 63 of word 3
+ imul rax, rax, 19 ; that overflow re-enters at word 0 multiplied by 19
+ and r11, rcx
+ xor rcx, rcx ; rcx = 0; clears CF and OF for the next chains
+ adox r8, rax
+ mulx r12, rax, r12 ; 38 * word 4 -> words 0/1
+ adcx r8, rax
+ adox r9, r12
+ mulx r13, rax, r13 ; 38 * word 5 -> words 1/2
+ adcx r9, rax
+ adox r10, r13
+ mulx r14, rax, r14 ; 38 * word 6 -> words 2/3
+ adcx r10, rax
+ adox r11, r14
+ adcx r11, rcx
+ mov rax, r11 ; keep the top word as it was before doubling
+ shld r11, r10, 1 ; shift the 4-word value left by one bit (the "* 2")
+ shld r10, r9, 1
+ shld r9, r8, 1
+ shl r8, 1
+ mov rcx, 9223372036854775807 ; 2^63 - 1 again
+ shr rax, 62 ; top two bits of the pre-doubling top word
+ and r11, rcx ; clear bit 63 of the doubled top word
+ imul rax, rax, 19 ; fold those bits back in times 19
+ add r8, rax
+ adc r9, 0
+ adc r10, 0
+ adc r11, 0
+ ; Store
+ mov QWORD PTR [rdi], r8
+ mov QWORD PTR [rdi+8], r9
+ mov QWORD PTR [rdi+16], r10
+ mov QWORD PTR [rdi+24], r11
+ pop rsi ; restore in reverse order of the pushes
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+fe_sq2_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+sc_reduce_avx2 PROC ; Win64 entry: rcx = s; reads 8 qwords at s, reduces them with respect to the "order" named in the comments below, and writes 4 qwords back to s
+ push r12 ; save the non-volatile registers this routine uses as scratch
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov r8, rcx ; r8 = s (rcx is reused as scratch below)
+ mov r9, QWORD PTR [r8] ; r9..r12 = input words 0..3
+ mov r10, QWORD PTR [r8+8]
+ mov r11, QWORD PTR [r8+16]
+ mov r12, QWORD PTR [r8+24]
+ mov r13, QWORD PTR [r8+32] ; r13, r14, r15, rdi = input words 4..7
+ mov r14, QWORD PTR [r8+40]
+ mov r15, QWORD PTR [r8+48]
+ mov rdi, QWORD PTR [r8+56]
+ mov rax, rdi
+ mov rcx, 1152921504606846975 ; 2^60 - 1
+ shr rax, 56 ; rax = top byte of the input (bits 504..511)
+ shld rdi, r15, 4 ; shift words 3..7 left by 4 bits so r13..rdi start at bit 252
+ shld r15, r14, 4
+ shld r14, r13, 4
+ shld r13, r12, 4
+ and r12, rcx ; r12 keeps only bits 192..251
+ and rdi, rcx ; drop the bits that were captured in rax
+ ; Add order times bits 504..511
+ sub r15, rax
+ sbb rdi, 0
+ mov rdx, 16942830013509034793 ; multiplier annotated "* -14def9dea2f79cd7" further down
+ mulx rcx, rsi, rax
+ mov rdx, 12100500283911187475 ; multiplier annotated "* -5812631a5cf5d3ed" further down
+ add r14, rsi
+ mulx rbx, rsi, rax
+ adc rcx, 0
+ add r13, rsi
+ adc r14, rbx
+ adc r15, rcx
+ adc rdi, 0
+ ; Sub product of top 4 words and order
+ mov rdx, 12100500283911187475 ; first multiplier: products with r13, r15, r14, rdi accumulate into r9..r12, rsi
+ mulx rax, rcx, r13
+ add r9, rcx
+ adc r10, rax
+ mulx rax, rcx, r15
+ adc r11, rcx
+ adc r12, rax
+ mov rsi, 0 ; mov (not xor) so the carry flag survives
+ adc rsi, 0
+ mulx rax, rcx, r14
+ add r10, rcx
+ adc r11, rax
+ mulx rax, rcx, rdi
+ adc r12, rcx
+ adc rsi, rax
+ mov rdx, 16942830013509034793 ; second multiplier: same pattern one word higher, into r10..r12, rsi, rbx
+ mulx rax, rcx, r13
+ add r10, rcx
+ adc r11, rax
+ mulx rax, rcx, r15
+ adc r12, rcx
+ adc rsi, rax
+ mov rbx, 0 ; mov (not xor) so the carry flag survives
+ adc rbx, 0
+ mulx rax, rcx, r14
+ add r11, rcx
+ adc r12, rax
+ mulx rax, rcx, rdi
+ adc rsi, rcx
+ adc rbx, rax
+ sub r11, r13 ; subtract the top words themselves, two words up
+ mov r13, rsi ; r13, r14 become words 4 and 5 of the running value
+ sbb r12, r14
+ mov r14, rbx
+ sbb r13, r15
+ sbb r14, rdi
+ mov rax, r14
+ sar rax, 57 ; AND mask for the correction below (presumably 0 or all ones - TODO confirm range of r14)
+ ; Conditionally subtract order starting at bit 125
+ mov rsi, 11529215046068469760 ; four constant words, each ANDed with the mask before use
+ mov rbx, 14628338529006959229
+ mov rbp, 187989257525064602
+ mov rcx, 144115188075855872 ; 2^57
+ and rsi, rax
+ and rbx, rax
+ and rbp, rax
+ and rcx, rax
+ add r10, rsi ; the masked words are applied by addition, starting at word 1
+ adc r11, rbx
+ adc r12, rbp
+ adc r13, 0
+ adc r14, rcx
+ ; Move bits 252-376 to own registers
+ mov rax, 1152921504606846975 ; 2^60 - 1
+ shld r14, r13, 4
+ shld r13, r12, 4
+ and r12, rax
+ ; Sub product of top 2 words and order
+ ; * -5812631a5cf5d3ed
+ mov rdx, 12100500283911187475
+ mulx rax, rbp, r13
+ mov rsi, 0 ; rsi collects the overflow of this group
+ add r9, rbp
+ adc r10, rax
+ mulx rax, rbp, r14
+ adc rsi, 0
+ add r10, rbp
+ adc rsi, rax
+ ; * -14def9dea2f79cd7
+ mov rdx, 16942830013509034793
+ mulx rax, rbp, r13
+ mov rbx, 0 ; rbx collects the overflow of this group
+ add r10, rbp
+ adc r11, rax
+ mulx rax, rbp, r14
+ adc rbx, 0
+ add r11, rbp
+ adc rbx, rax
+ ; Add overflows at 2 * 64
+ mov rcx, 1152921504606846975 ; 2^60 - 1
+ and r12, rcx
+ add r11, rsi
+ adc r12, rbx
+ ; Subtract top at 2 * 64
+ sub r11, r13
+ sbb r12, r14
+ sbb rcx, rcx ; rcx = 0 or all ones depending on the final borrow
+ ; Conditional sub order
+ mov rsi, 6346243789798364141 ; three constant words, each ANDed with the borrow mask
+ mov rbx, 1503914060200516822
+ mov rbp, 1152921504606846976 ; 2^60
+ and rsi, rcx
+ and rbx, rcx
+ and rbp, rcx
+ add r9, rsi ; applied by addition when the mask is all ones
+ mov rsi, 1152921504606846975 ; 2^60 - 1 (mov leaves the carry flag intact)
+ adc r10, rbx
+ adc r11, 0
+ adc r12, rbp
+ and r12, rsi ; keep only the low 60 bits of the top word
+ ; Store result
+ mov QWORD PTR [r8], r9
+ mov QWORD PTR [r8+8], r10
+ mov QWORD PTR [r8+16], r11
+ mov QWORD PTR [r8+24], r12
+ pop rbp ; restore in reverse order of the pushes
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+sc_reduce_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+sc_muladd_avx2 PROC ; Win64 entry: rcx = result (4 qwords), rdx = A, r8 = B, r9 = c (4 qwords each); computes A * B + c and reduces it with respect to the "order" named below
+ push r12 ; save the non-volatile registers this routine uses as scratch
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov r10, r8 ; r10 = B pointer (third argument)
+ mov r8, rcx ; r8 = result pointer
+ mov r11, r9 ; r11 = c pointer (fourth argument)
+ mov r9, rdx ; r9 = A pointer (second argument); rdx is freed for mulx
+ ; Multiply
+ ; A[0] * B[0]
+ mov rdx, QWORD PTR [r10]
+ mulx r13, r12, QWORD PTR [r9]
+ ; A[2] * B[0]
+ mulx r15, r14, QWORD PTR [r9+16]
+ ; A[1] * B[0]
+ mulx rcx, rax, QWORD PTR [r9+8]
+ xor rbp, rbp ; rbp = 0; also clears CF and OF before the adcx/adox chains
+ adcx r13, rax
+ ; A[3] * B[1]
+ mov rdx, QWORD PTR [r10+8]
+ mulx rsi, rdi, QWORD PTR [r9+24]
+ adcx r14, rcx
+ ; A[0] * B[1]
+ mulx rcx, rax, QWORD PTR [r9]
+ adox r13, rax
+ ; A[2] * B[1]
+ mulx rbx, rax, QWORD PTR [r9+16]
+ adox r14, rcx
+ adcx r15, rax
+ ; A[1] * B[2]
+ mov rdx, QWORD PTR [r10+16]
+ mulx rcx, rax, QWORD PTR [r9+8]
+ adcx rdi, rbx
+ adox r15, rax
+ adcx rsi, rbp
+ adox rdi, rcx
+ ; A[0] * B[2]
+ mulx rcx, rax, QWORD PTR [r9]
+ adox rsi, rbp
+ xor rbx, rbx ; rbx = 0; restarts both flag chains
+ adcx r14, rax
+ ; A[1] * B[1]
+ mov rdx, QWORD PTR [r10+8]
+ mulx rax, rdx, QWORD PTR [r9+8]
+ adcx r15, rcx
+ adox r14, rdx
+ ; A[1] * B[3]
+ mov rdx, QWORD PTR [r10+24]
+ adox r15, rax
+ mulx rcx, rax, QWORD PTR [r9+8]
+ adcx rdi, rax
+ ; A[2] * B[2]
+ mov rdx, QWORD PTR [r10+16]
+ mulx rax, rdx, QWORD PTR [r9+16]
+ adcx rsi, rcx
+ adox rdi, rdx
+ ; A[3] * B[3]
+ mov rdx, QWORD PTR [r10+24]
+ adox rsi, rax
+ mulx rcx, rax, QWORD PTR [r9+24]
+ adox rbx, rbp
+ adcx rbx, rax
+ ; A[0] * B[3]
+ mulx rax, rdx, QWORD PTR [r9]
+ adcx rbp, rcx
+ xor rcx, rcx ; rcx = 0; restarts both flag chains
+ adcx r15, rdx
+ ; A[3] * B[0]
+ mov rdx, QWORD PTR [r9+24]
+ adcx rdi, rax
+ mulx rax, rdx, QWORD PTR [r10]
+ adox r15, rdx
+ adox rdi, rax
+ ; A[3] * B[2]
+ mov rdx, QWORD PTR [r9+24]
+ mulx rax, rdx, QWORD PTR [r10+16]
+ adcx rsi, rdx
+ ; A[2] * B[3]
+ mov rdx, QWORD PTR [r10+24]
+ adcx rbx, rax
+ mulx rdx, rax, QWORD PTR [r9+16]
+ adcx rbp, rcx
+ adox rsi, rax
+ adox rbx, rdx
+ adox rbp, rcx ; product words 0..7 are now in r12, r13, r14, r15, rdi, rsi, rbx, rbp
+ ; Add c to a * b
+ add r12, QWORD PTR [r11]
+ adc r13, QWORD PTR [r11+8]
+ adc r14, QWORD PTR [r11+16]
+ adc r15, QWORD PTR [r11+24]
+ adc rdi, 0 ; ripple the carry through the upper four words
+ adc rsi, 0
+ adc rbx, 0
+ adc rbp, 0
+ mov rax, rbp
+ mov rcx, 1152921504606846975 ; 2^60 - 1
+ shr rax, 56 ; rax = top byte of the 8-word value
+ shld rbp, rbx, 4 ; shift words 3..7 left by 4 bits so rdi..rbp start at bit 252
+ shld rbx, rsi, 4
+ shld rsi, rdi, 4
+ shld rdi, r15, 4
+ and r15, rcx ; r15 keeps only bits 192..251
+ and rbp, rcx ; drop the bits that were captured in rax
+ ; Add order times bits 504..507
+ sub rbx, rax
+ sbb rbp, 0
+ mov rdx, 16942830013509034793 ; multiplier annotated "* -14def9dea2f79cd7" further down
+ mulx rcx, r9, rax
+ mov rdx, 12100500283911187475 ; multiplier annotated "* -5812631a5cf5d3ed" further down
+ add rsi, r9
+ mulx r10, r9, rax
+ adc rcx, 0
+ add rdi, r9
+ adc rsi, r10
+ adc rbx, rcx
+ adc rbp, 0
+ ; Sub product of top 4 words and order
+ mov rdx, 12100500283911187475 ; first multiplier: products with rdi, rbx, rsi, rbp accumulate into r12..r15, r9
+ mulx rax, rcx, rdi
+ add r12, rcx
+ adc r13, rax
+ mulx rax, rcx, rbx
+ adc r14, rcx
+ adc r15, rax
+ mov r9, 0 ; mov (not xor) so the carry flag survives
+ adc r9, 0
+ mulx rax, rcx, rsi
+ add r13, rcx
+ adc r14, rax
+ mulx rax, rcx, rbp
+ adc r15, rcx
+ adc r9, rax
+ mov rdx, 16942830013509034793 ; second multiplier: same pattern one word higher, into r13..r15, r9, r10
+ mulx rax, rcx, rdi
+ add r13, rcx
+ adc r14, rax
+ mulx rax, rcx, rbx
+ adc r15, rcx
+ adc r9, rax
+ mov r10, 0 ; mov (not xor) so the carry flag survives
+ adc r10, 0
+ mulx rax, rcx, rsi
+ add r14, rcx
+ adc r15, rax
+ mulx rax, rcx, rbp
+ adc r9, rcx
+ adc r10, rax
+ sub r14, rdi ; subtract the top words themselves, two words up
+ mov rdi, r9 ; rdi, rsi become words 4 and 5 of the running value
+ sbb r15, rsi
+ mov rsi, r10
+ sbb rdi, rbx
+ sbb rsi, rbp
+ mov rax, rsi
+ sar rax, 57 ; AND mask for the correction below (presumably 0 or all ones - TODO confirm range of rsi)
+ ; Conditionally subtract order starting at bit 125
+ mov r9, 11529215046068469760 ; four constant words, each ANDed with the mask before use
+ mov r10, 14628338529006959229
+ mov r11, 187989257525064602
+ mov rcx, 144115188075855872 ; 2^57
+ and r9, rax
+ and r10, rax
+ and r11, rax
+ and rcx, rax
+ add r13, r9 ; the masked words are applied by addition, starting at word 1
+ adc r14, r10
+ adc r15, r11
+ adc rdi, 0
+ adc rsi, rcx
+ ; Move bits 252-376 to own registers
+ mov rax, 1152921504606846975 ; 2^60 - 1
+ shld rsi, rdi, 4
+ shld rdi, r15, 4
+ and r15, rax
+ ; Sub product of top 2 words and order
+ ; * -5812631a5cf5d3ed
+ mov rdx, 12100500283911187475
+ mulx rax, r11, rdi
+ mov r9, 0 ; r9 collects the overflow of this group
+ add r12, r11
+ adc r13, rax
+ mulx rax, r11, rsi
+ adc r9, 0
+ add r13, r11
+ adc r9, rax
+ ; * -14def9dea2f79cd7
+ mov rdx, 16942830013509034793
+ mulx rax, r11, rdi
+ mov r10, 0 ; r10 collects the overflow of this group
+ add r13, r11
+ adc r14, rax
+ mulx rax, r11, rsi
+ adc r10, 0
+ add r14, r11
+ adc r10, rax
+ ; Add overflows at 2 * 64
+ mov rcx, 1152921504606846975 ; 2^60 - 1
+ and r15, rcx
+ add r14, r9
+ adc r15, r10
+ ; Subtract top at 2 * 64
+ sub r14, rdi
+ sbb r15, rsi
+ sbb rcx, rcx ; rcx = 0 or all ones depending on the final borrow
+ ; Conditional sub order
+ mov r9, 6346243789798364141 ; three constant words, each ANDed with the borrow mask
+ mov r10, 1503914060200516822
+ mov r11, 1152921504606846976 ; 2^60
+ and r9, rcx
+ and r10, rcx
+ and r11, rcx
+ add r12, r9 ; applied by addition when the mask is all ones
+ mov r9, 1152921504606846975 ; 2^60 - 1 (mov leaves the carry flag intact)
+ adc r13, r10
+ adc r14, 0
+ adc r15, r11
+ and r15, r9 ; keep only the low 60 bits of the top word
+ ; Store result
+ mov QWORD PTR [r8], r12
+ mov QWORD PTR [r8+8], r13
+ mov QWORD PTR [r8+16], r14
+ mov QWORD PTR [r8+24], r15
+ pop rbp ; restore in reverse order of the pushes
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+sc_muladd_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_prime DWORD 03ffffedh, 03ffffffh, 03ffffffh, 03ffffffh ; 2^255 - 19 as ten 26-bit limbs; this 32-byte row (even limbs 0,2,4,6,8) is loaded into ymm6
+ DWORD 03ffffffh, 00000000h, 00000000h, 00000000h
+ DWORD 03ffffffh, 03ffffffh, 03ffffffh, 03ffffffh ; second 32-byte row (odd limbs 1,3,5,7,9) is loaded into ymm7
+ DWORD 001fffffh, 00000000h, 00000000h, 00000000h
+ptr_L_fe_invert_nct_avx2_prime QWORD L_fe_invert_nct_avx2_prime ; address of the table, read by fe_invert_nct_avx2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_one QWORD 0000000000000001h, 0000000000000000h ; the value 1 in the lowest lane (ymm8): initial coefficient and bit-0 test mask for vptest
+ QWORD 0000000000000000h, 0000000000000000h
+ptr_L_fe_invert_nct_avx2_one QWORD L_fe_invert_nct_avx2_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_all_one DWORD 00000001h, 00000001h, 00000001h, 00000001h ; bit 0 of every dword (ymm10): selects the low bit of each odd limb when halving
+ DWORD 00000001h, 00000001h, 00000001h, 00000001h
+ptr_L_fe_invert_nct_avx2_all_one QWORD L_fe_invert_nct_avx2_all_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_mask01111 DWORD 00000000h, 00000001h, 00000001h, 00000001h ; bit 0 of dwords 1..4 (ymm9): selects the low bit of even limbs 2..8 when halving
+ DWORD 00000001h, 00000000h, 00000000h, 00000000h
+ptr_L_fe_invert_nct_avx2_mask01111 QWORD L_fe_invert_nct_avx2_mask01111
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_down_one_dword DWORD 00000001h, 00000002h, 00000003h, 00000004h ; vpermd index vector (ymm11): destination dword i takes source dword i+1
+ DWORD 00000005h, 00000006h, 00000007h, 00000007h
+ptr_L_fe_invert_nct_avx2_down_one_dword QWORD L_fe_invert_nct_avx2_down_one_dword
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_neg DWORD 00000000h, 00000000h, 00000000h, 00000000h ; sign bit of dword 4 only (ymm12): vptest mask for the sign of the top limb
+ DWORD 80000000h, 00000000h, 00000000h, 00000000h
+ptr_L_fe_invert_nct_avx2_neg QWORD L_fe_invert_nct_avx2_neg
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_up_one_dword DWORD 00000007h, 00000000h, 00000001h, 00000002h ; index vector loaded into ymm13; fe_invert_nct_avx2 does not reference ymm13 after the load
+ DWORD 00000003h, 00000007h, 00000007h, 00000007h
+ptr_L_fe_invert_nct_avx2_up_one_dword QWORD L_fe_invert_nct_avx2_up_one_dword
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_mask26 DWORD 03ffffffh, 03ffffffh, 03ffffffh, 03ffffffh ; 26-bit mask in dwords 0..4, loaded into ymm14; fe_invert_nct_avx2 does not reference ymm14 after the load
+ DWORD 03ffffffh, 00000000h, 00000000h, 00000000h
+ptr_L_fe_invert_nct_avx2_mask26 QWORD L_fe_invert_nct_avx2_mask26
+_DATA ENDS
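+; /* Non-constant time modular inversion modulo the prime 2^255 - 19.
+; *
+; * @param [out] r Resulting number: four 64-bit words written via rcx.
+; * @param [in] a Number to invert: four 64-bit words read via rdx.
+; * The modulus is not a parameter; 2^255 - 19 is loaded as constants.
+; * No status code is set; rax holds only scratch values on return.
+; */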
+_TEXT SEGMENT READONLY PARA
+fe_invert_nct_avx2 PROC ; Win64 entry: rcx = r (4 qwords out), rdx = a (4 qwords in); binary GCD-style inversion with data-dependent branches
+ push r12 ; save the non-volatile general registers used below
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ sub rsp, 144 ; room for nine 16-byte XMM saves
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6..xmm14 are saved here and restored before returning
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ mov r8, -19 ; u = r11:r10:r9:r8 = 2^255 - 19
+ mov r9, -1
+ mov r10, -1
+ mov r11, 9223372036854775807 ; 2^63 - 1
+ mov r12, QWORD PTR [rdx] ; v = r15:r14:r13:r12 = a
+ mov r13, QWORD PTR [rdx+8]
+ mov r14, QWORD PTR [rdx+16]
+ mov r15, QWORD PTR [rdx+24]
+ mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_prime]
+ vmovupd ymm6, YMMWORD PTR [rbx] ; ymm6 = first 32 bytes of the prime table
+ vmovupd ymm7, YMMWORD PTR [rbx+32] ; ymm7 = second 32 bytes of the prime table
+ mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_one]
+ vmovupd ymm8, YMMWORD PTR [rbx] ; ymm8 = 1 in the lowest lane
+ mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_mask01111]
+ vmovupd ymm9, YMMWORD PTR [rbx]
+ mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_all_one]
+ vmovupd ymm10, YMMWORD PTR [rbx]
+ mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_down_one_dword]
+ vmovupd ymm11, YMMWORD PTR [rbx]
+ mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_neg]
+ vmovupd ymm12, YMMWORD PTR [rbx]
+ mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_up_one_dword]
+ vmovupd ymm13, YMMWORD PTR [rbx] ; ymm13 is not referenced again in this routine
+ mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_mask26]
+ vmovupd ymm14, YMMWORD PTR [rbx] ; ymm14 is not referenced again in this routine
+ vpxor xmm0, xmm0, xmm0 ; ymm1:ymm0 = 0, the coefficient tracked alongside u
+ vpxor xmm1, xmm1, xmm1
+ vmovdqu ymm2, ymm8 ; ymm3:ymm2 = 1, the coefficient tracked alongside v
+ vpxor xmm3, xmm3, xmm3
+ test r12b, 1
+ jnz L_fe_invert_nct_avx2_v_even_end ; v already odd: skip the initial halving loop
+L_fe_invert_nct_avx2_v_even_start:
+ shrd r12, r13, 1 ; v >>= 1
+ shrd r13, r14, 1
+ shrd r14, r15, 1
+ shr r15, 1
+ vptest ymm2, ymm8 ; ZF = 1 when bit 0 of the coefficient's lowest limb is clear
+ jz L_fe_invert_nct_avx2_v_even_shr1
+ vpaddd ymm2, ymm2, ymm6 ; odd coefficient: add the prime so it can be halved
+ vpaddd ymm3, ymm3, ymm7
+L_fe_invert_nct_avx2_v_even_shr1:
+ vpand ymm4, ymm2, ymm9 ; low bits that must move down into the neighbouring limb
+ vpand ymm5, ymm3, ymm10
+ vpermd ymm4, ymm11, ymm4 ; dword i takes dword i+1
+ vpsrad ymm2, ymm2, 1 ; halve every limb (arithmetic shift)
+ vpsrad ymm3, ymm3, 1
+ vpslld ymm5, ymm5, 25 ; a shifted-out bit re-enters at bit 25 of the limb below
+ vpslld xmm4, xmm4, 25
+ vpaddd ymm2, ymm2, ymm5
+ vpaddd ymm3, ymm3, ymm4
+ test r12b, 1
+ jz L_fe_invert_nct_avx2_v_even_start
+L_fe_invert_nct_avx2_v_even_end:
+L_fe_invert_nct_avx2_uv_start:
+ cmp r11, r15 ; compare u and v from the most significant word down
+ jb L_fe_invert_nct_avx2_uv_v
+ ja L_fe_invert_nct_avx2_uv_u
+ cmp r10, r14
+ jb L_fe_invert_nct_avx2_uv_v
+ ja L_fe_invert_nct_avx2_uv_u
+ cmp r9, r13
+ jb L_fe_invert_nct_avx2_uv_v
+ ja L_fe_invert_nct_avx2_uv_u
+ cmp r8, r12
+ jb L_fe_invert_nct_avx2_uv_v ; falls through to the u >= v case
+L_fe_invert_nct_avx2_uv_u:
+ sub r8, r12 ; u -= v
+ sbb r9, r13
+ vpsubd ymm0, ymm0, ymm2 ; coefficient of u -= coefficient of v (limb-wise; vector ops leave the borrow chain intact)
+ sbb r10, r14
+ vpsubd ymm1, ymm1, ymm3
+ sbb r11, r15
+ vptest ymm1, ymm12 ; ZF = 1 when the sign bit of dword 4 is clear
+ jz L_fe_invert_nct_avx2_usubv_done_neg
+ vpaddd ymm0, ymm0, ymm6 ; sign bit set: add the prime
+ vpaddd ymm1, ymm1, ymm7
+L_fe_invert_nct_avx2_usubv_done_neg:
+L_fe_invert_nct_avx2_usubv_shr1:
+ shrd r8, r9, 1 ; u >>= 1
+ shrd r9, r10, 1
+ shrd r10, r11, 1
+ shr r11, 1
+ vptest ymm0, ymm8 ; same halving sequence as in the initial loop, applied to ymm1:ymm0
+ jz L_fe_invert_nct_avx2_usubv_sub_shr1
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm1, ymm7
+L_fe_invert_nct_avx2_usubv_sub_shr1:
+ vpand ymm4, ymm0, ymm9
+ vpand ymm5, ymm1, ymm10
+ vpermd ymm4, ymm11, ymm4
+ vpsrad ymm0, ymm0, 1
+ vpsrad ymm1, ymm1, 1
+ vpslld ymm5, ymm5, 25
+ vpslld xmm4, xmm4, 25
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm4
+ test r8b, 1
+ jz L_fe_invert_nct_avx2_usubv_shr1 ; keep halving while u is even
+ cmp r8, 1 ; finished when u == 1 (low word 1, upper words 0)
+ jne L_fe_invert_nct_avx2_uv_start
+ mov rax, r9
+ or rax, r10
+ jne L_fe_invert_nct_avx2_uv_start
+ or rax, r11
+ jne L_fe_invert_nct_avx2_uv_start
+ vpextrd r8d, xmm0, 0 ; unpack the ten limbs: ymm0 dwords feed r8, r10, r12, r14, edi
+ vpextrd r10d, xmm0, 1
+ vpextrd r12d, xmm0, 2
+ vpextrd r14d, xmm0, 3
+ vpextrd r9d, xmm1, 0 ; ymm1 dwords feed r9, r11, r13, r15, esi
+ vpextrd r11d, xmm1, 1
+ vpextrd r13d, xmm1, 2
+ vpextrd r15d, xmm1, 3
+ vextracti128 xmm0, ymm0, 1 ; bring the upper 128-bit lane down to reach dword 4
+ vextracti128 xmm1, ymm1, 1
+ vpextrd edi, xmm0, 0
+ vpextrd esi, xmm1, 0
+ jmp L_fe_invert_nct_avx2_store_done
+L_fe_invert_nct_avx2_uv_v:
+ sub r12, r8 ; v -= u
+ sbb r13, r9
+ vpsubd ymm2, ymm2, ymm0 ; coefficient of v -= coefficient of u
+ sbb r14, r10
+ vpsubd ymm3, ymm3, ymm1
+ sbb r15, r11
+ vptest ymm3, ymm12 ; ZF = 1 when the sign bit of dword 4 is clear
+ jz L_fe_invert_nct_avx2_vsubu_done_neg
+ vpaddd ymm2, ymm2, ymm6 ; sign bit set: add the prime
+ vpaddd ymm3, ymm3, ymm7
+L_fe_invert_nct_avx2_vsubu_done_neg:
+L_fe_invert_nct_avx2_vsubu_shr1:
+ shrd r12, r13, 1 ; v >>= 1
+ shrd r13, r14, 1
+ shrd r14, r15, 1
+ shr r15, 1
+ vptest ymm2, ymm8 ; same halving sequence, applied to ymm3:ymm2
+ jz L_fe_invert_nct_avx2_vsubu_sub_shr1
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+L_fe_invert_nct_avx2_vsubu_sub_shr1:
+ vpand ymm4, ymm2, ymm9
+ vpand ymm5, ymm3, ymm10
+ vpermd ymm4, ymm11, ymm4
+ vpsrad ymm2, ymm2, 1
+ vpsrad ymm3, ymm3, 1
+ vpslld ymm5, ymm5, 25
+ vpslld xmm4, xmm4, 25
+ vpaddd ymm2, ymm2, ymm5
+ vpaddd ymm3, ymm3, ymm4
+ test r12b, 1
+ jz L_fe_invert_nct_avx2_vsubu_shr1 ; keep halving while v is even
+ cmp r12, 1 ; finished when v == 1 (low word 1, upper words 0)
+ jne L_fe_invert_nct_avx2_uv_start
+ mov rax, r13
+ or rax, r14
+ jne L_fe_invert_nct_avx2_uv_start
+ or rax, r15
+ jne L_fe_invert_nct_avx2_uv_start
+ vpextrd r8d, xmm2, 0 ; unpack the ten limbs of ymm3:ymm2 in the same layout as above
+ vpextrd r10d, xmm2, 1
+ vpextrd r12d, xmm2, 2
+ vpextrd r14d, xmm2, 3
+ vpextrd r9d, xmm3, 0
+ vpextrd r11d, xmm3, 1
+ vpextrd r13d, xmm3, 2
+ vpextrd r15d, xmm3, 3
+ vextracti128 xmm2, ymm2, 1
+ vextracti128 xmm3, ymm3, 1
+ vpextrd edi, xmm2, 0
+ vpextrd esi, xmm3, 0
+L_fe_invert_nct_avx2_store_done:
+ mov eax, r8d ; signed carry propagation through the limbs: r8d -> r9d -> ... -> edi -> esi
+ and r8d, 67108863 ; 2^26 - 1: keep the low 26 bits of the limb
+ sar eax, 26 ; signed carry into the next limb
+ add r9d, eax
+ mov eax, r9d
+ and r9d, 67108863
+ sar eax, 26
+ add r10d, eax
+ mov eax, r10d
+ and r10d, 67108863
+ sar eax, 26
+ add r11d, eax
+ mov eax, r11d
+ and r11d, 67108863
+ sar eax, 26
+ add r12d, eax
+ mov eax, r12d
+ and r12d, 67108863
+ sar eax, 26
+ add r13d, eax
+ mov eax, r13d
+ and r13d, 67108863
+ sar eax, 26
+ add r14d, eax
+ mov eax, r14d
+ and r14d, 67108863
+ sar eax, 26
+ add r15d, eax
+ mov eax, r15d
+ and r15d, 67108863
+ sar eax, 26
+ add edi, eax
+ mov eax, edi
+ and edi, 67108863
+ sar eax, 26
+ add esi, eax ; the top limb keeps its sign
+ movsxd r9, r9d ; sign-extend the odd limbs and move them up by 26 bits
+ movsxd r11, r11d
+ movsxd r13, r13d
+ movsxd r15, r15d
+ movsxd rsi, esi
+ shl r9, 26
+ shl r11, 26
+ shl r13, 26
+ shl r15, 26
+ shl rsi, 26
+ movsxd r8, r8d ; merge limb pairs: r8, r10, r12, r14, rdi become five 52-bit limbs
+ add r8, r9
+ movsxd r10, r10d
+ adc r10, r11
+ movsxd r12, r12d
+ adc r12, r13
+ movsxd r14, r14d
+ adc r14, r15
+ movsxd rdi, edi
+ adc rdi, rsi
+ jge L_fe_invert_nct_avx2_uv_start_no_add_prime ; skip the correction unless the last adc left SF != OF (negative top limb)
+ mov r9, 4503599627370477 ; 2^52 - 19: lowest 52-bit limb of the prime
+ mov r11, 4503599627370495 ; 2^52 - 1
+ mov r13, 4503599627370495
+ mov r15, 4503599627370495
+ mov rsi, 140737488355327 ; 2^47 - 1: top 52-bit limb of the prime
+ add r8, r9 ; add the prime limb by limb
+ add r10, r11
+ add r12, r13
+ add r14, r15
+ add rdi, rsi
+ mov rax, 4503599627370495 ; 52-bit mask for the carry propagation below
+ mov r9, r8
+ and r8, rax
+ sar r9, 52
+ add r10, r9
+ mov r11, r10
+ and r10, rax
+ sar r11, 52
+ add r12, r11
+ mov r13, r12
+ and r12, rax
+ sar r13, 52
+ add r14, r13
+ mov r15, r14
+ and r14, rax
+ sar r15, 52
+ add rdi, r15
+L_fe_invert_nct_avx2_uv_start_no_add_prime:
+ mov r9, r10 ; repack the five 52-bit limbs into four 64-bit words
+ mov r11, r12
+ mov r13, r14
+ shl r9, 52
+ sar r10, 12
+ shl r11, 40
+ sar r12, 24
+ shl r13, 28
+ sar r14, 36
+ shl rdi, 16
+ add r8, r9
+ adc r10, r11
+ adc r12, r13
+ adc r14, rdi
+ mov QWORD PTR [rcx], r8 ; store the result through the first argument
+ mov QWORD PTR [rcx+8], r10
+ mov QWORD PTR [rcx+16], r12
+ mov QWORD PTR [rcx+24], r14
+ vzeroupper ; clear the upper YMM halves before returning
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the saved XMM registers
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ add rsp, 144
+ pop rbx ; restore in reverse order of the pushes
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_invert_nct_avx2 ENDP
+_TEXT ENDS
+ENDIF
+ENDIF
+END
diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am
index 908c43984cd..dba03978c44 100644
--- a/wolfcrypt/src/include.am
+++ b/wolfcrypt/src/include.am
@@ -18,9 +18,16 @@ EXTRA_DIST += wolfcrypt/src/asm.c
EXTRA_DIST += wolfcrypt/src/aes_asm.asm
EXTRA_DIST += wolfcrypt/src/aes_x86_64_asm.asm
EXTRA_DIST += wolfcrypt/src/aes_gcm_asm.asm
+EXTRA_DIST += wolfcrypt/src/aes_gcm_x86_asm.asm
EXTRA_DIST += wolfcrypt/src/aes_xts_asm.asm
EXTRA_DIST += wolfcrypt/src/chacha_asm.asm
EXTRA_DIST += wolfcrypt/src/poly1305_asm.asm
+EXTRA_DIST += wolfcrypt/src/fe_x25519_asm.asm
+EXTRA_DIST += wolfcrypt/src/sha256_asm.asm
+EXTRA_DIST += wolfcrypt/src/sha512_asm.asm
+EXTRA_DIST += wolfcrypt/src/sha3_asm.asm
+EXTRA_DIST += wolfcrypt/src/wc_mlkem_asm.asm
+EXTRA_DIST += wolfcrypt/src/wc_mldsa_asm.asm
EXTRA_DIST += wolfcrypt/src/wc_dsp.c
EXTRA_DIST += wolfcrypt/src/sp_dsp32.c
EXTRA_DIST += wolfcrypt/src/sp_x86_64_asm.asm
diff --git a/wolfcrypt/src/poly1305_asm.asm b/wolfcrypt/src/poly1305_asm.asm
index 95c3764acac..ae34937a184 100644
--- a/wolfcrypt/src/poly1305_asm.asm
+++ b/wolfcrypt/src/poly1305_asm.asm
@@ -598,16 +598,14 @@ poly1305_setkey_avx2 ENDP
_TEXT ENDS
_DATA SEGMENT
ALIGN 16
-L_poly1305_avx2_blocks_mask QWORD \
- 0000000003ffffffh, 0000000003ffffffh,
- 0000000003ffffffh, 0000000003ffffffh
+L_poly1305_avx2_blocks_mask QWORD 0000000003ffffffh, 0000000003ffffffh
+ QWORD 0000000003ffffffh, 0000000003ffffffh
ptr_L_poly1305_avx2_blocks_mask QWORD L_poly1305_avx2_blocks_mask
_DATA ENDS
_DATA SEGMENT
ALIGN 16
-L_poly1305_avx2_blocks_hibit QWORD \
- 0000000001000000h, 0000000001000000h,
- 0000000001000000h, 0000000001000000h
+L_poly1305_avx2_blocks_hibit QWORD 0000000001000000h, 0000000001000000h
+ QWORD 0000000001000000h, 0000000001000000h
ptr_L_poly1305_avx2_blocks_hibit QWORD L_poly1305_avx2_blocks_hibit
_DATA ENDS
_TEXT SEGMENT READONLY PARA
@@ -736,15 +734,15 @@ L_poly1305_avx2_blocks_mul_5:
vpaddq ymm12, ymm8, ymm12
vpaddq ymm13, ymm9, ymm13
; Store powers of r and multiple of 5 for use in multiply.
- vmovdqa YMMWORD PTR [rbx], ymm10
- vmovdqa YMMWORD PTR [rbx+32], ymm11
- vmovdqa YMMWORD PTR [rbx+64], ymm12
- vmovdqa YMMWORD PTR [rbx+96], ymm13
- vmovdqa YMMWORD PTR [rcx], ymm5
- vmovdqa YMMWORD PTR [rcx+32], ymm6
- vmovdqa YMMWORD PTR [rcx+64], ymm7
- vmovdqa YMMWORD PTR [rcx+96], ymm8
- vmovdqa YMMWORD PTR [rcx+128], ymm9
+ vmovdqu YMMWORD PTR [rbx], ymm10
+ vmovdqu YMMWORD PTR [rbx+32], ymm11
+ vmovdqu YMMWORD PTR [rbx+64], ymm12
+ vmovdqu YMMWORD PTR [rbx+96], ymm13
+ vmovdqu YMMWORD PTR [rcx], ymm5
+ vmovdqu YMMWORD PTR [rcx+32], ymm6
+ vmovdqu YMMWORD PTR [rcx+64], ymm7
+ vmovdqu YMMWORD PTR [rcx+96], ymm8
+ vmovdqu YMMWORD PTR [rcx+128], ymm9
vmovdqu ymm14, YMMWORD PTR [r13]
; If not finished then loop over data
cmp BYTE PTR [rdi+616], 1
diff --git a/wolfcrypt/src/sha256_asm.asm b/wolfcrypt/src/sha256_asm.asm
new file mode 100644
index 00000000000..bd4aa0deb8f
--- /dev/null
+++ b/wolfcrypt/src/sha256_asm.asm
@@ -0,0 +1,23463 @@
+; /* sha256_asm.asm */
+; /*
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
+
+IF @Version LT 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT
+HAVE_INTEL_AVX2 = 1
+ENDIF
+
+IFNDEF _WIN64
+_WIN64 = 1
+ENDIF
+
+IFDEF WOLFSSL_X86_64_BUILD
+_DATA SEGMENT
+ALIGN 16
+L_sse2_sha256_sha_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h ; 64 dword round constants, read four at a time (16 bytes per 4 rounds) via rax
+ DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h
+ DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h
+ DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h
+ DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch
+ DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah
+ DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h
+ DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h
+ DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h
+ DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h
+ DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h
+ DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h
+ DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h
+ DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h
+ DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h
+ DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h
+ptr_L_sse2_sha256_sha_k QWORD L_sse2_sha256_sha_k ; address of the table, loaded into rax by the transform routines
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_sse2_sha256_shuf_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh ; pshufb control that reverses the byte order inside each dword
+ptr_L_sse2_sha256_shuf_mask QWORD L_sse2_sha256_shuf_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_SSE2_Sha PROC ; Win64 entry: rcx = state (eight dwords at offset 0, updated in place), rdx = one 64-byte block; returns 0 in rax
+ sub rsp, 80 ; room for five 16-byte XMM saves
+ movdqu OWORD PTR [rsp], xmm6 ; legacy-SSE movdqu (not VEX vmovdqu): this routine otherwise uses no AVX encodings, and VEX forms fault on CPUs without AVX
+ movdqu OWORD PTR [rsp+16], xmm7
+ movdqu OWORD PTR [rsp+32], xmm8
+ movdqu OWORD PTR [rsp+48], xmm9
+ movdqu OWORD PTR [rsp+64], xmm10
+ mov rax, QWORD PTR [ptr_L_sse2_sha256_sha_k] ; rax = round-constant table
+ movdqa xmm10, OWORD PTR L_sse2_sha256_shuf_mask ; xmm10 = byte-swap control for pshufb
+ movq xmm1, QWORD PTR [rcx] ; xmm1 = state dwords 0,1 (low) and 4,5 (high)
+ movq xmm2, QWORD PTR [rcx+8] ; xmm2 = state dwords 2,3 (low) and 6,7 (high)
+ movhpd xmm1, QWORD PTR [rcx+16]
+ movhpd xmm2, QWORD PTR [rcx+24]
+ pshufd xmm1, xmm1, 27 ; 27 = 00011011b: reverse the dword order
+ pshufd xmm2, xmm2, 27
+ movdqu xmm3, OWORD PTR [rdx] ; xmm3..xmm6 = the four 16-byte quarters of the block
+ movdqu xmm4, OWORD PTR [rdx+16]
+ movdqu xmm5, OWORD PTR [rdx+32]
+ movdqu xmm6, OWORD PTR [rdx+48]
+ pshufb xmm3, xmm10 ; byte-swap each message dword
+ movdqa xmm8, xmm1 ; keep the incoming state for the final addition
+ movdqa xmm9, xmm2
+ ; Rounds: 0-3
+ movdqa xmm0, xmm3
+ paddd xmm0, OWORD PTR [rax] ; xmm0 = message dwords + round constants
+ sha256rnds2 xmm2, xmm1, xmm0 ; two rounds using the low two dwords of xmm0
+ pshufd xmm0, xmm0, 14 ; move the high two dwords of xmm0 down
+ sha256rnds2 xmm1, xmm2, xmm0 ; two more rounds
+ ; Rounds: 4-7
+ pshufb xmm4, xmm10
+ movdqa xmm0, xmm4
+ paddd xmm0, OWORD PTR [rax+16]
+ sha256rnds2 xmm2, xmm1, xmm0
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4 ; start the message schedule for a later group
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 8-11
+ pshufb xmm5, xmm10
+ movdqa xmm0, xmm5
+ paddd xmm0, OWORD PTR [rax+32]
+ sha256rnds2 xmm2, xmm1, xmm0
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 12-15
+ pshufb xmm6, xmm10
+ movdqa xmm0, xmm6
+ paddd xmm0, OWORD PTR [rax+48]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm6 ; xmm7 = dwords straddling the two newest schedule vectors
+ palignr xmm7, xmm5, 4
+ paddd xmm3, xmm7
+ sha256msg2 xmm3, xmm6 ; finish the next four schedule dwords in xmm3
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 16-19
+ movdqa xmm0, xmm3 ; from here each group repeats the same pattern with xmm3..xmm6 rotating roles
+ paddd xmm0, OWORD PTR [rax+64]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm6, 4
+ paddd xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 20-23
+ movdqa xmm0, xmm4
+ paddd xmm0, OWORD PTR [rax+80]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm4
+ palignr xmm7, xmm3, 4
+ paddd xmm5, xmm7
+ sha256msg2 xmm5, xmm4
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 24-27
+ movdqa xmm0, xmm5
+ paddd xmm0, OWORD PTR [rax+96]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm5
+ palignr xmm7, xmm4, 4
+ paddd xmm6, xmm7
+ sha256msg2 xmm6, xmm5
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 28-31
+ movdqa xmm0, xmm6
+ paddd xmm0, OWORD PTR [rax+112]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm6
+ palignr xmm7, xmm5, 4
+ paddd xmm3, xmm7
+ sha256msg2 xmm3, xmm6
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 32-35
+ movdqa xmm0, xmm3
+ paddd xmm0, OWORD PTR [rax+128]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm6, 4
+ paddd xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 36-39
+ movdqa xmm0, xmm4
+ paddd xmm0, OWORD PTR [rax+144]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm4
+ palignr xmm7, xmm3, 4
+ paddd xmm5, xmm7
+ sha256msg2 xmm5, xmm4
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 40-43
+ movdqa xmm0, xmm5
+ paddd xmm0, OWORD PTR [rax+160]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm5
+ palignr xmm7, xmm4, 4
+ paddd xmm6, xmm7
+ sha256msg2 xmm6, xmm5
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 44-47
+ movdqa xmm0, xmm6
+ paddd xmm0, OWORD PTR [rax+176]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm6
+ palignr xmm7, xmm5, 4
+ paddd xmm3, xmm7
+ sha256msg2 xmm3, xmm6
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 48-51
+ movdqa xmm0, xmm3
+ paddd xmm0, OWORD PTR [rax+192]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm6, 4
+ paddd xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 52-63
+ movdqa xmm0, xmm4 ; final groups: no further sha256msg1, only the schedule steps still needed
+ paddd xmm0, OWORD PTR [rax+208]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm4
+ palignr xmm7, xmm3, 4
+ paddd xmm5, xmm7
+ sha256msg2 xmm5, xmm4
+ pshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ movdqa xmm0, xmm5
+ paddd xmm0, OWORD PTR [rax+224]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm5
+ palignr xmm7, xmm4, 4
+ paddd xmm6, xmm7
+ sha256msg2 xmm6, xmm5
+ pshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ movdqa xmm0, xmm6
+ paddd xmm0, OWORD PTR [rax+240]
+ sha256rnds2 xmm2, xmm1, xmm0
+ pshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ paddd xmm1, xmm8 ; add the state saved before round 0
+ paddd xmm2, xmm9
+ pshufd xmm1, xmm1, 27 ; undo the dword reversal applied on load
+ pshufd xmm2, xmm2, 27
+ movq QWORD PTR [rcx], xmm1 ; write the state back in the layout it was read in
+ movq QWORD PTR [rcx+8], xmm2
+ movhpd QWORD PTR [rcx+16], xmm1
+ movhpd QWORD PTR [rcx+24], xmm2
+ xor rax, rax ; return value 0
+ movdqu xmm6, OWORD PTR [rsp] ; restore the saved XMM registers (legacy-SSE encoding, matching the saves)
+ movdqu xmm7, OWORD PTR [rsp+16]
+ movdqu xmm8, OWORD PTR [rsp+32]
+ movdqu xmm9, OWORD PTR [rsp+48]
+ movdqu xmm10, OWORD PTR [rsp+64]
+ add rsp, 80
+ ret
+Transform_Sha256_SSE2_Sha ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; Transform_Sha256_SSE2_Sha_Len
+;
+; Compresses one or more 64-byte blocks into a SHA-256 state using the SHA
+; extension instructions (sha256rnds2 / sha256msg1 / sha256msg2) together
+; with legacy (non-VEX) SSE instructions only.
+;
+; Arguments as read by the code (Windows x64 register arguments):
+;   rcx - pointer to the eight 32-bit state words; read on entry and
+;         written back on exit
+;   rdx - pointer to the input data; advanced by 64 per block
+;   r8d - byte count; decremented by 64 per block. The loop exits only when
+;         the count reaches exactly zero, so it must be a non-zero multiple
+;         of 64.
+; Returns: rax = 0.
+;
+; xmm6-xmm10 are callee-saved in the Windows x64 calling convention, so they
+; are spilled to the stack and restored before returning. The spills use
+; movdqu rather than VEX-encoded vmovdqu so that this routine never executes
+; an AVX instruction: a processor (or OS configuration) without AVX raises
+; #UD on VEX encodings, and this is the non-AVX SHA path.
+Transform_Sha256_SSE2_Sha_Len PROC
+ sub rsp, 80
+ movdqu OWORD PTR [rsp], xmm6
+ movdqu OWORD PTR [rsp+16], xmm7
+ movdqu OWORD PTR [rsp+32], xmm8
+ movdqu OWORD PTR [rsp+48], xmm9
+ movdqu OWORD PTR [rsp+64], xmm10
+ ; rax = address of the round-constant table, xmm10 = byte shuffle mask
+ ; applied to every 16 bytes of message data (both defined elsewhere in
+ ; this file).
+ mov rax, QWORD PTR [ptr_L_sse2_sha256_sha_k]
+ movdqa xmm10, OWORD PTR L_sse2_sha256_shuf_mask
+ ; Gather state words 0,1,4,5 into xmm1 and 2,3,6,7 into xmm2, then
+ ; reverse the dword order of each register (pshufd imm 27 = 0x1b).
+ movq xmm1, QWORD PTR [rcx]
+ movq xmm2, QWORD PTR [rcx+8]
+ movhpd xmm1, QWORD PTR [rcx+16]
+ movhpd xmm2, QWORD PTR [rcx+24]
+ pshufd xmm1, xmm1, 27
+ pshufd xmm2, xmm2, 27
+ ; Start of loop processing a block
+L_sha256_sha_len_sse2_start:
+ movdqu xmm3, OWORD PTR [rdx]
+ movdqu xmm4, OWORD PTR [rdx+16]
+ movdqu xmm5, OWORD PTR [rdx+32]
+ movdqu xmm6, OWORD PTR [rdx+48]
+ pshufb xmm3, xmm10
+ ; Keep a copy of the incoming state for the feed-forward add at the end
+ ; of the block.
+ movdqa xmm8, xmm1
+ movdqa xmm9, xmm2
+ ; Each 4-round group: xmm0 = message words + round constants;
+ ; sha256rnds2 consumes the low two dwords of xmm0 (implicit operand),
+ ; pshufd imm 14 moves the high two dwords down for the next two rounds.
+ ; Rounds: 0-3
+ movdqa xmm0, xmm3
+ paddd xmm0, OWORD PTR [rax]
+ sha256rnds2 xmm2, xmm1, xmm0
+ pshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 4-7
+ pshufb xmm4, xmm10
+ movdqa xmm0, xmm4
+ paddd xmm0, OWORD PTR [rax+16]
+ sha256rnds2 xmm2, xmm1, xmm0
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 8-11
+ pshufb xmm5, xmm10
+ movdqa xmm0, xmm5
+ paddd xmm0, OWORD PTR [rax+32]
+ sha256rnds2 xmm2, xmm1, xmm0
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; From here on the message schedule is extended in place: xmm3..xmm6
+ ; rotate roles, xmm7 is scratch for the palignr-ed words added before
+ ; sha256msg2.
+ ; Rounds: 12-15
+ pshufb xmm6, xmm10
+ movdqa xmm0, xmm6
+ paddd xmm0, OWORD PTR [rax+48]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm6
+ palignr xmm7, xmm5, 4
+ paddd xmm3, xmm7
+ sha256msg2 xmm3, xmm6
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 16-19
+ movdqa xmm0, xmm3
+ paddd xmm0, OWORD PTR [rax+64]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm6, 4
+ paddd xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 20-23
+ movdqa xmm0, xmm4
+ paddd xmm0, OWORD PTR [rax+80]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm4
+ palignr xmm7, xmm3, 4
+ paddd xmm5, xmm7
+ sha256msg2 xmm5, xmm4
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 24-27
+ movdqa xmm0, xmm5
+ paddd xmm0, OWORD PTR [rax+96]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm5
+ palignr xmm7, xmm4, 4
+ paddd xmm6, xmm7
+ sha256msg2 xmm6, xmm5
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 28-31
+ movdqa xmm0, xmm6
+ paddd xmm0, OWORD PTR [rax+112]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm6
+ palignr xmm7, xmm5, 4
+ paddd xmm3, xmm7
+ sha256msg2 xmm3, xmm6
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 32-35
+ movdqa xmm0, xmm3
+ paddd xmm0, OWORD PTR [rax+128]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm6, 4
+ paddd xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 36-39
+ movdqa xmm0, xmm4
+ paddd xmm0, OWORD PTR [rax+144]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm4
+ palignr xmm7, xmm3, 4
+ paddd xmm5, xmm7
+ sha256msg2 xmm5, xmm4
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 40-43
+ movdqa xmm0, xmm5
+ paddd xmm0, OWORD PTR [rax+160]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm5
+ palignr xmm7, xmm4, 4
+ paddd xmm6, xmm7
+ sha256msg2 xmm6, xmm5
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 44-47
+ movdqa xmm0, xmm6
+ paddd xmm0, OWORD PTR [rax+176]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm6
+ palignr xmm7, xmm5, 4
+ paddd xmm3, xmm7
+ sha256msg2 xmm3, xmm6
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 48-51
+ movdqa xmm0, xmm3
+ paddd xmm0, OWORD PTR [rax+192]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm6, 4
+ paddd xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ pshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 52-63
+ movdqa xmm0, xmm4
+ paddd xmm0, OWORD PTR [rax+208]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm4
+ palignr xmm7, xmm3, 4
+ paddd xmm5, xmm7
+ sha256msg2 xmm5, xmm4
+ pshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ movdqa xmm0, xmm5
+ paddd xmm0, OWORD PTR [rax+224]
+ sha256rnds2 xmm2, xmm1, xmm0
+ movdqa xmm7, xmm5
+ palignr xmm7, xmm4, 4
+ paddd xmm6, xmm7
+ sha256msg2 xmm6, xmm5
+ pshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ movdqa xmm0, xmm6
+ paddd xmm0, OWORD PTR [rax+240]
+ sha256rnds2 xmm2, xmm1, xmm0
+ pshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Advance to the next block. The sub sets ZF for the jnz below; the two
+ ; paddd instructions in between do not modify flags.
+ add rdx, 64
+ sub r8d, 64
+ ; Feed-forward: add the state saved at the top of the block.
+ paddd xmm1, xmm8
+ paddd xmm2, xmm9
+ jnz L_sha256_sha_len_sse2_start
+ ; Undo the dword reversal and scatter the words back to their original
+ ; positions in the state array.
+ pshufd xmm1, xmm1, 27
+ pshufd xmm2, xmm2, 27
+ movq QWORD PTR [rcx], xmm1
+ movq QWORD PTR [rcx+8], xmm2
+ movhpd QWORD PTR [rcx+16], xmm1
+ movhpd QWORD PTR [rcx+24], xmm2
+ ; Return 0.
+ xor rax, rax
+ movdqu xmm6, OWORD PTR [rsp]
+ movdqu xmm7, OWORD PTR [rsp+16]
+ movdqu xmm8, OWORD PTR [rsp+32]
+ movdqu xmm9, OWORD PTR [rsp+48]
+ movdqu xmm10, OWORD PTR [rsp+64]
+ add rsp, 80
+ ret
+Transform_Sha256_SSE2_Sha_Len ENDP
+_TEXT ENDS
+IFDEF HAVE_INTEL_AVX1
+; Constant data for the AVX1 SHA-256 transform.
+;
+; L_avx1_sha256_k: the 64 SHA-256 round constants K[0..63], four 32-bit
+; words per row, 16-byte aligned. ptr_L_avx1_sha256_k holds the table's
+; address; Transform_Sha256_AVX1 loads it into rbp and adds the constants to
+; the message words 16 bytes at a time.
+_DATA SEGMENT
+ALIGN 16
+L_avx1_sha256_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h
+ DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h
+ DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h
+ DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h
+ DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch
+ DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah
+ DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h
+ DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h
+ DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h
+ DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h
+ DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h
+ DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h
+ DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h
+ DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h
+ DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h
+ DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h
+ptr_L_avx1_sha256_k QWORD L_avx1_sha256_k
+_DATA ENDS
+; vpshufb control: result dwords 0 and 1 take source dwords 0 and 2; the
+; upper eight bytes are zeroed (control bytes of 0ffh have the high bit set).
+_DATA SEGMENT
+ALIGN 16
+L_avx1_sha256_shuf_00BA QWORD 0b0a090803020100h, 0ffffffffffffffffh
+ptr_L_avx1_sha256_shuf_00BA QWORD L_avx1_sha256_shuf_00BA
+_DATA ENDS
+; vpshufb control: result dwords 2 and 3 take source dwords 0 and 2; the
+; lower eight bytes are zeroed.
+_DATA SEGMENT
+ALIGN 16
+L_avx1_sha256_shuf_DC00 QWORD 0ffffffffffffffffh, 0b0a090803020100h
+ptr_L_avx1_sha256_shuf_DC00 QWORD L_avx1_sha256_shuf_DC00
+_DATA ENDS
+; vpshufb control that reverses the byte order inside each 32-bit word;
+; applied to the input block as it is loaded.
+_DATA SEGMENT
+ALIGN 16
+L_avx1_sha256_flip_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh
+ptr_L_avx1_sha256_flip_mask QWORD L_avx1_sha256_flip_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_AVX1 PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbp
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+ sub rsp, 192
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ mov rbp, QWORD PTR [ptr_L_avx1_sha256_k]
+ vmovdqa xmm13, OWORD PTR L_avx1_sha256_flip_mask
+ vmovdqa xmm11, OWORD PTR L_avx1_sha256_shuf_00BA
+ vmovdqa xmm12, OWORD PTR L_avx1_sha256_shuf_DC00
+ mov r8d, DWORD PTR [rdi]
+ mov r9d, DWORD PTR [rdi+4]
+ mov r10d, DWORD PTR [rdi+8]
+ mov r11d, DWORD PTR [rdi+12]
+ mov r12d, DWORD PTR [rdi+16]
+ mov r13d, DWORD PTR [rdi+20]
+ mov r14d, DWORD PTR [rdi+24]
+ mov r15d, DWORD PTR [rdi+28]
+ ; X0, X1, X2, X3 = W[0..15]
+ vmovdqu xmm0, OWORD PTR [rsi]
+ vmovdqu xmm1, OWORD PTR [rsi+16]
+ vpshufb xmm0, xmm0, xmm13
+ vpshufb xmm1, xmm1, xmm13
+ vmovdqu xmm2, OWORD PTR [rsi+32]
+ vmovdqu xmm3, OWORD PTR [rsi+48]
+ vpshufb xmm2, xmm2, xmm13
+ vpshufb xmm3, xmm3, xmm13
+ mov ebx, r9d
+ mov edx, r12d
+ xor ebx, r10d
+ ; set_w_k_xfer_4: 0
+ vpaddd xmm4, xmm0, OWORD PTR [rbp]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+16]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+32]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+48]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm1, xmm0, 4
+ vpalignr xmm4, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+4]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+8]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+12]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm2, xmm1, 4
+ vpalignr xmm4, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+16]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+20]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+24]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+28]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm3, xmm2, 4
+ vpalignr xmm4, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+32]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+36]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+40]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+44]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm0, xmm3, 4
+ vpalignr xmm4, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+48]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+52]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+56]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+60]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 4
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+64]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+80]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+96]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+112]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm1, xmm0, 4
+ vpalignr xmm4, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+4]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+8]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+12]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm2, xmm1, 4
+ vpalignr xmm4, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+16]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+20]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+24]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+28]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm3, xmm2, 4
+ vpalignr xmm4, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+32]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+36]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+40]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+44]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm0, xmm3, 4
+ vpalignr xmm4, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+48]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+52]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+56]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+60]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 8
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+128]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+144]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+160]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+176]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm1, xmm0, 4
+ vpalignr xmm4, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+4]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+8]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+12]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm2, xmm1, 4
+ vpalignr xmm4, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+16]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+20]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+24]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+28]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm3, xmm2, 4
+ vpalignr xmm4, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+32]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+36]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+40]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+44]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm0, xmm3, 4
+ vpalignr xmm4, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+48]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+52]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+56]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+60]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 12
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+192]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+208]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+224]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+240]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; rnd_all_4: 0-3
+ add r15d, DWORD PTR [rsp]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+4]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+8]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+12]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 1-4
+ add r11d, DWORD PTR [rsp+16]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+20]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+24]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+28]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ ; rnd_all_4: 2-5
+ add r15d, DWORD PTR [rsp+32]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+36]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+40]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+44]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 3-6
+ add r11d, DWORD PTR [rsp+48]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+52]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+56]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+60]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ add DWORD PTR [rdi], r8d
+ add DWORD PTR [rdi+4], r9d
+ add DWORD PTR [rdi+8], r10d
+ add DWORD PTR [rdi+12], r11d
+ add DWORD PTR [rdi+16], r12d
+ add DWORD PTR [rdi+20], r13d
+ add DWORD PTR [rdi+24], r14d
+ add DWORD PTR [rdi+28], r15d
+ xor rax, rax
+ vmovdqu xmm6, OWORD PTR [rsp+64]
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ add rsp, 192
+ pop rsi
+ pop rdi
+ pop rbp
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha256_AVX1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_AVX1_Len PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rbp, r8
+ sub rsp, 196
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ mov DWORD PTR [rsp+64], ebp
+ mov rbp, QWORD PTR [ptr_L_avx1_sha256_k]
+ vmovdqa xmm13, OWORD PTR L_avx1_sha256_flip_mask
+ vmovdqa xmm11, OWORD PTR L_avx1_sha256_shuf_00BA
+ vmovdqa xmm12, OWORD PTR L_avx1_sha256_shuf_DC00
+ mov r8d, DWORD PTR [rdi]
+ mov r9d, DWORD PTR [rdi+4]
+ mov r10d, DWORD PTR [rdi+8]
+ mov r11d, DWORD PTR [rdi+12]
+ mov r12d, DWORD PTR [rdi+16]
+ mov r13d, DWORD PTR [rdi+20]
+ mov r14d, DWORD PTR [rdi+24]
+ mov r15d, DWORD PTR [rdi+28]
+ ; Start of loop processing a block
+L_sha256_len_avx1_start:
+ ; X0, X1, X2, X3 = W[0..15]
+ vmovdqu xmm0, OWORD PTR [rsi]
+ vmovdqu xmm1, OWORD PTR [rsi+16]
+ vpshufb xmm0, xmm0, xmm13
+ vpshufb xmm1, xmm1, xmm13
+ vmovdqu xmm2, OWORD PTR [rsi+32]
+ vmovdqu xmm3, OWORD PTR [rsi+48]
+ vpshufb xmm2, xmm2, xmm13
+ vpshufb xmm3, xmm3, xmm13
+ mov ebx, r9d
+ mov edx, r12d
+ xor ebx, r10d
+ ; set_w_k_xfer_4: 0
+ vpaddd xmm4, xmm0, OWORD PTR [rbp]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+16]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+32]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+48]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm1, xmm0, 4
+ vpalignr xmm4, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+4]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+8]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+12]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm2, xmm1, 4
+ vpalignr xmm4, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+16]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+20]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+24]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+28]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm3, xmm2, 4
+ vpalignr xmm4, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+32]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+36]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+40]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+44]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm0, xmm3, 4
+ vpalignr xmm4, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+48]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+52]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+56]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+60]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 4
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+64]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+80]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+96]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+112]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm1, xmm0, 4
+ vpalignr xmm4, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+4]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+8]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+12]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm2, xmm1, 4
+ vpalignr xmm4, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+16]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+20]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+24]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+28]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm3, xmm2, 4
+ vpalignr xmm4, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+32]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+36]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+40]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+44]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm0, xmm3, 4
+ vpalignr xmm4, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+48]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+52]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+56]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+60]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 8
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+128]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+144]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+160]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+176]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm1, xmm0, 4
+ vpalignr xmm4, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+4]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+8]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+12]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm2, xmm1, 4
+ vpalignr xmm4, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+16]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+20]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+24]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+28]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm3, xmm2, 4
+ vpalignr xmm4, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+32]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+36]
+ xor ecx, r13d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+40]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+44]
+ xor ecx, r11d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr xmm5, xmm0, xmm3, 4
+ vpalignr xmm4, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+48]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld xmm8, xmm5, 18
+ vpslld xmm9, xmm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor xmm6, xmm7, xmm6
+ vpor xmm8, xmm9, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+52]
+ xor ecx, r9d
+ vpsrld xmm9, xmm5, 3
+ vpxor xmm6, xmm8, xmm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor xmm5, xmm9, xmm6
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld xmm8, xmm6, 10
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+56]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor xmm6, xmm7, xmm6
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+60]
+ xor ecx, r15d
+ vpsrlq xmm8, xmm6, 17
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld xmm9, xmm6, 10
+ vpxor xmm8, xmm7, xmm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor xmm9, xmm8, xmm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 12
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+192]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+208]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+224]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+240]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; rnd_all_4: 0-3
+ add r15d, DWORD PTR [rsp]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+4]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+8]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+12]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 1-4
+ add r11d, DWORD PTR [rsp+16]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+20]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+24]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+28]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ ; rnd_all_4: 2-5
+ add r15d, DWORD PTR [rsp+32]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+36]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+40]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+44]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 3-6
+ add r11d, DWORD PTR [rsp+48]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+52]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+56]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+60]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ add r8d, DWORD PTR [rdi]
+ add r9d, DWORD PTR [rdi+4]
+ add r10d, DWORD PTR [rdi+8]
+ add r11d, DWORD PTR [rdi+12]
+ add r12d, DWORD PTR [rdi+16]
+ add r13d, DWORD PTR [rdi+20]
+ add r14d, DWORD PTR [rdi+24]
+ add r15d, DWORD PTR [rdi+28]
+ add rsi, 64
+ sub DWORD PTR [rsp+64], 64
+ mov DWORD PTR [rdi], r8d
+ mov DWORD PTR [rdi+4], r9d
+ mov DWORD PTR [rdi+8], r10d
+ mov DWORD PTR [rdi+12], r11d
+ mov DWORD PTR [rdi+16], r12d
+ mov DWORD PTR [rdi+20], r13d
+ mov DWORD PTR [rdi+24], r14d
+ mov DWORD PTR [rdi+28], r15d
+ jnz L_sha256_len_avx1_start
+ xor rax, rax
+ vmovdqu xmm6, OWORD PTR [rsp+64]
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ add rsp, 196
+ pop rbp
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha256_AVX1_Len ENDP
+_TEXT ENDS
+_DATA SEGMENT ; SHA-256 round-constant table used by Transform_Sha256_AVX1_RORX
+ALIGN 16 ; keeps every 4-constant row on an OWORD boundary
+L_avx1_rorx_sha256_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h ; K[0..3]; 64 DWORDs total, matching the SHA-256 K constants
+ DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h ; K[4..7]
+ DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h ; K[8..11]
+ DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h ; K[12..15]
+ DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch ; K[16..19]
+ DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah ; K[20..23]
+ DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h ; K[24..27]
+ DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h ; K[28..31]
+ DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h ; K[32..35]
+ DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h ; K[36..39]
+ DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h ; K[40..43]
+ DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h ; K[44..47]
+ DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h ; K[48..51]
+ DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h ; K[52..55]
+ DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h ; K[56..59]
+ DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h ; K[60..63]
+ptr_L_avx1_rorx_sha256_k QWORD L_avx1_rorx_sha256_k ; address of the table; the transform loads this into rbp and adds rows via OWORD PTR [rbp+N]
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control mask loaded into xmm11 by Transform_Sha256_AVX1_RORX
+ALIGN 16 ; the mask is read with vmovdqa, which needs a 16-byte-aligned operand
+L_avx1_rorx_sha256_shuf_00BA QWORD 0b0a090803020100h, 0ffffffffffffffffh ; low qword <- source bytes 0-3 and 8-11 (dwords 0 and 2); 0ffh indices zero the high qword
+ptr_L_avx1_rorx_sha256_shuf_00BA QWORD L_avx1_rorx_sha256_shuf_00BA ; QWORD holding the mask's address
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control mask loaded into xmm12 by Transform_Sha256_AVX1_RORX
+ALIGN 16 ; the mask is read with vmovdqa, which needs a 16-byte-aligned operand
+L_avx1_rorx_sha256_shuf_DC00 QWORD 0ffffffffffffffffh, 0b0a090803020100h ; 0ffh indices zero the low qword; high qword <- source bytes 0-3 and 8-11 (dwords 0 and 2)
+ptr_L_avx1_rorx_sha256_shuf_DC00 QWORD L_avx1_rorx_sha256_shuf_DC00 ; QWORD holding the mask's address
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control mask loaded into xmm13 by Transform_Sha256_AVX1_RORX
+ALIGN 16 ; the mask is read with vmovdqa, which needs a 16-byte-aligned operand
+L_avx1_rorx_sha256_flip_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh ; reverses the byte order inside each of the four dwords; applied to the message block loaded from [rsi]
+ptr_L_avx1_rorx_sha256_flip_mask QWORD L_avx1_rorx_sha256_flip_mask ; QWORD holding the mask's address
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_AVX1_RORX PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbp
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+ sub rsp, 192
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ mov rbp, QWORD PTR [ptr_L_avx1_rorx_sha256_k]
+ vmovdqa xmm13, OWORD PTR L_avx1_rorx_sha256_flip_mask
+ vmovdqa xmm11, OWORD PTR L_avx1_rorx_sha256_shuf_00BA
+ vmovdqa xmm12, OWORD PTR L_avx1_rorx_sha256_shuf_DC00
+ ; X0, X1, X2, X3 = W[0..15]
+ vmovdqu xmm0, OWORD PTR [rsi]
+ vmovdqu xmm1, OWORD PTR [rsi+16]
+ vpshufb xmm0, xmm0, xmm13
+ vpshufb xmm1, xmm1, xmm13
+ vmovdqu xmm2, OWORD PTR [rsi+32]
+ vmovdqu xmm3, OWORD PTR [rsi+48]
+ vpshufb xmm2, xmm2, xmm13
+ vpshufb xmm3, xmm3, xmm13
+ mov r8d, DWORD PTR [rdi]
+ mov r9d, DWORD PTR [rdi+4]
+ mov r10d, DWORD PTR [rdi+8]
+ mov r11d, DWORD PTR [rdi+12]
+ mov r12d, DWORD PTR [rdi+16]
+ mov r13d, DWORD PTR [rdi+20]
+ mov r14d, DWORD PTR [rdi+24]
+ mov r15d, DWORD PTR [rdi+28]
+ ; set_w_k_xfer_4: 0
+ vpaddd xmm4, xmm0, OWORD PTR [rbp]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+16]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+32]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+48]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ mov ebx, r9d
+ rorx edx, r12d, 6
+ xor ebx, r10d
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp]
+ vpalignr xmm4, xmm3, xmm2, 4
+ vpalignr xmm5, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+4]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+8]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+12]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+16]
+ vpalignr xmm4, xmm0, xmm3, 4
+ vpalignr xmm5, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+20]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+24]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+28]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+32]
+ vpalignr xmm4, xmm1, xmm0, 4
+ vpalignr xmm5, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+36]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+40]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+44]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+48]
+ vpalignr xmm4, xmm2, xmm1, 4
+ vpalignr xmm5, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+52]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+56]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+60]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 4
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+64]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+80]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+96]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+112]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp]
+ vpalignr xmm4, xmm3, xmm2, 4
+ vpalignr xmm5, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+4]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+8]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+12]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+16]
+ vpalignr xmm4, xmm0, xmm3, 4
+ vpalignr xmm5, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+20]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+24]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+28]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+32]
+ vpalignr xmm4, xmm1, xmm0, 4
+ vpalignr xmm5, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+36]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+40]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+44]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+48]
+ vpalignr xmm4, xmm2, xmm1, 4
+ vpalignr xmm5, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+52]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+56]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+60]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 8
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+128]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+144]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+160]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+176]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp]
+ vpalignr xmm4, xmm3, xmm2, 4
+ vpalignr xmm5, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+4]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+8]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+12]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+16]
+ vpalignr xmm4, xmm0, xmm3, 4
+ vpalignr xmm5, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+20]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+24]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+28]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+32]
+ vpalignr xmm4, xmm1, xmm0, 4
+ vpalignr xmm5, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+36]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+40]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+44]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+48]
+ vpalignr xmm4, xmm2, xmm1, 4
+ vpalignr xmm5, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+52]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+56]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+60]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 12
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+192]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+208]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+224]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+240]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ xor eax, eax
+ ; rnd_all_4: 0-3
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ add r8d, eax
+ add r15d, DWORD PTR [rsp]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+4]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ add r14d, eax
+ add r13d, DWORD PTR [rsp+8]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+12]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ ; rnd_all_4: 1-4
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ add r12d, eax
+ add r11d, DWORD PTR [rsp+16]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+20]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ add r10d, eax
+ add r9d, DWORD PTR [rsp+24]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+28]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ ; rnd_all_4: 2-5
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ add r8d, eax
+ add r15d, DWORD PTR [rsp+32]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+36]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ add r14d, eax
+ add r13d, DWORD PTR [rsp+40]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+44]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ ; rnd_all_4: 3-6
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ add r12d, eax
+ add r11d, DWORD PTR [rsp+48]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+52]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ add r10d, eax
+ add r9d, DWORD PTR [rsp+56]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+60]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ add r8d, eax
+ add DWORD PTR [rdi], r8d
+ add DWORD PTR [rdi+4], r9d
+ add DWORD PTR [rdi+8], r10d
+ add DWORD PTR [rdi+12], r11d
+ add DWORD PTR [rdi+16], r12d
+ add DWORD PTR [rdi+20], r13d
+ add DWORD PTR [rdi+24], r14d
+ add DWORD PTR [rdi+28], r15d
+ xor rax, rax
+ vmovdqu xmm6, OWORD PTR [rsp+64]
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ add rsp, 192
+ pop rsi
+ pop rdi
+ pop rbp
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha256_AVX1_RORX ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_AVX1_RORX_Len PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rbp, r8
+ sub rsp, 196
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ mov DWORD PTR [rsp+64], ebp
+ mov rbp, QWORD PTR [ptr_L_avx1_rorx_sha256_k]
+ vmovdqa xmm13, OWORD PTR L_avx1_rorx_sha256_flip_mask
+ vmovdqa xmm11, OWORD PTR L_avx1_rorx_sha256_shuf_00BA
+ vmovdqa xmm12, OWORD PTR L_avx1_rorx_sha256_shuf_DC00
+ mov r8d, DWORD PTR [rdi]
+ mov r9d, DWORD PTR [rdi+4]
+ mov r10d, DWORD PTR [rdi+8]
+ mov r11d, DWORD PTR [rdi+12]
+ mov r12d, DWORD PTR [rdi+16]
+ mov r13d, DWORD PTR [rdi+20]
+ mov r14d, DWORD PTR [rdi+24]
+ mov r15d, DWORD PTR [rdi+28]
+ ; Start of loop processing a block
+L_sha256_len_avx1_len_rorx_start:
+ ; X0, X1, X2, X3 = W[0..15]
+ vmovdqu xmm0, OWORD PTR [rsi]
+ vmovdqu xmm1, OWORD PTR [rsi+16]
+ vpshufb xmm0, xmm0, xmm13
+ vpshufb xmm1, xmm1, xmm13
+ vmovdqu xmm2, OWORD PTR [rsi+32]
+ vmovdqu xmm3, OWORD PTR [rsi+48]
+ vpshufb xmm2, xmm2, xmm13
+ vpshufb xmm3, xmm3, xmm13
+ ; set_w_k_xfer_4: 0
+ vpaddd xmm4, xmm0, OWORD PTR [rbp]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+16]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+32]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+48]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ mov ebx, r9d
+ rorx edx, r12d, 6
+ xor ebx, r10d
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp]
+ vpalignr xmm4, xmm3, xmm2, 4
+ vpalignr xmm5, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+4]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+8]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+12]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+16]
+ vpalignr xmm4, xmm0, xmm3, 4
+ vpalignr xmm5, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+20]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+24]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+28]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+32]
+ vpalignr xmm4, xmm1, xmm0, 4
+ vpalignr xmm5, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+36]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+40]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+44]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+48]
+ vpalignr xmm4, xmm2, xmm1, 4
+ vpalignr xmm5, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+52]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+56]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+60]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 4
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+64]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+80]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+96]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+112]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp]
+ vpalignr xmm4, xmm3, xmm2, 4
+ vpalignr xmm5, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+4]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+8]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+12]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+16]
+ vpalignr xmm4, xmm0, xmm3, 4
+ vpalignr xmm5, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+20]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+24]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+28]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+32]
+ vpalignr xmm4, xmm1, xmm0, 4
+ vpalignr xmm5, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+36]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+40]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+44]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+48]
+ vpalignr xmm4, xmm2, xmm1, 4
+ vpalignr xmm5, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+52]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+56]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+60]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 8
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+128]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+144]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+160]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+176]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp]
+ vpalignr xmm4, xmm3, xmm2, 4
+ vpalignr xmm5, xmm1, xmm0, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+4]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm3, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+8]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm0
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+12]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm0, xmm9, xmm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 4-7
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+16]
+ vpalignr xmm4, xmm0, xmm3, 4
+ vpalignr xmm5, xmm2, xmm1, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+20]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm0, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+24]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm1
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+28]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm1, xmm9, xmm4
+ ; msg_sched done: 4-7
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+32]
+ vpalignr xmm4, xmm1, xmm0, 4
+ vpalignr xmm5, xmm3, xmm2, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+36]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpshufd xmm6, xmm1, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+40]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm2
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+44]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vpaddd xmm2, xmm9, xmm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 12-15
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+48]
+ vpalignr xmm4, xmm2, xmm1, 4
+ vpalignr xmm5, xmm0, xmm3, 4
+ ; rnd_0: 1 - 2
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld xmm6, xmm5, 7
+ vpslld xmm7, xmm5, 25
+ ; rnd_0: 3 - 4
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld xmm8, xmm5, 3
+ vpor xmm7, xmm7, xmm6
+ ; rnd_0: 5 - 7
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+52]
+ vpsrld xmm6, xmm5, 18
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpslld xmm5, xmm5, 14
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpxor xmm7, xmm7, xmm5
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor xmm7, xmm7, xmm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpshufd xmm6, xmm2, 250
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ vpxor xmm5, xmm7, xmm8
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrld xmm8, xmm6, 10
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+56]
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpsrlq xmm6, xmm6, 17
+ vpaddd xmm4, xmm4, xmm3
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd xmm4, xmm4, xmm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpxor xmm8, xmm8, xmm6
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufb xmm8, xmm8, xmm11
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpaddd xmm4, xmm4, xmm8
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+60]
+ vpshufd xmm6, xmm4, 80
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpsrld xmm9, xmm6, 10
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpsrlq xmm7, xmm6, 19
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpsrlq xmm6, xmm6, 17
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpxor xmm6, xmm6, xmm7
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ vpxor xmm9, xmm9, xmm6
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ vpshufb xmm9, xmm9, xmm12
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vpaddd xmm3, xmm9, xmm4
+ ; msg_sched done: 12-15
+ ; set_w_k_xfer_4: 12
+ vpaddd xmm4, xmm0, OWORD PTR [rbp+192]
+ vpaddd xmm5, xmm1, OWORD PTR [rbp+208]
+ vmovdqu OWORD PTR [rsp], xmm4
+ vmovdqu OWORD PTR [rsp+16], xmm5
+ vpaddd xmm6, xmm2, OWORD PTR [rbp+224]
+ vpaddd xmm7, xmm3, OWORD PTR [rbp+240]
+ vmovdqu OWORD PTR [rsp+32], xmm6
+ vmovdqu OWORD PTR [rsp+48], xmm7
+ xor eax, eax
+ xor ecx, ecx
+ ; rnd_all_4: 0-3
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ add r8d, eax
+ add r15d, DWORD PTR [rsp]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+4]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ add r14d, eax
+ add r13d, DWORD PTR [rsp+8]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+12]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ ; rnd_all_4: 1-4
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ add r12d, eax
+ add r11d, DWORD PTR [rsp+16]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+20]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ add r10d, eax
+ add r9d, DWORD PTR [rsp+24]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+28]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ ; rnd_all_4: 2-5
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ add r8d, eax
+ add r15d, DWORD PTR [rsp+32]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+36]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ add r10d, r14d
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ add r14d, eax
+ add r13d, DWORD PTR [rsp+40]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+44]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ add r8d, r12d
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ ; rnd_all_4: 3-6
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ add r12d, eax
+ add r11d, DWORD PTR [rsp+48]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+52]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ add r14d, r10d
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ add r10d, eax
+ add r9d, DWORD PTR [rsp+56]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+60]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ add r12d, r8d
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ add r8d, eax
+ add r8d, DWORD PTR [rdi]
+ add r9d, DWORD PTR [rdi+4]
+ add r10d, DWORD PTR [rdi+8]
+ add r11d, DWORD PTR [rdi+12]
+ add r12d, DWORD PTR [rdi+16]
+ add r13d, DWORD PTR [rdi+20]
+ add r14d, DWORD PTR [rdi+24]
+ add r15d, DWORD PTR [rdi+28]
+ add rsi, 64
+ sub DWORD PTR [rsp+64], 64
+ mov DWORD PTR [rdi], r8d
+ mov DWORD PTR [rdi+4], r9d
+ mov DWORD PTR [rdi+8], r10d
+ mov DWORD PTR [rdi+12], r11d
+ mov DWORD PTR [rdi+16], r12d
+ mov DWORD PTR [rdi+20], r13d
+ mov DWORD PTR [rdi+24], r14d
+ mov DWORD PTR [rdi+28], r15d
+ jnz L_sha256_len_avx1_len_rorx_start
+ xor rax, rax
+ vmovdqu xmm6, OWORD PTR [rsp+64]
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ add rsp, 196
+ pop rbp
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha256_AVX1_RORX_Len ENDP
+_TEXT ENDS
+_DATA SEGMENT  ; SHA-256 round-constant table read by the SHA-extension transforms via rax
+ALIGN 16  ; start the table on a 16-byte boundary so each 4-constant row is one aligned OWORD
+L_avx1_sha256_sha_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h  ; K[0..3]; the 64 SHA-256 round constants follow in round order
+ DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h  ; K[4..7]
+ DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h  ; K[8..11]
+ DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h  ; K[12..15]
+ DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch  ; K[16..19]
+ DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah  ; K[20..23]
+ DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h  ; K[24..27]
+ DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h  ; K[28..31]
+ DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h  ; K[32..35]
+ DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h  ; K[36..39]
+ DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h  ; K[40..43]
+ DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h  ; K[44..47]
+ DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h  ; K[48..51]
+ DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h  ; K[52..55]
+ DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h  ; K[56..59]
+ DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h  ; K[60..63]
+ptr_L_avx1_sha256_sha_k QWORD L_avx1_sha256_sha_k  ; address of the table; the transforms load it with mov rax, QWORD PTR [ptr_L_avx1_sha256_sha_k]
+_DATA ENDS
+_DATA SEGMENT  ; vpshufb control mask used to byte-swap message words
+ALIGN 16  ; needed: the mask is fetched with vmovdqa, which faults on an unaligned address
+L_avx1_sha256_shuf_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh  ; memory bytes 03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c: reverses the bytes inside each 32-bit lane
+ptr_L_avx1_sha256_shuf_mask QWORD L_avx1_sha256_shuf_mask  ; address of the mask; the SHA-extension transforms nearby reference the label directly instead
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA  ; Transform_Sha256_AVX1_Sha: SHA-256 compression of exactly one 64-byte block using sha256rnds2/sha256msg1/sha256msg2
+Transform_Sha256_AVX1_Sha PROC  ; in: rcx -> 32 bytes read and rewritten as the eight 32-bit state words (presumably the digest at the start of the caller's SHA-256 context - confirm against the C prototype), rdx -> 64-byte message block; out: rax = 0
+ sub rsp, 80  ; 5 x 16 bytes of scratch for the XMM registers saved below
+ vmovdqu OWORD PTR [rsp], xmm6  ; xmm6-xmm10 are nonvolatile in the Microsoft x64 ABI, so preserve them
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ mov rax, QWORD PTR [ptr_L_avx1_sha256_sha_k]  ; rax = base of the round-constant table K[0..63]
+ vmovdqa xmm10, OWORD PTR L_avx1_sha256_shuf_mask  ; xmm10 = per-dword byte-swap mask (aligned load)
+ vmovq xmm1, QWORD PTR [rcx]  ; xmm1 low qword  = state words 0,1 (a,b)
+ vmovq xmm2, QWORD PTR [rcx+8]  ; xmm2 low qword  = state words 2,3 (c,d)
+ vmovhpd xmm1, xmm1, QWORD PTR [rcx+16]  ; xmm1 high qword = state words 4,5 (e,f)
+ vmovhpd xmm2, xmm2, QWORD PTR [rcx+24]  ; xmm2 high qword = state words 6,7 (g,h)
+ vpshufd xmm1, xmm1, 27  ; reverse dword order: xmm1 = a:b:e:f (high to low), the layout sha256rnds2 takes as its source
+ vpshufd xmm2, xmm2, 27  ; reverse dword order: xmm2 = c:d:g:h (high to low), the layout sha256rnds2 takes as its destination
+ vmovdqu xmm3, OWORD PTR [rdx]  ; message bytes 0..15
+ vmovdqu xmm4, OWORD PTR [rdx+16]  ; message bytes 16..31
+ vmovdqu xmm5, OWORD PTR [rdx+32]  ; message bytes 32..47
+ vmovdqu xmm6, OWORD PTR [rdx+48]  ; message bytes 48..63
+ vpshufb xmm3, xmm3, xmm10  ; byte-swap each dword: xmm3 = W[0..3]
+ vmovdqa xmm8, xmm1  ; keep the incoming a:b:e:f for the final feed-forward add
+ vmovdqa xmm9, xmm2  ; keep the incoming c:d:g:h for the final feed-forward add
+ ; Rounds: 0-3 - xmm0 = W[0..3] + K[0..3]; each sha256rnds2 performs two rounds using the low qword of xmm0
+ vpaddd xmm0, xmm3, OWORD PTR [rax]
+ sha256rnds2 xmm2, xmm1, xmm0  ; rounds 0-1; updated a:b:e:f lands in xmm2
+ vpshufd xmm0, xmm0, 14  ; bring the upper two W+K sums down into the low qword
+ sha256rnds2 xmm1, xmm2, xmm0  ; rounds 2-3 with the register roles swapped; updated a:b:e:f lands in xmm1
+ ; Rounds: 4-7 - consume W[4..7] (xmm4); sha256msg1 starts W[16..19] in xmm3
+ vpshufb xmm4, xmm4, xmm10  ; byte-swap: xmm4 = W[4..7]
+ vpaddd xmm0, xmm4, OWORD PTR [rax+16]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 8-11 - consume W[8..11] (xmm5); sha256msg1 starts W[20..23] in xmm4
+ vpshufb xmm5, xmm5, xmm10  ; byte-swap: xmm5 = W[8..11]
+ vpaddd xmm0, xmm5, OWORD PTR [rax+32]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 12-15 - consume W[12..15] (xmm6); finish W[16..19] in xmm3; sha256msg1 starts W[24..27] in xmm5
+ vpshufb xmm6, xmm6, xmm10  ; byte-swap: xmm6 = W[12..15]
+ vpaddd xmm0, xmm6, OWORD PTR [rax+48]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm6, xmm5, 4  ; xmm7 = W[9..12], the W[t-7] terms for t = 16..19
+ vpaddd xmm3, xmm3, xmm7  ; add the W[t-7] terms to the partial schedule from sha256msg1
+ sha256msg2 xmm3, xmm6  ; complete the schedule step: xmm3 = W[16..19]
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 16-19 - consume W[16..19] (xmm3); finish W[20..23] in xmm4; start W[28..31] in xmm6
+ vpaddd xmm0, xmm3, OWORD PTR [rax+64]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm3, xmm6, 4
+ vpaddd xmm4, xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 20-23 - consume W[20..23] (xmm4); finish W[24..27] in xmm5; start W[32..35] in xmm3
+ vpaddd xmm0, xmm4, OWORD PTR [rax+80]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm4, xmm3, 4
+ vpaddd xmm5, xmm5, xmm7
+ sha256msg2 xmm5, xmm4
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 24-27 - consume W[24..27] (xmm5); finish W[28..31] in xmm6; start W[36..39] in xmm4
+ vpaddd xmm0, xmm5, OWORD PTR [rax+96]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm5, xmm4, 4
+ vpaddd xmm6, xmm6, xmm7
+ sha256msg2 xmm6, xmm5
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 28-31 - consume W[28..31] (xmm6); finish W[32..35] in xmm3; start W[40..43] in xmm5
+ vpaddd xmm0, xmm6, OWORD PTR [rax+112]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm6, xmm5, 4
+ vpaddd xmm3, xmm3, xmm7
+ sha256msg2 xmm3, xmm6
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 32-35 - consume W[32..35] (xmm3); finish W[36..39] in xmm4; start W[44..47] in xmm6
+ vpaddd xmm0, xmm3, OWORD PTR [rax+128]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm3, xmm6, 4
+ vpaddd xmm4, xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 36-39 - consume W[36..39] (xmm4); finish W[40..43] in xmm5; start W[48..51] in xmm3
+ vpaddd xmm0, xmm4, OWORD PTR [rax+144]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm4, xmm3, 4
+ vpaddd xmm5, xmm5, xmm7
+ sha256msg2 xmm5, xmm4
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 40-43 - consume W[40..43] (xmm5); finish W[44..47] in xmm6; start W[52..55] in xmm4
+ vpaddd xmm0, xmm5, OWORD PTR [rax+160]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm5, xmm4, 4
+ vpaddd xmm6, xmm6, xmm7
+ sha256msg2 xmm6, xmm5
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 44-47 - consume W[44..47] (xmm6); finish W[48..51] in xmm3; start W[56..59] in xmm5
+ vpaddd xmm0, xmm6, OWORD PTR [rax+176]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm6, xmm5, 4
+ vpaddd xmm3, xmm3, xmm7
+ sha256msg2 xmm3, xmm6
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 48-51 - consume W[48..51] (xmm3); finish W[52..55] in xmm4; start W[60..63] in xmm6
+ vpaddd xmm0, xmm3, OWORD PTR [rax+192]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm3, xmm6, 4
+ vpaddd xmm4, xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 52-63 - three groups of four; no sha256msg1 here because no schedule words past W[63] are needed
+ vpaddd xmm0, xmm4, OWORD PTR [rax+208]  ; rounds 52-55: W[52..55] + K[52..55]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm4, xmm3, 4
+ vpaddd xmm5, xmm5, xmm7
+ sha256msg2 xmm5, xmm4  ; xmm5 = W[56..59]
+ vpshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ vpaddd xmm0, xmm5, OWORD PTR [rax+224]  ; rounds 56-59: W[56..59] + K[56..59]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm5, xmm4, 4
+ vpaddd xmm6, xmm6, xmm7
+ sha256msg2 xmm6, xmm5  ; xmm6 = W[60..63]
+ vpshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ vpaddd xmm0, xmm6, OWORD PTR [rax+240]  ; rounds 60-63: W[60..63] + K[60..63]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ vpaddd xmm1, xmm1, xmm8  ; feed-forward: add the state saved before round 0 (a:b:e:f half)
+ vpaddd xmm2, xmm2, xmm9  ; feed-forward: add the state saved before round 0 (c:d:g:h half)
+ vpshufd xmm1, xmm1, 27  ; undo the dword reversal applied after loading
+ vpshufd xmm2, xmm2, 27
+ vmovq QWORD PTR [rcx], xmm1  ; store state words 0,1
+ vmovq QWORD PTR [rcx+8], xmm2  ; store state words 2,3
+ vmovhpd QWORD PTR [rcx+16], xmm1  ; store state words 4,5
+ vmovhpd QWORD PTR [rcx+24], xmm2  ; store state words 6,7
+ xor rax, rax  ; return value 0
+ vmovdqu xmm6, OWORD PTR [rsp]  ; restore the nonvolatile XMM registers saved on entry
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ add rsp, 80  ; release the save area
+ ret
+Transform_Sha256_AVX1_Sha ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_AVX1_Sha_Len PROC
+ sub rsp, 80
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ mov rax, QWORD PTR [ptr_L_avx1_sha256_sha_k]
+ vmovdqa xmm10, OWORD PTR L_avx1_sha256_shuf_mask
+ vmovq xmm1, QWORD PTR [rcx]
+ vmovq xmm2, QWORD PTR [rcx+8]
+ vmovhpd xmm1, xmm1, QWORD PTR [rcx+16]
+ vmovhpd xmm2, xmm2, QWORD PTR [rcx+24]
+ vpshufd xmm1, xmm1, 27
+ vpshufd xmm2, xmm2, 27
+ ; Start of loop processing a block
+L_sha256_sha_len_avx1_start:
+ vmovdqu xmm3, OWORD PTR [rdx]
+ vmovdqu xmm4, OWORD PTR [rdx+16]
+ vmovdqu xmm5, OWORD PTR [rdx+32]
+ vmovdqu xmm6, OWORD PTR [rdx+48]
+ vpshufb xmm3, xmm3, xmm10
+ vmovdqa xmm8, xmm1
+ vmovdqa xmm9, xmm2
+ ; Rounds: 0-3
+ vpaddd xmm0, xmm3, OWORD PTR [rax]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 4-7
+ vpshufb xmm4, xmm4, xmm10
+ vpaddd xmm0, xmm4, OWORD PTR [rax+16]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 8-11
+ vpshufb xmm5, xmm5, xmm10
+ vpaddd xmm0, xmm5, OWORD PTR [rax+32]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 12-15
+ vpshufb xmm6, xmm6, xmm10
+ vpaddd xmm0, xmm6, OWORD PTR [rax+48]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm6, xmm5, 4
+ vpaddd xmm3, xmm3, xmm7
+ sha256msg2 xmm3, xmm6
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 16-19
+ vpaddd xmm0, xmm3, OWORD PTR [rax+64]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm3, xmm6, 4
+ vpaddd xmm4, xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 20-23
+ vpaddd xmm0, xmm4, OWORD PTR [rax+80]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm4, xmm3, 4
+ vpaddd xmm5, xmm5, xmm7
+ sha256msg2 xmm5, xmm4
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 24-27
+ vpaddd xmm0, xmm5, OWORD PTR [rax+96]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm5, xmm4, 4
+ vpaddd xmm6, xmm6, xmm7
+ sha256msg2 xmm6, xmm5
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 28-31
+ vpaddd xmm0, xmm6, OWORD PTR [rax+112]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm6, xmm5, 4
+ vpaddd xmm3, xmm3, xmm7
+ sha256msg2 xmm3, xmm6
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 32-35
+ vpaddd xmm0, xmm3, OWORD PTR [rax+128]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm3, xmm6, 4
+ vpaddd xmm4, xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 36-39
+ vpaddd xmm0, xmm4, OWORD PTR [rax+144]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm4, xmm3, 4
+ vpaddd xmm5, xmm5, xmm7
+ sha256msg2 xmm5, xmm4
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm3, xmm4
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 40-43
+ vpaddd xmm0, xmm5, OWORD PTR [rax+160]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm5, xmm4, 4
+ vpaddd xmm6, xmm6, xmm7
+ sha256msg2 xmm6, xmm5
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm4, xmm5
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 44-47
+ vpaddd xmm0, xmm6, OWORD PTR [rax+176]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm6, xmm5, 4
+ vpaddd xmm3, xmm3, xmm7
+ sha256msg2 xmm3, xmm6
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm5, xmm6
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 48-51
+ vpaddd xmm0, xmm3, OWORD PTR [rax+192]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm3, xmm6, 4
+ vpaddd xmm4, xmm4, xmm7
+ sha256msg2 xmm4, xmm3
+ vpshufd xmm0, xmm0, 14
+ sha256msg1 xmm6, xmm3
+ sha256rnds2 xmm1, xmm2, xmm0
+ ; Rounds: 52-63
+ vpaddd xmm0, xmm4, OWORD PTR [rax+208]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm4, xmm3, 4
+ vpaddd xmm5, xmm5, xmm7
+ sha256msg2 xmm5, xmm4
+ vpshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ vpaddd xmm0, xmm5, OWORD PTR [rax+224]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpalignr xmm7, xmm5, xmm4, 4
+ vpaddd xmm6, xmm6, xmm7
+ sha256msg2 xmm6, xmm5
+ vpshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ vpaddd xmm0, xmm6, OWORD PTR [rax+240]
+ sha256rnds2 xmm2, xmm1, xmm0
+ vpshufd xmm0, xmm0, 14
+ sha256rnds2 xmm1, xmm2, xmm0
+ add rdx, 64
+ sub r8d, 64
+ vpaddd xmm1, xmm1, xmm8
+ vpaddd xmm2, xmm2, xmm9
+ jnz L_sha256_sha_len_avx1_start
+ vpshufd xmm1, xmm1, 27
+ vpshufd xmm2, xmm2, 27
+ vmovq QWORD PTR [rcx], xmm1
+ vmovq QWORD PTR [rcx+8], xmm2
+ vmovhpd QWORD PTR [rcx+16], xmm1
+ vmovhpd QWORD PTR [rcx+24], xmm2
+ xor rax, rax
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ add rsp, 80
+ ret
+Transform_Sha256_AVX1_Sha_Len ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX2
+_DATA SEGMENT ; SHA-256 round constants K[0..63] for the AVX2 code; every 16-byte row is stored twice
+ALIGN 16
+L_avx2_sha256_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h ; K[0..3]
+ DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h ; K[0..3] repeated, so a 32-byte load sees the same constants in both 128-bit lanes
+ DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h ; K[4..7]
+ DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h ; K[4..7] repeated
+ DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h ; K[8..11]
+ DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h ; K[8..11] repeated
+ DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h ; K[12..15]
+ DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h ; K[12..15] repeated
+ DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch ; K[16..19]
+ DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch ; K[16..19] repeated
+ DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah ; K[20..23]
+ DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah ; K[20..23] repeated
+ DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h ; K[24..27]
+ DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h ; K[24..27] repeated
+ DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h ; K[28..31]
+ DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h ; K[28..31] repeated
+ DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h ; K[32..35]
+ DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h ; K[32..35] repeated
+ DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h ; K[36..39]
+ DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h ; K[36..39] repeated
+ DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h ; K[40..43]
+ DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h ; K[40..43] repeated
+ DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h ; K[44..47]
+ DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h ; K[44..47] repeated
+ DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h ; K[48..51]
+ DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h ; K[48..51] repeated
+ DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h ; K[52..55]
+ DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h ; K[52..55] repeated
+ DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h ; K[56..59]
+ DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h ; K[56..59] repeated
+ DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h ; K[60..63]
+ DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h ; K[60..63] repeated
+ptr_L_avx2_sha256_k QWORD L_avx2_sha256_k ; address of the table above; Transform_Sha256_AVX2 loads it into rbp and indexes it in 32-byte steps
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control mask "00BA": packs source dwords 0 and 2 into the low 8 bytes and zeroes the high 8 bytes
+ALIGN 16
+L_avx2_sha256_shuf_00BA QWORD 0b0a090803020100h, 0ffffffffffffffffh ; low qword selects source bytes 0-3 and 8-11; 0ffh index bytes make vpshufb write zero
+ QWORD 0b0a090803020100h, 0ffffffffffffffffh ; same mask again for the upper 128-bit lane (vpshufb shuffles each lane independently)
+ptr_L_avx2_sha256_shuf_00BA QWORD L_avx2_sha256_shuf_00BA ; address of the mask above
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control mask "DC00": zeroes the low 8 bytes and packs source dwords 0 and 2 into the high 8 bytes
+ALIGN 16
+L_avx2_sha256_shuf_DC00 QWORD 0ffffffffffffffffh, 0b0a090803020100h ; 0ffh index bytes make vpshufb write zero; high qword selects source bytes 0-3 and 8-11
+ QWORD 0ffffffffffffffffh, 0b0a090803020100h ; same mask again for the upper 128-bit lane (vpshufb shuffles each lane independently)
+ptr_L_avx2_sha256_shuf_DC00 QWORD L_avx2_sha256_shuf_DC00 ; address of the mask above
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control mask that reverses the byte order inside every 32-bit dword; applied to message words right after they are loaded
+ALIGN 16
+L_avx2_sha256_flip_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh ; destination bytes take source bytes 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
+ QWORD 0405060700010203h, 0c0d0e0f08090a0bh ; same mask again for the upper 128-bit lane (vpshufb shuffles each lane independently)
+ptr_L_avx2_sha256_flip_mask QWORD L_avx2_sha256_flip_mask ; address of the mask above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_AVX2 PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbp
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+ sub rsp, 640
+ vmovdqu OWORD PTR [rsp+512], xmm6
+ vmovdqu OWORD PTR [rsp+528], xmm7
+ vmovdqu OWORD PTR [rsp+544], xmm8
+ vmovdqu OWORD PTR [rsp+560], xmm9
+ vmovdqu OWORD PTR [rsp+576], xmm10
+ vmovdqu OWORD PTR [rsp+592], xmm11
+ vmovdqu OWORD PTR [rsp+608], xmm12
+ vmovdqu OWORD PTR [rsp+624], xmm13
+ mov rbp, QWORD PTR [ptr_L_avx2_sha256_k]
+ vmovdqa xmm13, OWORD PTR L_avx2_sha256_flip_mask
+ vmovdqu ymm11, YMMWORD PTR L_avx2_sha256_shuf_00BA
+ vmovdqu ymm12, YMMWORD PTR L_avx2_sha256_shuf_DC00
+ mov r8d, DWORD PTR [rdi]
+ mov r9d, DWORD PTR [rdi+4]
+ mov r10d, DWORD PTR [rdi+8]
+ mov r11d, DWORD PTR [rdi+12]
+ mov r12d, DWORD PTR [rdi+16]
+ mov r13d, DWORD PTR [rdi+20]
+ mov r14d, DWORD PTR [rdi+24]
+ mov r15d, DWORD PTR [rdi+28]
+ ; X0, X1, X2, X3 = W[0..15]
+ vmovdqu xmm0, OWORD PTR [rsi]
+ vmovdqu xmm1, OWORD PTR [rsi+16]
+ vpshufb xmm0, xmm0, xmm13
+ vpshufb xmm1, xmm1, xmm13
+ vmovdqu xmm2, OWORD PTR [rsi+32]
+ vmovdqu xmm3, OWORD PTR [rsi+48]
+ vpshufb xmm2, xmm2, xmm13
+ vpshufb xmm3, xmm3, xmm13
+ mov ebx, r9d
+ mov edx, r12d
+ xor ebx, r10d
+ ; set_w_k_xfer_4: 0
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp]
+ vpaddd ymm5, ymm1, YMMWORD PTR [rbp+32]
+ vmovdqu YMMWORD PTR [rsp], ymm4
+ vmovdqu YMMWORD PTR [rsp+32], ymm5
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+64]
+ vpaddd ymm5, ymm3, YMMWORD PTR [rbp+96]
+ vmovdqu YMMWORD PTR [rsp+64], ymm4
+ vmovdqu YMMWORD PTR [rsp+96], ymm5
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm1, ymm0, 4
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+4]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+8]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+12]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm0, ymm9, ymm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm2, ymm1, 4
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+32]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+36]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+40]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+44]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm1, ymm9, ymm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 16-19
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm3, ymm2, 4
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+64]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+68]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+72]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+76]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm2, ymm9, ymm4
+ ; msg_sched done: 16-19
+ ; msg_sched: 24-27
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm0, ymm3, 4
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+96]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+100]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+104]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+108]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm3, ymm9, ymm4
+ ; msg_sched done: 24-27
+ ; set_w_k_xfer_4: 4
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+128]
+ vpaddd ymm5, ymm1, YMMWORD PTR [rbp+160]
+ vmovdqu YMMWORD PTR [rsp+128], ymm4
+ vmovdqu YMMWORD PTR [rsp+160], ymm5
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+192]
+ vpaddd ymm5, ymm3, YMMWORD PTR [rbp+224]
+ vmovdqu YMMWORD PTR [rsp+192], ymm4
+ vmovdqu YMMWORD PTR [rsp+224], ymm5
+ ; msg_sched: 32-35
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm1, ymm0, 4
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+128]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+132]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+136]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+140]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm0, ymm9, ymm4
+ ; msg_sched done: 32-35
+ ; msg_sched: 40-43
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm2, ymm1, 4
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+160]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+164]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+168]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+172]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm1, ymm9, ymm4
+ ; msg_sched done: 40-43
+ ; msg_sched: 48-51
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm3, ymm2, 4
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+192]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+196]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+200]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+204]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm2, ymm9, ymm4
+ ; msg_sched done: 48-51
+ ; msg_sched: 56-59
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm0, ymm3, 4
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+224]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+228]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+232]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+236]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm3, ymm9, ymm4
+ ; msg_sched done: 56-59
+ ; set_w_k_xfer_4: 8
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+256]
+ vpaddd ymm5, ymm1, YMMWORD PTR [rbp+288]
+ vmovdqu YMMWORD PTR [rsp+256], ymm4
+ vmovdqu YMMWORD PTR [rsp+288], ymm5
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+320]
+ vpaddd ymm5, ymm3, YMMWORD PTR [rbp+352]
+ vmovdqu YMMWORD PTR [rsp+320], ymm4
+ vmovdqu YMMWORD PTR [rsp+352], ymm5
+ ; msg_sched: 64-67
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm1, ymm0, 4
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+256]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+260]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+264]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+268]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm0, ymm9, ymm4
+ ; msg_sched done: 64-67
+ ; msg_sched: 72-75
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm2, ymm1, 4
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+288]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+292]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+296]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+300]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm1, ymm9, ymm4
+ ; msg_sched done: 72-75
+ ; msg_sched: 80-83
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm3, ymm2, 4
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+320]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+324]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+328]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+332]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm2, ymm9, ymm4
+ ; msg_sched done: 80-83
+ ; msg_sched: 88-91
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm0, ymm3, 4
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+352]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+356]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+360]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+364]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm3, ymm9, ymm4
+ ; msg_sched done: 88-91
+ ; set_w_k_xfer_4: 12
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+384]
+ vpaddd ymm5, ymm1, YMMWORD PTR [rbp+416]
+ vmovdqu YMMWORD PTR [rsp+384], ymm4
+ vmovdqu YMMWORD PTR [rsp+416], ymm5
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+448]
+ vpaddd ymm5, ymm3, YMMWORD PTR [rbp+480]
+ vmovdqu YMMWORD PTR [rsp+448], ymm4
+ vmovdqu YMMWORD PTR [rsp+480], ymm5
+ ; rnd_all_4: 24-27
+ add r15d, DWORD PTR [rsp+384]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+388]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+392]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+396]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 26-29
+ add r11d, DWORD PTR [rsp+416]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+420]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+424]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+428]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ ; rnd_all_4: 28-31
+ add r15d, DWORD PTR [rsp+448]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+452]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+456]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+460]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 30-33
+ add r11d, DWORD PTR [rsp+480]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+484]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+488]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+492]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ add DWORD PTR [rdi], r8d
+ add DWORD PTR [rdi+4], r9d
+ add DWORD PTR [rdi+8], r10d
+ add DWORD PTR [rdi+12], r11d
+ add DWORD PTR [rdi+16], r12d
+ add DWORD PTR [rdi+20], r13d
+ add DWORD PTR [rdi+24], r14d
+ add DWORD PTR [rdi+28], r15d
+ xor rax, rax
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+512]
+ vmovdqu xmm7, OWORD PTR [rsp+528]
+ vmovdqu xmm8, OWORD PTR [rsp+544]
+ vmovdqu xmm9, OWORD PTR [rsp+560]
+ vmovdqu xmm10, OWORD PTR [rsp+576]
+ vmovdqu xmm11, OWORD PTR [rsp+592]
+ vmovdqu xmm12, OWORD PTR [rsp+608]
+ vmovdqu xmm13, OWORD PTR [rsp+624]
+ add rsp, 640
+ pop rsi
+ pop rdi
+ pop rbp
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha256_AVX2 ENDP
+_TEXT ENDS                              ; close the _TEXT segment after Transform_Sha256_AVX2
+_TEXT SEGMENT READONLY PARA             ; reopen _TEXT (read-only, paragraph-aligned) for Transform_Sha256_AVX2_Len
+Transform_Sha256_AVX2_Len PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rbp, r8
+ sub rsp, 644
+ vmovdqu OWORD PTR [rsp+512], xmm6
+ vmovdqu OWORD PTR [rsp+528], xmm7
+ vmovdqu OWORD PTR [rsp+544], xmm8
+ vmovdqu OWORD PTR [rsp+560], xmm9
+ vmovdqu OWORD PTR [rsp+576], xmm10
+ vmovdqu OWORD PTR [rsp+592], xmm11
+ vmovdqu OWORD PTR [rsp+608], xmm12
+ vmovdqu OWORD PTR [rsp+624], xmm13
+ test bpl, 64
+ mov DWORD PTR [rsp+512], ebp
+ je L_sha256_len_avx2_block
+ vmovdqu ymm0, YMMWORD PTR [rsi]
+ vmovdqu ymm1, YMMWORD PTR [rsi+32]
+ vmovups YMMWORD PTR [rdi+32], ymm0
+ vmovups YMMWORD PTR [rdi+64], ymm1
+ call Transform_Sha256_AVX2
+ add rsi, 64
+ sub DWORD PTR [rsp+512], 64
+ jz L_sha256_len_avx2_done
+L_sha256_len_avx2_block:
+ mov rbp, QWORD PTR [ptr_L_avx2_sha256_k]
+ vmovdqu ymm13, YMMWORD PTR L_avx2_sha256_flip_mask
+ vmovdqu ymm11, YMMWORD PTR L_avx2_sha256_shuf_00BA
+ vmovdqu ymm12, YMMWORD PTR L_avx2_sha256_shuf_DC00
+ mov r8d, DWORD PTR [rdi]
+ mov r9d, DWORD PTR [rdi+4]
+ mov r10d, DWORD PTR [rdi+8]
+ mov r11d, DWORD PTR [rdi+12]
+ mov r12d, DWORD PTR [rdi+16]
+ mov r13d, DWORD PTR [rdi+20]
+ mov r14d, DWORD PTR [rdi+24]
+ mov r15d, DWORD PTR [rdi+28]
+ ; Start of loop processing two blocks
+L_sha256_len_avx2_start:
+ ; X0, X1, X2, X3 = W[0..15]
+ vmovdqu xmm0, OWORD PTR [rsi]
+ vmovdqu xmm1, OWORD PTR [rsi+16]
+ vmovdqu xmm4, OWORD PTR [rsi+64]
+ vmovdqu xmm5, OWORD PTR [rsi+80]
+ vinserti128 ymm0, ymm0, xmm4, 1
+ vinserti128 ymm1, ymm1, xmm5, 1
+ vpshufb ymm0, ymm0, ymm13
+ vpshufb ymm1, ymm1, ymm13
+ vmovdqu xmm2, OWORD PTR [rsi+32]
+ vmovdqu xmm3, OWORD PTR [rsi+48]
+ vmovdqu xmm6, OWORD PTR [rsi+96]
+ vmovdqu xmm7, OWORD PTR [rsi+112]
+ vinserti128 ymm2, ymm2, xmm6, 1
+ vinserti128 ymm3, ymm3, xmm7, 1
+ vpshufb ymm2, ymm2, ymm13
+ vpshufb ymm3, ymm3, ymm13
+ mov ebx, r9d
+ mov edx, r12d
+ xor ebx, r10d
+ ; set_w_k_xfer_4: 0
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp]
+ vpaddd ymm5, ymm1, YMMWORD PTR [rbp+32]
+ vmovdqu YMMWORD PTR [rsp], ymm4
+ vmovdqu YMMWORD PTR [rsp+32], ymm5
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+64]
+ vpaddd ymm5, ymm3, YMMWORD PTR [rbp+96]
+ vmovdqu YMMWORD PTR [rsp+64], ymm4
+ vmovdqu YMMWORD PTR [rsp+96], ymm5
+ ; msg_sched: 0-3
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm1, ymm0, 4
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+4]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+8]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+12]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm0, ymm9, ymm4
+ ; msg_sched done: 0-3
+ ; msg_sched: 8-11
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm2, ymm1, 4
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+32]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+36]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+40]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+44]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm1, ymm9, ymm4
+ ; msg_sched done: 8-11
+ ; msg_sched: 16-19
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm3, ymm2, 4
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+64]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+68]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+72]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+76]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm2, ymm9, ymm4
+ ; msg_sched done: 16-19
+ ; msg_sched: 24-27
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm0, ymm3, 4
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+96]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+100]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+104]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+108]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm3, ymm9, ymm4
+ ; msg_sched done: 24-27
+ ; set_w_k_xfer_4: 4
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+128]
+ vpaddd ymm5, ymm1, YMMWORD PTR [rbp+160]
+ vmovdqu YMMWORD PTR [rsp+128], ymm4
+ vmovdqu YMMWORD PTR [rsp+160], ymm5
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+192]
+ vpaddd ymm5, ymm3, YMMWORD PTR [rbp+224]
+ vmovdqu YMMWORD PTR [rsp+192], ymm4
+ vmovdqu YMMWORD PTR [rsp+224], ymm5
+ ; msg_sched: 32-35
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm1, ymm0, 4
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+128]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+132]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+136]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+140]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm0, ymm9, ymm4
+ ; msg_sched done: 32-35
+ ; msg_sched: 40-43
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm2, ymm1, 4
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+160]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+164]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+168]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+172]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm1, ymm9, ymm4
+ ; msg_sched done: 40-43
+ ; msg_sched: 48-51
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm3, ymm2, 4
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+192]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+196]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+200]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+204]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm2, ymm9, ymm4
+ ; msg_sched done: 48-51
+ ; msg_sched: 56-59
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm0, ymm3, 4
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+224]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+228]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+232]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+236]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm3, ymm9, ymm4
+ ; msg_sched done: 56-59
+ ; set_w_k_xfer_4: 8 - add the 32-byte vectors at [rbp+256..383] to ymm0-ymm3 and spill the sums to [rsp+256..383]
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+256] ; NOTE(review): rbp is presumably the round-constant table base; it is set outside this view - confirm
+ vpaddd ymm5, ymm1, YMMWORD PTR [rbp+288]
+ vmovdqu YMMWORD PTR [rsp+256], ymm4 ; spilled sums are read back as DWORDs by the scalar rounds that follow (add ..., DWORD PTR [rsp+256] onward)
+ vmovdqu YMMWORD PTR [rsp+288], ymm5
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+320] ; ymm4/ymm5 are reused as temporaries for the second pair of vectors
+ vpaddd ymm5, ymm3, YMMWORD PTR [rbp+352]
+ vmovdqu YMMWORD PTR [rsp+320], ymm4
+ vmovdqu YMMWORD PTR [rsp+352], ymm5
+ ; msg_sched: 64-67
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm1, ymm0, 4
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+256]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+260]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm3, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+264]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+268]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm0, ymm9, ymm4
+ ; msg_sched done: 64-67
+ ; msg_sched: 72-75
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm2, ymm1, 4
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+288]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+292]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm0, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+296]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+300]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm1, ymm9, ymm4
+ ; msg_sched done: 72-75
+ ; msg_sched: 80-83
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm3, ymm2, 4
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 1 - 2
+ mov eax, r9d
+ mov ecx, r13d
+ add r15d, DWORD PTR [rsp+320]
+ xor ecx, r14d
+ xor edx, r12d
+ and ecx, r12d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r14d
+ xor edx, r12d
+ add r15d, ecx
+ ror edx, 6
+ xor eax, r8d
+ add r15d, edx
+ mov ecx, r8d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r8d
+ mov ecx, r12d
+ add r14d, DWORD PTR [rsp+324]
+ xor ecx, r13d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r11d
+ and ecx, r11d
+ ror edx, 5
+ xor ecx, r13d
+ xor edx, r11d
+ add r14d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm1, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r15d
+ add r14d, edx
+ mov ecx, r15d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r15d
+ xor eax, r8d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 1 - 3
+ mov eax, r15d
+ mov ecx, r11d
+ add r13d, DWORD PTR [rsp+328]
+ xor ecx, r12d
+ xor edx, r10d
+ and ecx, r10d
+ ror edx, 5
+ xor ecx, r12d
+ xor edx, r10d
+ add r13d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r14d
+ add r13d, edx
+ mov ecx, r14d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r14d
+ xor ebx, r15d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r14d
+ mov ecx, r10d
+ add r12d, DWORD PTR [rsp+332]
+ xor ecx, r11d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r9d
+ and ecx, r9d
+ ror edx, 5
+ xor ecx, r11d
+ xor edx, r9d
+ add r12d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r13d
+ add r12d, edx
+ mov ecx, r13d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r13d
+ xor eax, r14d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ vpaddd ymm2, ymm9, ymm4
+ ; msg_sched done: 80-83
+ ; msg_sched: 88-91
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpalignr ymm5, ymm0, ymm3, 4
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 1 - 2
+ mov eax, r13d
+ mov ecx, r9d
+ add r11d, DWORD PTR [rsp+352]
+ xor ecx, r10d
+ xor edx, r8d
+ and ecx, r8d
+ vpsrld ymm6, ymm5, 7
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 3 - 4
+ ror edx, 5
+ xor ecx, r10d
+ xor edx, r8d
+ add r11d, ecx
+ ror edx, 6
+ xor eax, r12d
+ add r11d, edx
+ mov ecx, r12d
+ vpsrld ymm8, ymm5, 18
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 5 - 6
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ vpor ymm6, ymm7, ymm6
+ vpor ymm8, ymm9, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ ; rnd_1: 0 - 1
+ ror edx, 14
+ mov ebx, r12d
+ mov ecx, r8d
+ add r10d, DWORD PTR [rsp+356]
+ xor ecx, r9d
+ vpsrld ymm9, ymm5, 3
+ vpxor ymm6, ymm8, ymm6
+ ; rnd_1: 2 - 3
+ xor edx, r15d
+ and ecx, r15d
+ ror edx, 5
+ xor ecx, r9d
+ xor edx, r15d
+ add r10d, ecx
+ vpxor ymm5, ymm9, ymm6
+ vpshufd ymm6, ymm2, 250
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r11d
+ add r10d, edx
+ mov ecx, r11d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r11d
+ xor eax, r12d
+ vpsrld ymm8, ymm6, 10
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 6 - 7
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ ; rnd_0: 0 - 0
+ ror edx, 14
+ vpsrlq ymm6, ymm6, 17
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 1 - 3
+ mov eax, r11d
+ mov ecx, r15d
+ add r9d, DWORD PTR [rsp+360]
+ xor ecx, r8d
+ xor edx, r14d
+ and ecx, r14d
+ ror edx, 5
+ xor ecx, r8d
+ xor edx, r14d
+ add r9d, ecx
+ vpxor ymm6, ymm7, ymm6
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 4 - 4
+ ror edx, 6
+ xor eax, r10d
+ add r9d, edx
+ mov ecx, r10d
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 5 - 5
+ and ebx, eax
+ ror ecx, 9
+ xor ecx, r10d
+ xor ebx, r11d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 6 - 6
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 7 - 7
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ ; rnd_1: 0 - 0
+ ror edx, 14
+ vpshufd ymm6, ymm4, 80
+ ; rnd_1: 1 - 1
+ mov ebx, r10d
+ mov ecx, r14d
+ add r8d, DWORD PTR [rsp+364]
+ xor ecx, r15d
+ vpsrlq ymm8, ymm6, 17
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 2 - 3
+ xor edx, r13d
+ and ecx, r13d
+ ror edx, 5
+ xor ecx, r15d
+ xor edx, r13d
+ add r8d, ecx
+ vpsrld ymm9, ymm6, 10
+ vpxor ymm8, ymm7, ymm8
+ ; rnd_1: 4 - 5
+ ror edx, 6
+ xor ebx, r9d
+ add r8d, edx
+ mov ecx, r9d
+ and eax, ebx
+ ror ecx, 9
+ xor ecx, r9d
+ xor eax, r10d
+ vpxor ymm9, ymm8, ymm9
+ ; rnd_1: 6 - 6
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 7 - 7
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ vpaddd ymm3, ymm9, ymm4
+ ; msg_sched done: 88-91
+ ; set_w_k_xfer_4: 12 - add the 32-byte vectors at [rbp+384..511] to ymm0-ymm3 and spill the sums to [rsp+384..511]
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+384] ; NOTE(review): rbp is presumably the round-constant table base; it is set outside this view - confirm
+ vpaddd ymm5, ymm1, YMMWORD PTR [rbp+416]
+ vmovdqu YMMWORD PTR [rsp+384], ymm4 ; spilled sums are read back as DWORDs by the rnd_all_4 groups that follow (add r15d, DWORD PTR [rsp+384] onward)
+ vmovdqu YMMWORD PTR [rsp+416], ymm5
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+448] ; ymm4/ymm5 are reused as temporaries for the second pair of vectors
+ vpaddd ymm5, ymm3, YMMWORD PTR [rbp+480]
+ vmovdqu YMMWORD PTR [rsp+448], ymm4
+ vmovdqu YMMWORD PTR [rsp+480], ymm5
+ ; rnd_all_4: 24-27
+ add r15d, DWORD PTR [rsp+384]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+388]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+392]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+396]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 26-29
+ add r11d, DWORD PTR [rsp+416]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+420]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+424]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+428]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ ; rnd_all_4: 28-31
+ add r15d, DWORD PTR [rsp+448]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+452]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+456]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+460]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 30-33
+ add r11d, DWORD PTR [rsp+480]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+484]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+488]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+492]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ add r8d, DWORD PTR [rdi] ; add the eight 32-bit words at [rdi..rdi+31] into the working registers r8d-r15d (NOTE(review): rdi is presumably the digest/state pointer, set outside this view - confirm)
+ add r9d, DWORD PTR [rdi+4]
+ add r10d, DWORD PTR [rdi+8]
+ add r11d, DWORD PTR [rdi+12]
+ add r12d, DWORD PTR [rdi+16]
+ add r13d, DWORD PTR [rdi+20]
+ add r14d, DWORD PTR [rdi+24]
+ add r15d, DWORD PTR [rdi+28]
+ mov DWORD PTR [rdi], r8d ; write the summed words back to [rdi..rdi+31]; r8d-r15d keep the same values for the rounds below
+ mov DWORD PTR [rdi+4], r9d
+ mov DWORD PTR [rdi+8], r10d
+ mov DWORD PTR [rdi+12], r11d
+ mov DWORD PTR [rdi+16], r12d
+ mov DWORD PTR [rdi+20], r13d
+ mov DWORD PTR [rdi+24], r14d
+ mov DWORD PTR [rdi+28], r15d
+ mov ebx, r9d ; re-seed the round temporaries: ebx = r9d ^ r10d (consumed by the next "and ebx, eax")
+ mov edx, r12d ; edx = r12d, consumed by the next "ror edx, 14"
+ xor ebx, r10d
+ ; rnd_all_4: 1-4
+ add r15d, DWORD PTR [rsp+16]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+20]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+24]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+28]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 3-6
+ add r11d, DWORD PTR [rsp+48]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+52]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+56]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+60]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ ; rnd_all_4: 5-8
+ add r15d, DWORD PTR [rsp+80]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+84]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+88]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+92]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 7-10
+ add r11d, DWORD PTR [rsp+112]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+116]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+120]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+124]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ ; rnd_all_4: 9-12
+ add r15d, DWORD PTR [rsp+144]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+148]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+152]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+156]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 11-14
+ add r11d, DWORD PTR [rsp+176]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+180]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+184]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+188]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ ; rnd_all_4: 13-16
+ add r15d, DWORD PTR [rsp+208]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+212]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+216]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+220]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 15-18
+ add r11d, DWORD PTR [rsp+240]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+244]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+248]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+252]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ ; rnd_all_4: 17-20
+ add r15d, DWORD PTR [rsp+272]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+276]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+280]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+284]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 19-22
+ add r11d, DWORD PTR [rsp+304]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+308]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+312]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+316]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ ; rnd_all_4: 21-24
+ add r15d, DWORD PTR [rsp+336]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+340]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+344]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+348]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 23-26
+ add r11d, DWORD PTR [rsp+368]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+372]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+376]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+380]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ ; rnd_all_4: 25-28
+ add r15d, DWORD PTR [rsp+400]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+404]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+408]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+412]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 27-30
+ add r11d, DWORD PTR [rsp+432]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+436]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+440]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+444]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ ; rnd_all_4: 29-32
+ add r15d, DWORD PTR [rsp+464]
+ mov ecx, r13d
+ mov eax, r9d
+ xor ecx, r14d
+ ror edx, 14
+ and ecx, r12d
+ xor edx, r12d
+ xor ecx, r14d
+ ror edx, 5
+ add r15d, ecx
+ xor edx, r12d
+ xor eax, r8d
+ ror edx, 6
+ mov ecx, r8d
+ add r15d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r8d
+ xor ebx, r9d
+ ror ecx, 11
+ add r11d, r15d
+ xor ecx, r8d
+ add r15d, ebx
+ ror ecx, 2
+ mov edx, r11d
+ add r15d, ecx
+ add r14d, DWORD PTR [rsp+468]
+ mov ecx, r12d
+ mov ebx, r8d
+ xor ecx, r13d
+ ror edx, 14
+ and ecx, r11d
+ xor edx, r11d
+ xor ecx, r13d
+ ror edx, 5
+ add r14d, ecx
+ xor edx, r11d
+ xor ebx, r15d
+ ror edx, 6
+ mov ecx, r15d
+ add r14d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r15d
+ xor eax, r8d
+ ror ecx, 11
+ add r10d, r14d
+ xor ecx, r15d
+ add r14d, eax
+ ror ecx, 2
+ mov edx, r10d
+ add r14d, ecx
+ add r13d, DWORD PTR [rsp+472]
+ mov ecx, r11d
+ mov eax, r15d
+ xor ecx, r12d
+ ror edx, 14
+ and ecx, r10d
+ xor edx, r10d
+ xor ecx, r12d
+ ror edx, 5
+ add r13d, ecx
+ xor edx, r10d
+ xor eax, r14d
+ ror edx, 6
+ mov ecx, r14d
+ add r13d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r14d
+ xor ebx, r15d
+ ror ecx, 11
+ add r9d, r13d
+ xor ecx, r14d
+ add r13d, ebx
+ ror ecx, 2
+ mov edx, r9d
+ add r13d, ecx
+ add r12d, DWORD PTR [rsp+476]
+ mov ecx, r10d
+ mov ebx, r14d
+ xor ecx, r11d
+ ror edx, 14
+ and ecx, r9d
+ xor edx, r9d
+ xor ecx, r11d
+ ror edx, 5
+ add r12d, ecx
+ xor edx, r9d
+ xor ebx, r13d
+ ror edx, 6
+ mov ecx, r13d
+ add r12d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r13d
+ xor eax, r14d
+ ror ecx, 11
+ add r8d, r12d
+ xor ecx, r13d
+ add r12d, eax
+ ror ecx, 2
+ mov edx, r8d
+ add r12d, ecx
+ ; rnd_all_4: 31-34
+ add r11d, DWORD PTR [rsp+496]
+ mov ecx, r9d
+ mov eax, r13d
+ xor ecx, r10d
+ ror edx, 14
+ and ecx, r8d
+ xor edx, r8d
+ xor ecx, r10d
+ ror edx, 5
+ add r11d, ecx
+ xor edx, r8d
+ xor eax, r12d
+ ror edx, 6
+ mov ecx, r12d
+ add r11d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r12d
+ xor ebx, r13d
+ ror ecx, 11
+ add r15d, r11d
+ xor ecx, r12d
+ add r11d, ebx
+ ror ecx, 2
+ mov edx, r15d
+ add r11d, ecx
+ add r10d, DWORD PTR [rsp+500]
+ mov ecx, r8d
+ mov ebx, r12d
+ xor ecx, r9d
+ ror edx, 14
+ and ecx, r15d
+ xor edx, r15d
+ xor ecx, r9d
+ ror edx, 5
+ add r10d, ecx
+ xor edx, r15d
+ xor ebx, r11d
+ ror edx, 6
+ mov ecx, r11d
+ add r10d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r11d
+ xor eax, r12d
+ ror ecx, 11
+ add r14d, r10d
+ xor ecx, r11d
+ add r10d, eax
+ ror ecx, 2
+ mov edx, r14d
+ add r10d, ecx
+ add r9d, DWORD PTR [rsp+504]
+ mov ecx, r15d
+ mov eax, r11d
+ xor ecx, r8d
+ ror edx, 14
+ and ecx, r14d
+ xor edx, r14d
+ xor ecx, r8d
+ ror edx, 5
+ add r9d, ecx
+ xor edx, r14d
+ xor eax, r10d
+ ror edx, 6
+ mov ecx, r10d
+ add r9d, edx
+ ror ecx, 9
+ and ebx, eax
+ xor ecx, r10d
+ xor ebx, r11d
+ ror ecx, 11
+ add r13d, r9d
+ xor ecx, r10d
+ add r9d, ebx
+ ror ecx, 2
+ mov edx, r13d
+ add r9d, ecx
+ add r8d, DWORD PTR [rsp+508]
+ mov ecx, r14d
+ mov ebx, r10d
+ xor ecx, r15d
+ ror edx, 14
+ and ecx, r13d
+ xor edx, r13d
+ xor ecx, r15d
+ ror edx, 5
+ add r8d, ecx
+ xor edx, r13d
+ xor ebx, r9d
+ ror edx, 6
+ mov ecx, r9d
+ add r8d, edx
+ ror ecx, 9
+ and eax, ebx
+ xor ecx, r9d
+ xor eax, r10d
+ ror ecx, 11
+ add r12d, r8d
+ xor ecx, r9d
+ add r8d, eax
+ ror ecx, 2
+ mov edx, r12d
+ add r8d, ecx
+ add r8d, DWORD PTR [rdi]
+ add r9d, DWORD PTR [rdi+4]
+ add r10d, DWORD PTR [rdi+8]
+ add r11d, DWORD PTR [rdi+12]
+ add r12d, DWORD PTR [rdi+16]
+ add r13d, DWORD PTR [rdi+20]
+ add r14d, DWORD PTR [rdi+24]
+ add r15d, DWORD PTR [rdi+28]
+ add rsi, 128
+ sub DWORD PTR [rsp+512], 128
+ mov DWORD PTR [rdi], r8d
+ mov DWORD PTR [rdi+4], r9d
+ mov DWORD PTR [rdi+8], r10d
+ mov DWORD PTR [rdi+12], r11d
+ mov DWORD PTR [rdi+16], r12d
+ mov DWORD PTR [rdi+20], r13d
+ mov DWORD PTR [rdi+24], r14d
+ mov DWORD PTR [rdi+28], r15d
+ jnz L_sha256_len_avx2_start
+L_sha256_len_avx2_done:
+ xor rax, rax
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+512]
+ vmovdqu xmm7, OWORD PTR [rsp+528]
+ vmovdqu xmm8, OWORD PTR [rsp+544]
+ vmovdqu xmm9, OWORD PTR [rsp+560]
+ vmovdqu xmm10, OWORD PTR [rsp+576]
+ vmovdqu xmm11, OWORD PTR [rsp+592]
+ vmovdqu xmm12, OWORD PTR [rsp+608]
+ vmovdqu xmm13, OWORD PTR [rsp+624]
+ add rsp, 644
+ pop rbp
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha256_AVX2_Len ENDP
+_TEXT ENDS
+_DATA SEGMENT ; SHA-256 round constants K[0..63] for the AVX2 RORX transform
+ALIGN 16 ; table starts on a 16-byte boundary within the segment
+L_avx2_rorx_sha256_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h ; K[0..3]
+ DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h ; K[0..3] again: each group of four is stored twice, making one 32-byte row
+ DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h ; K[4..7]
+ DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h ; K[4..7] (repeat)
+ DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h ; K[8..11]
+ DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h ; K[8..11] (repeat)
+ DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h ; K[12..15]
+ DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h ; K[12..15] (repeat)
+ DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch ; K[16..19]
+ DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch ; K[16..19] (repeat)
+ DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah ; K[20..23]
+ DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah ; K[20..23] (repeat)
+ DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h ; K[24..27]
+ DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h ; K[24..27] (repeat)
+ DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h ; K[28..31]
+ DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h ; K[28..31] (repeat)
+ DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h ; K[32..35]
+ DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h ; K[32..35] (repeat)
+ DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h ; K[36..39]
+ DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h ; K[36..39] (repeat)
+ DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h ; K[40..43]
+ DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h ; K[40..43] (repeat)
+ DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h ; K[44..47]
+ DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h ; K[44..47] (repeat)
+ DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h ; K[48..51]
+ DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h ; K[48..51] (repeat)
+ DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h ; K[52..55]
+ DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h ; K[52..55] (repeat)
+ DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h ; K[56..59]
+ DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h ; K[56..59] (repeat)
+ DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h ; K[60..63]
+ DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h ; K[60..63] (repeat)
+ptr_L_avx2_rorx_sha256_k QWORD L_avx2_rorx_sha256_k ; address of the table; Transform_Sha256_AVX2_RORX loads it into rbp and adds 32-byte rows with vpaddd
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control used to byte-swap the incoming message words
+ALIGN 16 ; Transform_Sha256_AVX2_RORX reads the first 16 bytes with vmovdqa (aligned load)
+L_avx2_rorx_sha256_flip_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh ; in memory: 03 02 01 00 07 06 05 04 ... = reverse the 4 bytes of every 32-bit word
+ QWORD 0405060700010203h, 0c0d0e0f08090a0bh ; same pattern for the upper 128-bit lane of a YMM register
+ptr_L_avx2_rorx_sha256_flip_mask QWORD L_avx2_rorx_sha256_flip_mask ; address of the mask above
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control that packs two results into the low half of each lane
+ALIGN 16 ; 16-byte alignment; the visible load of this table is an unaligned vmovdqu
+L_avx2_rorx_sha256_shuf_00BA QWORD 0b0a090803020100h, 0ffffffffffffffffh ; per lane: dwords 0 and 2 of the source -> dwords 0 and 1; 0ffh selectors zero dwords 2 and 3
+ QWORD 0b0a090803020100h, 0ffffffffffffffffh ; repeated because vpshufb on a YMM register shuffles each 128-bit lane separately
+ptr_L_avx2_rorx_sha256_shuf_00BA QWORD L_avx2_rorx_sha256_shuf_00BA ; address of the mask above
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control that packs two results into the high half of each lane
+ALIGN 16 ; 16-byte alignment; the visible load of this table is an unaligned vmovdqu
+L_avx2_rorx_sha256_shuf_DC00 QWORD 0ffffffffffffffffh, 0b0a090803020100h ; per lane: 0ffh selectors zero dwords 0 and 1; dwords 0 and 2 of the source -> dwords 2 and 3
+ QWORD 0ffffffffffffffffh, 0b0a090803020100h ; repeated because vpshufb on a YMM register shuffles each 128-bit lane separately
+ptr_L_avx2_rorx_sha256_shuf_DC00 QWORD L_avx2_rorx_sha256_shuf_DC00 ; address of the mask above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_AVX2_RORX PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbp
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+ sub rsp, 640
+ vmovdqu OWORD PTR [rsp+512], xmm6
+ vmovdqu OWORD PTR [rsp+528], xmm7
+ vmovdqu OWORD PTR [rsp+544], xmm8
+ vmovdqu OWORD PTR [rsp+560], xmm9
+ vmovdqu OWORD PTR [rsp+576], xmm10
+ vmovdqu OWORD PTR [rsp+592], xmm11
+ vmovdqu OWORD PTR [rsp+608], xmm12
+ vmovdqu OWORD PTR [rsp+624], xmm13
+ mov rbp, QWORD PTR [ptr_L_avx2_rorx_sha256_k]
+ vmovdqa xmm13, OWORD PTR L_avx2_rorx_sha256_flip_mask
+ vmovdqu ymm11, YMMWORD PTR L_avx2_rorx_sha256_shuf_00BA
+ vmovdqu ymm12, YMMWORD PTR L_avx2_rorx_sha256_shuf_DC00
+ ; X0, X1, X2, X3 = W[0..15]
+ vmovdqu xmm0, OWORD PTR [rsi]
+ vmovdqu xmm1, OWORD PTR [rsi+16]
+ vpshufb xmm0, xmm0, xmm13
+ vpshufb xmm1, xmm1, xmm13
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp]
+ vpaddd ymm5, ymm1, YMMWORD PTR [rbp+32]
+ vmovdqu YMMWORD PTR [rsp], ymm4
+ vmovdqu YMMWORD PTR [rsp+32], ymm5
+ vmovdqu xmm2, OWORD PTR [rsi+32]
+ vmovdqu xmm3, OWORD PTR [rsi+48]
+ vpshufb xmm2, xmm2, xmm13
+ vpshufb xmm3, xmm3, xmm13
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+64]
+ vpaddd ymm5, ymm3, YMMWORD PTR [rbp+96]
+ vmovdqu YMMWORD PTR [rsp+64], ymm4
+ vmovdqu YMMWORD PTR [rsp+96], ymm5
+ mov r8d, DWORD PTR [rdi]
+ mov r9d, DWORD PTR [rdi+4]
+ mov r10d, DWORD PTR [rdi+8]
+ mov r11d, DWORD PTR [rdi+12]
+ mov r12d, DWORD PTR [rdi+16]
+ mov r13d, DWORD PTR [rdi+20]
+ mov r14d, DWORD PTR [rdi+24]
+ mov r15d, DWORD PTR [rdi+28]
+ mov ebx, r9d
+ rorx edx, r12d, 6
+ xor ebx, r10d
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp]
+ vpalignr ymm5, ymm1, ymm0, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+4]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm3, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+8]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+12]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm0, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+128]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+128], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+32]
+ vpalignr ymm5, ymm2, ymm1, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+36]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm0, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+40]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+44]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm1, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm1, YMMWORD PTR [rbp+160]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+160], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+64]
+ vpalignr ymm5, ymm3, ymm2, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+68]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm1, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+72]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+76]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm2, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+192]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+192], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+96]
+ vpalignr ymm5, ymm0, ymm3, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+100]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm2, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+104]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+108]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm3, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm3, YMMWORD PTR [rbp+224]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+224], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+128]
+ vpalignr ymm5, ymm1, ymm0, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+132]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm3, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+136]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+140]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm0, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+256]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+256], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+160]
+ vpalignr ymm5, ymm2, ymm1, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+164]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm0, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+168]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+172]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm1, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm1, YMMWORD PTR [rbp+288]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+288], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+192]
+ vpalignr ymm5, ymm3, ymm2, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+196]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm1, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+200]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+204]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm2, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+320]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+320], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+224]
+ vpalignr ymm5, ymm0, ymm3, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+228]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm2, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+232]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+236]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm3, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm3, YMMWORD PTR [rbp+352]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+352], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+256]
+ vpalignr ymm5, ymm1, ymm0, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+260]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm3, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+264]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+268]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm0, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+384]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+384], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+288]
+ vpalignr ymm5, ymm2, ymm1, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+292]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm0, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+296]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+300]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm1, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm1, YMMWORD PTR [rbp+416]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+416], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+320]
+ vpalignr ymm5, ymm3, ymm2, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+324]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm1, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+328]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+332]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm2, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+448]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+448], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+352]
+ vpalignr ymm5, ymm0, ymm3, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+356]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm2, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+360]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+364]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm3, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm3, YMMWORD PTR [rbp+480]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+480], ymm4
+ xor eax, eax
+ xor ecx, ecx
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+384]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+388]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+392]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+396]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+416]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+420]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+424]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+428]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+448]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+452]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+456]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+460]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+480]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+484]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+488]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+492]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ add r8d, eax
+ add DWORD PTR [rdi], r8d
+ add DWORD PTR [rdi+4], r9d
+ add DWORD PTR [rdi+8], r10d
+ add DWORD PTR [rdi+12], r11d
+ add DWORD PTR [rdi+16], r12d
+ add DWORD PTR [rdi+20], r13d
+ add DWORD PTR [rdi+24], r14d
+ add DWORD PTR [rdi+28], r15d
+ xor rax, rax
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+512]
+ vmovdqu xmm7, OWORD PTR [rsp+528]
+ vmovdqu xmm8, OWORD PTR [rsp+544]
+ vmovdqu xmm9, OWORD PTR [rsp+560]
+ vmovdqu xmm10, OWORD PTR [rsp+576]
+ vmovdqu xmm11, OWORD PTR [rsp+592]
+ vmovdqu xmm12, OWORD PTR [rsp+608]
+ vmovdqu xmm13, OWORD PTR [rsp+624]
+ add rsp, 640
+ pop rsi
+ pop rdi
+ pop rbp
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha256_AVX2_RORX ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_AVX2_RORX_Len PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rbp, r8
+ sub rsp, 644
+ vmovdqu OWORD PTR [rsp+512], xmm6
+ vmovdqu OWORD PTR [rsp+528], xmm7
+ vmovdqu OWORD PTR [rsp+544], xmm8
+ vmovdqu OWORD PTR [rsp+560], xmm9
+ vmovdqu OWORD PTR [rsp+576], xmm10
+ vmovdqu OWORD PTR [rsp+592], xmm11
+ vmovdqu OWORD PTR [rsp+608], xmm12
+ vmovdqu OWORD PTR [rsp+624], xmm13
+ test bpl, 64
+ mov DWORD PTR [rsp+512], ebp
+ je L_sha256_len_avx2_rorx_block
+ vmovdqu ymm0, YMMWORD PTR [rsi]
+ vmovdqu ymm1, YMMWORD PTR [rsi+32]
+ vmovups YMMWORD PTR [rdi+32], ymm0
+ vmovups YMMWORD PTR [rdi+64], ymm1
+ call Transform_Sha256_AVX2_RORX
+ add rsi, 64
+ sub DWORD PTR [rsp+512], 64
+ jz L_sha256_len_avx2_rorx_done
+L_sha256_len_avx2_rorx_block:
+ mov rbp, QWORD PTR [ptr_L_avx2_rorx_sha256_k]
+ vmovdqu ymm13, YMMWORD PTR L_avx2_rorx_sha256_flip_mask
+ vmovdqu ymm11, YMMWORD PTR L_avx2_rorx_sha256_shuf_00BA
+ vmovdqu ymm12, YMMWORD PTR L_avx2_rorx_sha256_shuf_DC00
+ mov r8d, DWORD PTR [rdi]
+ mov r9d, DWORD PTR [rdi+4]
+ mov r10d, DWORD PTR [rdi+8]
+ mov r11d, DWORD PTR [rdi+12]
+ mov r12d, DWORD PTR [rdi+16]
+ mov r13d, DWORD PTR [rdi+20]
+ mov r14d, DWORD PTR [rdi+24]
+ mov r15d, DWORD PTR [rdi+28]
+ ; Start of loop processing two blocks
+L_sha256_len_avx2_rorx_start:
+ ; X0, X1, X2, X3 = W[0..15]
+ vmovdqu xmm0, OWORD PTR [rsi]
+ vmovdqu xmm1, OWORD PTR [rsi+16]
+ vinserti128 ymm0, ymm0, OWORD PTR [rsi+64], 1
+ vinserti128 ymm1, ymm1, OWORD PTR [rsi+80], 1
+ vpshufb ymm0, ymm0, ymm13
+ vpshufb ymm1, ymm1, ymm13
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp]
+ vpaddd ymm5, ymm1, YMMWORD PTR [rbp+32]
+ vmovdqu YMMWORD PTR [rsp], ymm4
+ vmovdqu YMMWORD PTR [rsp+32], ymm5
+ vmovdqu xmm2, OWORD PTR [rsi+32]
+ vmovdqu xmm3, OWORD PTR [rsi+48]
+ vinserti128 ymm2, ymm2, OWORD PTR [rsi+96], 1
+ vinserti128 ymm3, ymm3, OWORD PTR [rsi+112], 1
+ vpshufb ymm2, ymm2, ymm13
+ vpshufb ymm3, ymm3, ymm13
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+64]
+ vpaddd ymm5, ymm3, YMMWORD PTR [rbp+96]
+ vmovdqu YMMWORD PTR [rsp+64], ymm4
+ vmovdqu YMMWORD PTR [rsp+96], ymm5
+ mov ebx, r9d
+ rorx edx, r12d, 6
+ xor ebx, r10d
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp]
+ vpalignr ymm5, ymm1, ymm0, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+4]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm3, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+8]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+12]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm0, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+128]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+128], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+32]
+ vpalignr ymm5, ymm2, ymm1, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+36]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm0, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+40]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+44]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm1, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm1, YMMWORD PTR [rbp+160]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+160], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+64]
+ vpalignr ymm5, ymm3, ymm2, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+68]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm1, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+72]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+76]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm2, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+192]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+192], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+96]
+ vpalignr ymm5, ymm0, ymm3, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+100]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm2, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+104]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+108]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm3, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm3, YMMWORD PTR [rbp+224]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+224], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+128]
+ vpalignr ymm5, ymm1, ymm0, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+132]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm3, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+136]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+140]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm0, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+256]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+256], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+160]
+ vpalignr ymm5, ymm2, ymm1, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+164]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm0, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+168]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+172]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm1, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm1, YMMWORD PTR [rbp+288]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+288], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+192]
+ vpalignr ymm5, ymm3, ymm2, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+196]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm1, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+200]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+204]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm2, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+320]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+320], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+224]
+ vpalignr ymm5, ymm0, ymm3, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+228]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm2, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+232]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+236]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm3, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm3, YMMWORD PTR [rbp+352]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+352], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+256]
+ vpalignr ymm5, ymm1, ymm0, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm3, ymm2, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+260]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm3, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm0
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+264]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+268]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm0, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm0, YMMWORD PTR [rbp+384]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+384], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+288]
+ vpalignr ymm5, ymm2, ymm1, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm0, ymm3, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+292]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm0, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm1
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+296]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+300]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm1, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm1, YMMWORD PTR [rbp+416]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+416], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r13d
+ rorx ecx, r12d, 11
+ add r15d, DWORD PTR [rsp+320]
+ vpalignr ymm5, ymm3, ymm2, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ vpalignr ymm4, ymm1, ymm0, 4
+ ; rnd_0: 2 - 2
+ and eax, r12d
+ xor edx, ecx
+ rorx ecx, r8d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r15d, edx
+ rorx edx, r8d, 2
+ xor eax, r14d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r8d
+ add r15d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ add r15d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r12d
+ rorx ecx, r11d, 11
+ add r14d, DWORD PTR [rsp+324]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r11d
+ xor edx, ecx
+ rorx ecx, r15d, 13
+ vpshufd ymm7, ymm1, 250
+ ; rnd_1: 3 - 3
+ add r14d, edx
+ rorx edx, r15d, 2
+ xor ebx, r13d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r10d, r14d
+ mov ebx, r8d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r15d
+ add r14d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r8d
+ rorx edx, r10d, 6
+ add r14d, eax
+ vpaddd ymm4, ymm4, ymm2
+ ; rnd_0: 0 - 0
+ mov eax, r11d
+ rorx ecx, r10d, 11
+ add r13d, DWORD PTR [rsp+328]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r10d
+ xor edx, ecx
+ rorx ecx, r14d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r13d, edx
+ rorx edx, r14d, 2
+ xor eax, r12d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r14d
+ add r13d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ add r13d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r10d
+ rorx ecx, r9d, 11
+ add r12d, DWORD PTR [rsp+332]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r9d
+ xor edx, ecx
+ rorx ecx, r13d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r12d, edx
+ rorx edx, r13d, 2
+ xor ebx, r11d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ vpaddd ymm2, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r8d, r12d
+ mov ebx, r14d
+ vpaddd ymm4, ymm2, YMMWORD PTR [rbp+448]
+ ; rnd_1: 6 - 6
+ xor ebx, r13d
+ add r12d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r14d
+ rorx edx, r8d, 6
+ add r12d, eax
+ vmovdqu YMMWORD PTR [rsp+448], ymm4
+ ; rnd_0: 0 - 0
+ mov eax, r9d
+ rorx ecx, r8d, 11
+ add r11d, DWORD PTR [rsp+352]
+ vpalignr ymm5, ymm0, ymm3, 4
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ vpalignr ymm4, ymm2, ymm1, 4
+ ; rnd_0: 2 - 2
+ and eax, r8d
+ xor edx, ecx
+ rorx ecx, r12d, 13
+ vpsrld ymm6, ymm5, 7
+ ; rnd_0: 3 - 3
+ add r11d, edx
+ rorx edx, r12d, 2
+ xor eax, r10d
+ vpslld ymm7, ymm5, 25
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ vpsrld ymm8, ymm5, 18
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ vpslld ymm9, ymm5, 14
+ ; rnd_0: 6 - 6
+ xor eax, r12d
+ add r11d, edx
+ and ebx, eax
+ vpor ymm6, ymm6, ymm7
+ ; rnd_0: 7 - 7
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ add r11d, ebx
+ vpor ymm8, ymm8, ymm9
+ ; rnd_1: 0 - 0
+ mov ebx, r8d
+ rorx ecx, r15d, 11
+ add r10d, DWORD PTR [rsp+356]
+ vpsrld ymm9, ymm5, 3
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ vpxor ymm6, ymm6, ymm8
+ ; rnd_1: 2 - 2
+ and ebx, r15d
+ xor edx, ecx
+ rorx ecx, r11d, 13
+ vpshufd ymm7, ymm2, 250
+ ; rnd_1: 3 - 3
+ add r10d, edx
+ rorx edx, r11d, 2
+ xor ebx, r9d
+ vpxor ymm5, ymm9, ymm6
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ vpsrld ymm8, ymm7, 10
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r14d, r10d
+ mov ebx, r12d
+ vpsrlq ymm6, ymm7, 19
+ ; rnd_1: 6 - 6
+ xor ebx, r11d
+ add r10d, edx
+ and eax, ebx
+ vpsrlq ymm7, ymm7, 17
+ ; rnd_1: 7 - 7
+ xor eax, r12d
+ rorx edx, r14d, 6
+ add r10d, eax
+ vpaddd ymm4, ymm4, ymm3
+ ; rnd_0: 0 - 0
+ mov eax, r15d
+ rorx ecx, r14d, 11
+ add r9d, DWORD PTR [rsp+360]
+ vpxor ymm6, ymm6, ymm7
+ ; rnd_0: 1 - 1
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ vpxor ymm8, ymm8, ymm6
+ ; rnd_0: 2 - 2
+ and eax, r14d
+ xor edx, ecx
+ rorx ecx, r10d, 13
+ vpaddd ymm4, ymm4, ymm5
+ ; rnd_0: 3 - 3
+ add r9d, edx
+ rorx edx, r10d, 2
+ xor eax, r8d
+ vpshufb ymm8, ymm8, ymm11
+ ; rnd_0: 4 - 4
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ vpaddd ymm4, ymm4, ymm8
+ ; rnd_0: 5 - 5
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ vpshufd ymm6, ymm4, 80
+ ; rnd_0: 6 - 6
+ xor eax, r10d
+ add r9d, edx
+ and ebx, eax
+ vpsrlq ymm8, ymm6, 17
+ ; rnd_0: 7 - 7
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ add r9d, ebx
+ vpsrlq ymm7, ymm6, 19
+ ; rnd_1: 0 - 0
+ mov ebx, r14d
+ rorx ecx, r13d, 11
+ add r8d, DWORD PTR [rsp+364]
+ vpsrld ymm9, ymm6, 10
+ ; rnd_1: 1 - 1
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ vpxor ymm8, ymm8, ymm7
+ ; rnd_1: 2 - 2
+ and ebx, r13d
+ xor edx, ecx
+ rorx ecx, r9d, 13
+ vpxor ymm9, ymm9, ymm8
+ ; rnd_1: 3 - 3
+ add r8d, edx
+ rorx edx, r9d, 2
+ xor ebx, r15d
+ vpshufb ymm9, ymm9, ymm12
+ ; rnd_1: 4 - 4
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ vpaddd ymm3, ymm9, ymm4
+ ; rnd_1: 5 - 5
+ xor edx, ecx
+ add r12d, r8d
+ mov ebx, r10d
+ vpaddd ymm4, ymm3, YMMWORD PTR [rbp+480]
+ ; rnd_1: 6 - 6
+ xor ebx, r9d
+ add r8d, edx
+ and eax, ebx
+ ; rnd_1: 7 - 7
+ xor eax, r10d
+ rorx edx, r12d, 6
+ add r8d, eax
+ vmovdqu YMMWORD PTR [rsp+480], ymm4
+ xor eax, eax
+ xor ecx, ecx
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+384]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+388]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+392]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+396]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+416]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+420]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+424]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+428]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+448]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+452]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+456]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+460]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+480]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+484]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+488]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+492]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ add r8d, eax
+ xor ecx, ecx
+ add r8d, DWORD PTR [rdi]
+ add r9d, DWORD PTR [rdi+4]
+ add r10d, DWORD PTR [rdi+8]
+ add r11d, DWORD PTR [rdi+12]
+ add r12d, DWORD PTR [rdi+16]
+ add r13d, DWORD PTR [rdi+20]
+ add r14d, DWORD PTR [rdi+24]
+ add r15d, DWORD PTR [rdi+28]
+ mov DWORD PTR [rdi], r8d
+ mov DWORD PTR [rdi+4], r9d
+ mov DWORD PTR [rdi+8], r10d
+ mov DWORD PTR [rdi+12], r11d
+ mov DWORD PTR [rdi+16], r12d
+ mov DWORD PTR [rdi+20], r13d
+ mov DWORD PTR [rdi+24], r14d
+ mov DWORD PTR [rdi+28], r15d
+ mov ebx, r9d
+ xor eax, eax
+ xor ebx, r10d
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+16]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+20]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+24]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+28]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+48]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+52]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+56]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+60]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+80]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+84]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+88]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+92]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+112]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+116]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+120]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+124]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+144]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+148]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+152]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+156]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+176]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+180]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+184]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+188]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+208]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+212]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+216]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+220]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+240]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+244]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+248]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+252]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+272]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+276]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+280]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+284]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+304]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+308]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+312]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+316]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+336]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+340]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+344]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+348]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+368]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+372]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+376]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+380]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+400]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+404]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+408]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+412]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+432]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+436]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+440]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+444]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ rorx edx, r12d, 6
+ rorx ecx, r12d, 11
+ lea r8d, DWORD PTR [r8+rax]
+ add r15d, DWORD PTR [rsp+464]
+ mov eax, r13d
+ xor ecx, edx
+ xor eax, r14d
+ rorx edx, r12d, 25
+ xor edx, ecx
+ and eax, r12d
+ add r15d, edx
+ rorx edx, r8d, 2
+ rorx ecx, r8d, 13
+ xor eax, r14d
+ xor ecx, edx
+ rorx edx, r8d, 22
+ add r15d, eax
+ xor edx, ecx
+ mov eax, r9d
+ add r11d, r15d
+ xor eax, r8d
+ and ebx, eax
+ add r15d, edx
+ xor ebx, r9d
+ rorx edx, r11d, 6
+ rorx ecx, r11d, 11
+ add r15d, ebx
+ add r14d, DWORD PTR [rsp+468]
+ mov ebx, r12d
+ xor ecx, edx
+ xor ebx, r13d
+ rorx edx, r11d, 25
+ xor edx, ecx
+ and ebx, r11d
+ add r14d, edx
+ rorx edx, r15d, 2
+ rorx ecx, r15d, 13
+ xor ebx, r13d
+ xor ecx, edx
+ rorx edx, r15d, 22
+ add r14d, ebx
+ xor edx, ecx
+ mov ebx, r8d
+ lea r10d, DWORD PTR [r10+r14]
+ xor ebx, r15d
+ and eax, ebx
+ add r14d, edx
+ xor eax, r8d
+ rorx edx, r10d, 6
+ rorx ecx, r10d, 11
+ lea r14d, DWORD PTR [r14+rax]
+ add r13d, DWORD PTR [rsp+472]
+ mov eax, r11d
+ xor ecx, edx
+ xor eax, r12d
+ rorx edx, r10d, 25
+ xor edx, ecx
+ and eax, r10d
+ add r13d, edx
+ rorx edx, r14d, 2
+ rorx ecx, r14d, 13
+ xor eax, r12d
+ xor ecx, edx
+ rorx edx, r14d, 22
+ add r13d, eax
+ xor edx, ecx
+ mov eax, r15d
+ add r9d, r13d
+ xor eax, r14d
+ and ebx, eax
+ add r13d, edx
+ xor ebx, r15d
+ rorx edx, r9d, 6
+ rorx ecx, r9d, 11
+ add r13d, ebx
+ add r12d, DWORD PTR [rsp+476]
+ mov ebx, r10d
+ xor ecx, edx
+ xor ebx, r11d
+ rorx edx, r9d, 25
+ xor edx, ecx
+ and ebx, r9d
+ add r12d, edx
+ rorx edx, r13d, 2
+ rorx ecx, r13d, 13
+ xor ebx, r11d
+ xor ecx, edx
+ rorx edx, r13d, 22
+ add r12d, ebx
+ xor edx, ecx
+ mov ebx, r14d
+ lea r8d, DWORD PTR [r8+r12]
+ xor ebx, r13d
+ and eax, ebx
+ add r12d, edx
+ xor eax, r14d
+ rorx edx, r8d, 6
+ rorx ecx, r8d, 11
+ lea r12d, DWORD PTR [r12+rax]
+ add r11d, DWORD PTR [rsp+496]
+ mov eax, r9d
+ xor ecx, edx
+ xor eax, r10d
+ rorx edx, r8d, 25
+ xor edx, ecx
+ and eax, r8d
+ add r11d, edx
+ rorx edx, r12d, 2
+ rorx ecx, r12d, 13
+ xor eax, r10d
+ xor ecx, edx
+ rorx edx, r12d, 22
+ add r11d, eax
+ xor edx, ecx
+ mov eax, r13d
+ add r15d, r11d
+ xor eax, r12d
+ and ebx, eax
+ add r11d, edx
+ xor ebx, r13d
+ rorx edx, r15d, 6
+ rorx ecx, r15d, 11
+ add r11d, ebx
+ add r10d, DWORD PTR [rsp+500]
+ mov ebx, r8d
+ xor ecx, edx
+ xor ebx, r9d
+ rorx edx, r15d, 25
+ xor edx, ecx
+ and ebx, r15d
+ add r10d, edx
+ rorx edx, r11d, 2
+ rorx ecx, r11d, 13
+ xor ebx, r9d
+ xor ecx, edx
+ rorx edx, r11d, 22
+ add r10d, ebx
+ xor edx, ecx
+ mov ebx, r12d
+ lea r14d, DWORD PTR [r14+r10]
+ xor ebx, r11d
+ and eax, ebx
+ add r10d, edx
+ xor eax, r12d
+ rorx edx, r14d, 6
+ rorx ecx, r14d, 11
+ lea r10d, DWORD PTR [r10+rax]
+ add r9d, DWORD PTR [rsp+504]
+ mov eax, r15d
+ xor ecx, edx
+ xor eax, r8d
+ rorx edx, r14d, 25
+ xor edx, ecx
+ and eax, r14d
+ add r9d, edx
+ rorx edx, r10d, 2
+ rorx ecx, r10d, 13
+ xor eax, r8d
+ xor ecx, edx
+ rorx edx, r10d, 22
+ add r9d, eax
+ xor edx, ecx
+ mov eax, r11d
+ add r13d, r9d
+ xor eax, r10d
+ and ebx, eax
+ add r9d, edx
+ xor ebx, r11d
+ rorx edx, r13d, 6
+ rorx ecx, r13d, 11
+ add r9d, ebx
+ add r8d, DWORD PTR [rsp+508]
+ mov ebx, r14d
+ xor ecx, edx
+ xor ebx, r15d
+ rorx edx, r13d, 25
+ xor edx, ecx
+ and ebx, r13d
+ add r8d, edx
+ rorx edx, r9d, 2
+ rorx ecx, r9d, 13
+ xor ebx, r15d
+ xor ecx, edx
+ rorx edx, r9d, 22
+ add r8d, ebx
+ xor edx, ecx
+ mov ebx, r10d
+ lea r12d, DWORD PTR [r12+r8]
+ xor ebx, r9d
+ and eax, ebx
+ add r8d, edx
+ xor eax, r10d
+ add r8d, eax
+ add rsi, 128
+ add r8d, DWORD PTR [rdi]
+ add r9d, DWORD PTR [rdi+4]
+ add r10d, DWORD PTR [rdi+8]
+ add r11d, DWORD PTR [rdi+12]
+ add r12d, DWORD PTR [rdi+16]
+ add r13d, DWORD PTR [rdi+20]
+ add r14d, DWORD PTR [rdi+24]
+ add r15d, DWORD PTR [rdi+28]
+ sub DWORD PTR [rsp+512], 128
+ mov DWORD PTR [rdi], r8d
+ mov DWORD PTR [rdi+4], r9d
+ mov DWORD PTR [rdi+8], r10d
+ mov DWORD PTR [rdi+12], r11d
+ mov DWORD PTR [rdi+16], r12d
+ mov DWORD PTR [rdi+20], r13d
+ mov DWORD PTR [rdi+24], r14d
+ mov DWORD PTR [rdi+28], r15d
+ jnz L_sha256_len_avx2_rorx_start
+L_sha256_len_avx2_rorx_done:
+ xor rax, rax
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+512]
+ vmovdqu xmm7, OWORD PTR [rsp+528]
+ vmovdqu xmm8, OWORD PTR [rsp+544]
+ vmovdqu xmm9, OWORD PTR [rsp+560]
+ vmovdqu xmm10, OWORD PTR [rsp+576]
+ vmovdqu xmm11, OWORD PTR [rsp+592]
+ vmovdqu xmm12, OWORD PTR [rsp+608]
+ vmovdqu xmm13, OWORD PTR [rsp+624]
+ add rsp, 644
+ pop rbp
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha256_AVX2_RORX_Len ENDP
+_TEXT ENDS
+ENDIF
+ENDIF
+END
diff --git a/wolfcrypt/src/sha3_asm.asm b/wolfcrypt/src/sha3_asm.asm
new file mode 100644
index 00000000000..8f4db30ff57
--- /dev/null
+++ b/wolfcrypt/src/sha3_asm.asm
@@ -0,0 +1,31448 @@
+; /* sha3_asm.asm */
+; /*
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
+
+IF @Version LT 1200 ; assembler-version gate: only when @Version is below 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT ; keep any value the build already supplied
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT ; keep any value the build already supplied
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF ; end of the @Version gate
+
+IFNDEF HAVE_INTEL_AVX1 ; AVX1 is switched on by default when not already defined
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT ; AVX2 is switched on only if nothing above (or the build) disabled it
+HAVE_INTEL_AVX2 = 1 ; tested by the IFDEF HAVE_INTEL_AVX2 guard ahead of the code segment
+ENDIF
+
+IFNDEF _WIN64 ; define _WIN64 when the build did not already provide it
+_WIN64 = 1
+ENDIF
+
+_DATA SEGMENT ; table of the 24 Keccak-f[1600] round constants
+ALIGN 16 ; table starts on a 16-byte boundary
+L_sha3_avx2_r QWORD 0000000000000001h, 0000000000000001h ; round 0 - each constant is stored 4 times (4 x 64 bits = 32 bytes per round)
+ QWORD 0000000000000001h, 0000000000000001h
+ QWORD 0000000000008082h, 0000000000008082h ; round 1
+ QWORD 0000000000008082h, 0000000000008082h
+ QWORD 800000000000808ah, 800000000000808ah ; round 2
+ QWORD 800000000000808ah, 800000000000808ah
+ QWORD 8000000080008000h, 8000000080008000h ; round 3
+ QWORD 8000000080008000h, 8000000080008000h
+ QWORD 000000000000808bh, 000000000000808bh ; round 4
+ QWORD 000000000000808bh, 000000000000808bh
+ QWORD 0000000080000001h, 0000000080000001h ; round 5
+ QWORD 0000000080000001h, 0000000080000001h
+ QWORD 8000000080008081h, 8000000080008081h ; round 6
+ QWORD 8000000080008081h, 8000000080008081h
+ QWORD 8000000000008009h, 8000000000008009h ; round 7
+ QWORD 8000000000008009h, 8000000000008009h
+ QWORD 000000000000008ah, 000000000000008ah ; round 8
+ QWORD 000000000000008ah, 000000000000008ah
+ QWORD 0000000000000088h, 0000000000000088h ; round 9
+ QWORD 0000000000000088h, 0000000000000088h
+ QWORD 0000000080008009h, 0000000080008009h ; round 10
+ QWORD 0000000080008009h, 0000000080008009h
+ QWORD 000000008000000ah, 000000008000000ah ; round 11
+ QWORD 000000008000000ah, 000000008000000ah
+ QWORD 000000008000808bh, 000000008000808bh ; round 12
+ QWORD 000000008000808bh, 000000008000808bh
+ QWORD 800000000000008bh, 800000000000008bh ; round 13
+ QWORD 800000000000008bh, 800000000000008bh
+ QWORD 8000000000008089h, 8000000000008089h ; round 14
+ QWORD 8000000000008089h, 8000000000008089h
+ QWORD 8000000000008003h, 8000000000008003h ; round 15
+ QWORD 8000000000008003h, 8000000000008003h
+ QWORD 8000000000008002h, 8000000000008002h ; round 16
+ QWORD 8000000000008002h, 8000000000008002h
+ QWORD 8000000000000080h, 8000000000000080h ; round 17
+ QWORD 8000000000000080h, 8000000000000080h
+ QWORD 000000000000800ah, 000000000000800ah ; round 18
+ QWORD 000000000000800ah, 000000000000800ah
+ QWORD 800000008000000ah, 800000008000000ah ; round 19
+ QWORD 800000008000000ah, 800000008000000ah
+ QWORD 8000000080008081h, 8000000080008081h ; round 20
+ QWORD 8000000080008081h, 8000000080008081h
+ QWORD 8000000000008080h, 8000000000008080h ; round 21
+ QWORD 8000000000008080h, 8000000000008080h
+ QWORD 0000000080000001h, 0000000080000001h ; round 22
+ QWORD 0000000080000001h, 0000000080000001h
+ QWORD 8000000080008008h, 8000000080008008h ; round 23
+ QWORD 8000000080008008h, 8000000080008008h
+ptr_L_sha3_avx2_r QWORD L_sha3_avx2_r ; 64-bit slot holding the address of the table above
+_DATA ENDS
+_DATA SEGMENT ; second copy of the 24 Keccak-f[1600] round constants (same values as L_sha3_avx2_r)
+ALIGN 16 ; table starts on a 16-byte boundary
+L_sha3_x4_avx2_r QWORD 0000000000000001h, 0000000000000001h ; round 0 - each constant is stored 4 times (4 x 64 bits = 32 bytes per round)
+ QWORD 0000000000000001h, 0000000000000001h
+ QWORD 0000000000008082h, 0000000000008082h ; round 1
+ QWORD 0000000000008082h, 0000000000008082h
+ QWORD 800000000000808ah, 800000000000808ah ; round 2
+ QWORD 800000000000808ah, 800000000000808ah
+ QWORD 8000000080008000h, 8000000080008000h ; round 3
+ QWORD 8000000080008000h, 8000000080008000h
+ QWORD 000000000000808bh, 000000000000808bh ; round 4
+ QWORD 000000000000808bh, 000000000000808bh
+ QWORD 0000000080000001h, 0000000080000001h ; round 5
+ QWORD 0000000080000001h, 0000000080000001h
+ QWORD 8000000080008081h, 8000000080008081h ; round 6
+ QWORD 8000000080008081h, 8000000080008081h
+ QWORD 8000000000008009h, 8000000000008009h ; round 7
+ QWORD 8000000000008009h, 8000000000008009h
+ QWORD 000000000000008ah, 000000000000008ah ; round 8
+ QWORD 000000000000008ah, 000000000000008ah
+ QWORD 0000000000000088h, 0000000000000088h ; round 9
+ QWORD 0000000000000088h, 0000000000000088h
+ QWORD 0000000080008009h, 0000000080008009h ; round 10
+ QWORD 0000000080008009h, 0000000080008009h
+ QWORD 000000008000000ah, 000000008000000ah ; round 11
+ QWORD 000000008000000ah, 000000008000000ah
+ QWORD 000000008000808bh, 000000008000808bh ; round 12
+ QWORD 000000008000808bh, 000000008000808bh
+ QWORD 800000000000008bh, 800000000000008bh ; round 13
+ QWORD 800000000000008bh, 800000000000008bh
+ QWORD 8000000000008089h, 8000000000008089h ; round 14
+ QWORD 8000000000008089h, 8000000000008089h
+ QWORD 8000000000008003h, 8000000000008003h ; round 15
+ QWORD 8000000000008003h, 8000000000008003h
+ QWORD 8000000000008002h, 8000000000008002h ; round 16
+ QWORD 8000000000008002h, 8000000000008002h
+ QWORD 8000000000000080h, 8000000000000080h ; round 17
+ QWORD 8000000000000080h, 8000000000000080h
+ QWORD 000000000000800ah, 000000000000800ah ; round 18
+ QWORD 000000000000800ah, 000000000000800ah
+ QWORD 800000008000000ah, 800000008000000ah ; round 19
+ QWORD 800000008000000ah, 800000008000000ah
+ QWORD 8000000080008081h, 8000000080008081h ; round 20
+ QWORD 8000000080008081h, 8000000080008081h
+ QWORD 8000000000008080h, 8000000000008080h ; round 21
+ QWORD 8000000000008080h, 8000000000008080h
+ QWORD 0000000080000001h, 0000000080000001h ; round 22
+ QWORD 0000000080000001h, 0000000080000001h
+ QWORD 8000000080008008h, 8000000080008008h ; round 23
+ QWORD 8000000080008008h, 8000000080008008h
+ptr_L_sha3_x4_avx2_r QWORD L_sha3_x4_avx2_r ; 64-bit slot holding the address of the table above
+_DATA ENDS
+IFDEF HAVE_INTEL_AVX2
+_TEXT SEGMENT READONLY PARA
+sha3_block_bmi2 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ mov rsi, QWORD PTR [rcx]
+ add rcx, 96
+ ; Round 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-88]
+ mov r13, QWORD PTR [rcx+-80]
+ mov r14, QWORD PTR [rcx+-72]
+ mov r15, QWORD PTR [rcx+-64]
+ xor r11, QWORD PTR [rcx+-56]
+ xor r12, QWORD PTR [rcx+-48]
+ xor r13, QWORD PTR [rcx+-40]
+ xor r14, QWORD PTR [rcx+-32]
+ xor r15, QWORD PTR [rcx+-24]
+ xor r11, QWORD PTR [rcx+-16]
+ xor r12, QWORD PTR [rcx+-8]
+ xor r13, QWORD PTR [rcx]
+ xor r14, QWORD PTR [rcx+8]
+ xor r15, QWORD PTR [rcx+16]
+ xor r11, QWORD PTR [rcx+24]
+ xor r12, QWORD PTR [rcx+32]
+ xor r13, QWORD PTR [rcx+40]
+ xor r14, QWORD PTR [rcx+48]
+ xor r15, QWORD PTR [rcx+56]
+ xor r11, QWORD PTR [rcx+64]
+ xor r12, QWORD PTR [rcx+72]
+ xor r13, QWORD PTR [rcx+80]
+ xor r14, QWORD PTR [rcx+88]
+ xor r15, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-48]
+ mov r13, QWORD PTR [rcx]
+ mov r14, QWORD PTR [rcx+48]
+ mov r15, QWORD PTR [rcx+96]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-48], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+48], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+96], r14
+ ; XOR in constant
+ xor rsi, 1
+ ; Row 1
+ mov r11, QWORD PTR [rcx+-72]
+ mov r12, QWORD PTR [rcx+-24]
+ mov r13, QWORD PTR [rcx+-16]
+ mov r14, QWORD PTR [rcx+32]
+ mov r15, QWORD PTR [rcx+80]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-24], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-16], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+32], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+80], rdi
+ mov QWORD PTR [rcx+-72], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+-88]
+ mov r12, QWORD PTR [rcx+-40]
+ mov r13, QWORD PTR [rcx+8]
+ mov r14, QWORD PTR [rcx+56]
+ mov r15, QWORD PTR [rcx+64]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-40], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+8], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+56], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+64], rdi
+ mov QWORD PTR [rcx+-88], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+-64]
+ mov r12, QWORD PTR [rcx+-56]
+ mov r13, QWORD PTR [rcx+-8]
+ mov r14, QWORD PTR [rcx+40]
+ mov r15, QWORD PTR [rcx+88]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-56], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-8], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+40], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+88], rdi
+ mov QWORD PTR [rcx+-64], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+-80]
+ xor r9, QWORD PTR [rcx+-32]
+ xor r10, QWORD PTR [rcx+16]
+ xor rdx, QWORD PTR [rcx+24]
+ xor rax, QWORD PTR [rcx+72]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+-80], r11
+ mov QWORD PTR [rcx+-32], r12
+ mov QWORD PTR [rcx+16], r13
+ mov QWORD PTR [rcx+24], r14
+ mov QWORD PTR [rcx+72], r15
+ ; Round 1
+ xor r11, rsi
+ xor r11, QWORD PTR [rcx+-88]
+ xor r11, QWORD PTR [rcx+-72]
+ xor r11, QWORD PTR [rcx+-64]
+ xor r12, QWORD PTR [rcx+-56]
+ xor r12, QWORD PTR [rcx+-48]
+ xor r12, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-24]
+ xor r13, QWORD PTR [rcx+-16]
+ xor r13, QWORD PTR [rcx+-8]
+ xor r13, QWORD PTR [rcx]
+ xor r13, QWORD PTR [rcx+8]
+ xor r14, QWORD PTR [rcx+32]
+ xor r14, QWORD PTR [rcx+40]
+ xor r14, QWORD PTR [rcx+48]
+ xor r14, QWORD PTR [rcx+56]
+ xor r15, QWORD PTR [rcx+64]
+ xor r15, QWORD PTR [rcx+80]
+ xor r15, QWORD PTR [rcx+88]
+ xor r15, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-24]
+ mov r13, QWORD PTR [rcx+8]
+ mov r14, QWORD PTR [rcx+40]
+ mov r15, QWORD PTR [rcx+72]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-24], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+8], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+40], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+72], r14
+ ; XOR in constant
+ xor rsi, 32898
+ ; Row 1
+ mov r11, QWORD PTR [rcx+48]
+ mov r12, QWORD PTR [rcx+80]
+ mov r13, QWORD PTR [rcx+-88]
+ mov r14, QWORD PTR [rcx+-56]
+ mov r15, QWORD PTR [rcx+16]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+80], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-88], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-56], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+16], rdi
+ mov QWORD PTR [rcx+48], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+-48]
+ mov r12, QWORD PTR [rcx+-16]
+ mov r13, QWORD PTR [rcx+56]
+ mov r14, QWORD PTR [rcx+88]
+ mov r15, QWORD PTR [rcx+-80]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-16], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+56], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+88], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-80], rdi
+ mov QWORD PTR [rcx+-48], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+96]
+ mov r12, QWORD PTR [rcx+-72]
+ mov r13, QWORD PTR [rcx+-40]
+ mov r14, QWORD PTR [rcx+-8]
+ mov r15, QWORD PTR [rcx+24]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-72], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-40], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-8], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+24], rdi
+ mov QWORD PTR [rcx+96], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx]
+ xor r9, QWORD PTR [rcx+32]
+ xor r10, QWORD PTR [rcx+64]
+ xor rdx, QWORD PTR [rcx+-64]
+ xor rax, QWORD PTR [rcx+-32]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+32], r12
+ mov QWORD PTR [rcx+64], r13
+ mov QWORD PTR [rcx+-64], r14
+ mov QWORD PTR [rcx+-32], r15
+ ; Round 2
+ xor r11, rsi
+ xor r13, QWORD PTR [rcx+-88]
+ xor r15, QWORD PTR [rcx+-80]
+ xor r12, QWORD PTR [rcx+-72]
+ xor r14, QWORD PTR [rcx+-56]
+ xor r11, QWORD PTR [rcx+-48]
+ xor r13, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-24]
+ xor r12, QWORD PTR [rcx+-16]
+ xor r14, QWORD PTR [rcx+-8]
+ xor r13, QWORD PTR [rcx+8]
+ xor r15, QWORD PTR [rcx+16]
+ xor r15, QWORD PTR [rcx+24]
+ xor r14, QWORD PTR [rcx+40]
+ xor r11, QWORD PTR [rcx+48]
+ xor r13, QWORD PTR [rcx+56]
+ xor r15, QWORD PTR [rcx+72]
+ xor r12, QWORD PTR [rcx+80]
+ xor r14, QWORD PTR [rcx+88]
+ xor r11, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+80]
+ mov r13, QWORD PTR [rcx+56]
+ mov r14, QWORD PTR [rcx+-8]
+ mov r15, QWORD PTR [rcx+-32]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+80], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+56], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-8], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+-32], r14
+ ; XOR in constant
+ mov r15, 9223372036854808714
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+40]
+ mov r12, QWORD PTR [rcx+16]
+ mov r13, QWORD PTR [rcx+-48]
+ mov r14, QWORD PTR [rcx+-72]
+ mov r15, QWORD PTR [rcx+64]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+16], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-48], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-72], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+64], rdi
+ mov QWORD PTR [rcx+40], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+-24]
+ mov r12, QWORD PTR [rcx+-88]
+ mov r13, QWORD PTR [rcx+88]
+ mov r14, QWORD PTR [rcx+24]
+ mov r15, QWORD PTR [rcx]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-88], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+88], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+24], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+-24], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+72]
+ mov r12, QWORD PTR [rcx+48]
+ mov r13, QWORD PTR [rcx+-16]
+ mov r14, QWORD PTR [rcx+-40]
+ mov r15, QWORD PTR [rcx+-64]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+48], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-16], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-40], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-64], rdi
+ mov QWORD PTR [rcx+72], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+8]
+ xor r9, QWORD PTR [rcx+-56]
+ xor r10, QWORD PTR [rcx+-80]
+ xor rdx, QWORD PTR [rcx+96]
+ xor rax, QWORD PTR [rcx+32]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+8], r11
+ mov QWORD PTR [rcx+-56], r12
+ mov QWORD PTR [rcx+-80], r13
+ mov QWORD PTR [rcx+96], r14
+ mov QWORD PTR [rcx+32], r15
+ ; Round 3
+ xor r11, rsi
+ xor r12, QWORD PTR [rcx+-88]
+ xor r14, QWORD PTR [rcx+-72]
+ xor r15, QWORD PTR [rcx+-64]
+ xor r13, QWORD PTR [rcx+-48]
+ xor r14, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-32]
+ xor r11, QWORD PTR [rcx+-24]
+ xor r13, QWORD PTR [rcx+-16]
+ xor r14, QWORD PTR [rcx+-8]
+ xor r15, QWORD PTR [rcx]
+ xor r12, QWORD PTR [rcx+16]
+ xor r14, QWORD PTR [rcx+24]
+ xor r11, QWORD PTR [rcx+40]
+ xor r12, QWORD PTR [rcx+48]
+ xor r13, QWORD PTR [rcx+56]
+ xor r15, QWORD PTR [rcx+64]
+ xor r11, QWORD PTR [rcx+72]
+ xor r12, QWORD PTR [rcx+80]
+ xor r13, QWORD PTR [rcx+88]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+16]
+ mov r13, QWORD PTR [rcx+88]
+ mov r14, QWORD PTR [rcx+-40]
+ mov r15, QWORD PTR [rcx+32]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+16], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+88], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-40], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+32], r14
+ ; XOR in constant
+ mov r15, 9223372039002292224
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+-8]
+ mov r12, QWORD PTR [rcx+64]
+ mov r13, QWORD PTR [rcx+-24]
+ mov r14, QWORD PTR [rcx+48]
+ mov r15, QWORD PTR [rcx+-80]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+64], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-24], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+48], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-80], rdi
+ mov QWORD PTR [rcx+-8], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+80]
+ mov r12, QWORD PTR [rcx+-48]
+ mov r13, QWORD PTR [rcx+24]
+ mov r14, QWORD PTR [rcx+-64]
+ mov r15, QWORD PTR [rcx+8]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-48], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+24], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-64], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+8], rdi
+ mov QWORD PTR [rcx+80], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+-32]
+ mov r12, QWORD PTR [rcx+40]
+ mov r13, QWORD PTR [rcx+-88]
+ mov r14, QWORD PTR [rcx+-16]
+ mov r15, QWORD PTR [rcx+96]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+40], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-88], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-16], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+96], rdi
+ mov QWORD PTR [rcx+-32], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+56]
+ xor r9, QWORD PTR [rcx+-72]
+ xor r10, QWORD PTR [rcx]
+ xor rdx, QWORD PTR [rcx+72]
+ xor rax, QWORD PTR [rcx+-56]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+56], r11
+ mov QWORD PTR [rcx+-72], r12
+ mov QWORD PTR [rcx], r13
+ mov QWORD PTR [rcx+72], r14
+ mov QWORD PTR [rcx+-56], r15
+ ; Round 4
+ xor r11, rsi
+ xor r13, QWORD PTR [rcx+-88]
+ xor r15, QWORD PTR [rcx+-80]
+ xor r14, QWORD PTR [rcx+-64]
+ xor r12, QWORD PTR [rcx+-48]
+ xor r14, QWORD PTR [rcx+-40]
+ xor r11, QWORD PTR [rcx+-32]
+ xor r13, QWORD PTR [rcx+-24]
+ xor r14, QWORD PTR [rcx+-16]
+ xor r11, QWORD PTR [rcx+-8]
+ xor r15, QWORD PTR [rcx+8]
+ xor r12, QWORD PTR [rcx+16]
+ xor r13, QWORD PTR [rcx+24]
+ xor r15, QWORD PTR [rcx+32]
+ xor r12, QWORD PTR [rcx+40]
+ xor r14, QWORD PTR [rcx+48]
+ xor r12, QWORD PTR [rcx+64]
+ xor r11, QWORD PTR [rcx+80]
+ xor r13, QWORD PTR [rcx+88]
+ xor r15, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+64]
+ mov r13, QWORD PTR [rcx+24]
+ mov r14, QWORD PTR [rcx+-16]
+ mov r15, QWORD PTR [rcx+-56]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+64], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+24], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-16], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+-56], r14
+ ; XOR in constant
+ xor rsi, 32907
+ ; Row 1
+ mov r11, QWORD PTR [rcx+-40]
+ mov r12, QWORD PTR [rcx+-80]
+ mov r13, QWORD PTR [rcx+80]
+ mov r14, QWORD PTR [rcx+40]
+ mov r15, QWORD PTR [rcx]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-80], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+80], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+40], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+-40], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+16]
+ mov r12, QWORD PTR [rcx+-24]
+ mov r13, QWORD PTR [rcx+-64]
+ mov r14, QWORD PTR [rcx+96]
+ mov r15, QWORD PTR [rcx+56]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-24], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-64], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+96], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+56], rdi
+ mov QWORD PTR [rcx+16], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+32]
+ mov r12, QWORD PTR [rcx+-8]
+ mov r13, QWORD PTR [rcx+-48]
+ mov r14, QWORD PTR [rcx+-88]
+ mov r15, QWORD PTR [rcx+72]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-8], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-48], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-88], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+72], rdi
+ mov QWORD PTR [rcx+32], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+88]
+ xor r9, QWORD PTR [rcx+48]
+ xor r10, QWORD PTR [rcx+8]
+ xor rdx, QWORD PTR [rcx+-32]
+ xor rax, QWORD PTR [rcx+-72]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+88], r11
+ mov QWORD PTR [rcx+48], r12
+ mov QWORD PTR [rcx+8], r13
+ mov QWORD PTR [rcx+-32], r14
+ mov QWORD PTR [rcx+-72], r15
+ ; Round 5
+ xor r11, rsi
+ xor r14, QWORD PTR [rcx+-88]
+ xor r12, QWORD PTR [rcx+-80]
+ xor r13, QWORD PTR [rcx+-64]
+ xor r15, QWORD PTR [rcx+-56]
+ xor r13, QWORD PTR [rcx+-48]
+ xor r11, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-24]
+ xor r14, QWORD PTR [rcx+-16]
+ xor r12, QWORD PTR [rcx+-8]
+ xor r15, QWORD PTR [rcx]
+ xor r11, QWORD PTR [rcx+16]
+ xor r13, QWORD PTR [rcx+24]
+ xor r11, QWORD PTR [rcx+32]
+ xor r14, QWORD PTR [rcx+40]
+ xor r15, QWORD PTR [rcx+56]
+ xor r12, QWORD PTR [rcx+64]
+ xor r15, QWORD PTR [rcx+72]
+ xor r13, QWORD PTR [rcx+80]
+ xor r14, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-80]
+ mov r13, QWORD PTR [rcx+-64]
+ mov r14, QWORD PTR [rcx+-88]
+ mov r15, QWORD PTR [rcx+-72]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-80], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-64], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-88], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+-72], r14
+ ; XOR in constant
+ mov r15, 2147483649
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+-16]
+ mov r12, QWORD PTR [rcx]
+ mov r13, QWORD PTR [rcx+16]
+ mov r14, QWORD PTR [rcx+-8]
+ mov r15, QWORD PTR [rcx+8]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+16], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-8], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+8], rdi
+ mov QWORD PTR [rcx+-16], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+64]
+ mov r12, QWORD PTR [rcx+80]
+ mov r13, QWORD PTR [rcx+96]
+ mov r14, QWORD PTR [rcx+72]
+ mov r15, QWORD PTR [rcx+88]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+80], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+96], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+72], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+88], rdi
+ mov QWORD PTR [rcx+64], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+-56]
+ mov r12, QWORD PTR [rcx+-40]
+ mov r13, QWORD PTR [rcx+-24]
+ mov r14, QWORD PTR [rcx+-48]
+ mov r15, QWORD PTR [rcx+-32]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-40], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-24], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-48], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-32], rdi
+ mov QWORD PTR [rcx+-56], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+24]
+ xor r9, QWORD PTR [rcx+40]
+ xor r10, QWORD PTR [rcx+56]
+ xor rdx, QWORD PTR [rcx+32]
+ xor rax, QWORD PTR [rcx+48]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+24], r11
+ mov QWORD PTR [rcx+40], r12
+ mov QWORD PTR [rcx+56], r13
+ mov QWORD PTR [rcx+32], r14
+ mov QWORD PTR [rcx+48], r15
+ ; Round 6
+ xor r11, rsi
+ xor r14, QWORD PTR [rcx+-88]
+ xor r12, QWORD PTR [rcx+-80]
+ xor r15, QWORD PTR [rcx+-72]
+ xor r13, QWORD PTR [rcx+-64]
+ xor r11, QWORD PTR [rcx+-56]
+ xor r14, QWORD PTR [rcx+-48]
+ xor r12, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-32]
+ xor r13, QWORD PTR [rcx+-24]
+ xor r11, QWORD PTR [rcx+-16]
+ xor r14, QWORD PTR [rcx+-8]
+ xor r12, QWORD PTR [rcx]
+ xor r15, QWORD PTR [rcx+8]
+ xor r13, QWORD PTR [rcx+16]
+ xor r11, QWORD PTR [rcx+64]
+ xor r14, QWORD PTR [rcx+72]
+ xor r12, QWORD PTR [rcx+80]
+ xor r15, QWORD PTR [rcx+88]
+ xor r13, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx]
+ mov r13, QWORD PTR [rcx+96]
+ mov r14, QWORD PTR [rcx+-48]
+ mov r15, QWORD PTR [rcx+48]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+96], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-48], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+48], r14
+ ; XOR in constant
+ mov r15, 9223372039002292353
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+-88]
+ mov r12, QWORD PTR [rcx+8]
+ mov r13, QWORD PTR [rcx+64]
+ mov r14, QWORD PTR [rcx+-40]
+ mov r15, QWORD PTR [rcx+56]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+8], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+64], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-40], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+56], rdi
+ mov QWORD PTR [rcx+-88], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+-80]
+ mov r12, QWORD PTR [rcx+16]
+ mov r13, QWORD PTR [rcx+72]
+ mov r14, QWORD PTR [rcx+-32]
+ mov r15, QWORD PTR [rcx+24]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+16], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+72], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-32], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+24], rdi
+ mov QWORD PTR [rcx+-80], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+-72]
+ mov r12, QWORD PTR [rcx+-16]
+ mov r13, QWORD PTR [rcx+80]
+ mov r14, QWORD PTR [rcx+-24]
+ mov r15, QWORD PTR [rcx+32]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-16], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+80], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-24], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+32], rdi
+ mov QWORD PTR [rcx+-72], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+-64]
+ xor r9, QWORD PTR [rcx+-8]
+ xor r10, QWORD PTR [rcx+88]
+ xor rdx, QWORD PTR [rcx+-56]
+ xor rax, QWORD PTR [rcx+40]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+-64], r11
+ mov QWORD PTR [rcx+-8], r12
+ mov QWORD PTR [rcx+88], r13
+ mov QWORD PTR [rcx+-56], r14
+ mov QWORD PTR [rcx+40], r15
+ ; Round 7
+ xor r11, rsi
+ xor r11, QWORD PTR [rcx+-88]
+ xor r11, QWORD PTR [rcx+-80]
+ xor r11, QWORD PTR [rcx+-72]
+ xor r14, QWORD PTR [rcx+-48]
+ xor r14, QWORD PTR [rcx+-40]
+ xor r14, QWORD PTR [rcx+-32]
+ xor r14, QWORD PTR [rcx+-24]
+ xor r12, QWORD PTR [rcx+-16]
+ xor r12, QWORD PTR [rcx]
+ xor r12, QWORD PTR [rcx+8]
+ xor r12, QWORD PTR [rcx+16]
+ xor r15, QWORD PTR [rcx+24]
+ xor r15, QWORD PTR [rcx+32]
+ xor r15, QWORD PTR [rcx+48]
+ xor r15, QWORD PTR [rcx+56]
+ xor r13, QWORD PTR [rcx+64]
+ xor r13, QWORD PTR [rcx+72]
+ xor r13, QWORD PTR [rcx+80]
+ xor r13, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+8]
+ mov r13, QWORD PTR [rcx+72]
+ mov r14, QWORD PTR [rcx+-24]
+ mov r15, QWORD PTR [rcx+40]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+8], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+72], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-24], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+40], r14
+ ; XOR in constant
+ mov r15, 9223372036854808585
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+-48]
+ mov r12, QWORD PTR [rcx+56]
+ mov r13, QWORD PTR [rcx+-80]
+ mov r14, QWORD PTR [rcx+-16]
+ mov r15, QWORD PTR [rcx+88]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+56], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-80], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-16], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+88], rdi
+ mov QWORD PTR [rcx+-48], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx]
+ mov r12, QWORD PTR [rcx+64]
+ mov r13, QWORD PTR [rcx+-32]
+ mov r14, QWORD PTR [rcx+32]
+ mov r15, QWORD PTR [rcx+-64]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+64], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-32], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+32], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-64], rdi
+ mov QWORD PTR [rcx], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+48]
+ mov r12, QWORD PTR [rcx+-88]
+ mov r13, QWORD PTR [rcx+16]
+ mov r14, QWORD PTR [rcx+80]
+ mov r15, QWORD PTR [rcx+-56]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-88], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+16], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+80], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-56], rdi
+ mov QWORD PTR [rcx+48], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+96]
+ xor r9, QWORD PTR [rcx+-40]
+ xor r10, QWORD PTR [rcx+24]
+ xor rdx, QWORD PTR [rcx+-72]
+ xor rax, QWORD PTR [rcx+-8]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+96], r11
+ mov QWORD PTR [rcx+-40], r12
+ mov QWORD PTR [rcx+24], r13
+ mov QWORD PTR [rcx+-72], r14
+ mov QWORD PTR [rcx+-8], r15
+ ; Round 8
+ xor r11, rsi
+ xor r12, QWORD PTR [rcx+-88]
+ xor r13, QWORD PTR [rcx+-80]
+ xor r15, QWORD PTR [rcx+-64]
+ xor r15, QWORD PTR [rcx+-56]
+ xor r11, QWORD PTR [rcx+-48]
+ xor r13, QWORD PTR [rcx+-32]
+ xor r14, QWORD PTR [rcx+-24]
+ xor r14, QWORD PTR [rcx+-16]
+ xor r11, QWORD PTR [rcx]
+ xor r12, QWORD PTR [rcx+8]
+ xor r13, QWORD PTR [rcx+16]
+ xor r14, QWORD PTR [rcx+32]
+ xor r15, QWORD PTR [rcx+40]
+ xor r11, QWORD PTR [rcx+48]
+ xor r12, QWORD PTR [rcx+56]
+ xor r12, QWORD PTR [rcx+64]
+ xor r13, QWORD PTR [rcx+72]
+ xor r14, QWORD PTR [rcx+80]
+ xor r15, QWORD PTR [rcx+88]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+56]
+ mov r13, QWORD PTR [rcx+-32]
+ mov r14, QWORD PTR [rcx+80]
+ mov r15, QWORD PTR [rcx+-8]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+56], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-32], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+80], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+-8], r14
+ ; XOR in constant
+ xor rsi, 138
+ ; Row 1
+ mov r11, QWORD PTR [rcx+-24]
+ mov r12, QWORD PTR [rcx+88]
+ mov r13, QWORD PTR [rcx]
+ mov r14, QWORD PTR [rcx+-88]
+ mov r15, QWORD PTR [rcx+24]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+88], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-88], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+24], rdi
+ mov QWORD PTR [rcx+-24], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+8]
+ mov r12, QWORD PTR [rcx+-80]
+ mov r13, QWORD PTR [rcx+32]
+ mov r14, QWORD PTR [rcx+-56]
+ mov r15, QWORD PTR [rcx+96]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-80], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+32], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-56], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+96], rdi
+ mov QWORD PTR [rcx+8], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+40]
+ mov r12, QWORD PTR [rcx+-48]
+ mov r13, QWORD PTR [rcx+64]
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+-72]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-48], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+64], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+16], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-72], rdi
+ mov QWORD PTR [rcx+40], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+72]
+ xor r9, QWORD PTR [rcx+-16]
+ xor r10, QWORD PTR [rcx+-64]
+ xor rdx, QWORD PTR [rcx+48]
+ xor rax, QWORD PTR [rcx+-40]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+72], r11
+ mov QWORD PTR [rcx+-16], r12
+ mov QWORD PTR [rcx+-64], r13
+ mov QWORD PTR [rcx+48], r14
+ mov QWORD PTR [rcx+-40], r15
+ ; Round 9
+ xor r11, rsi
+ xor r14, QWORD PTR [rcx+-88]
+ xor r12, QWORD PTR [rcx+-80]
+ xor r15, QWORD PTR [rcx+-72]
+ xor r14, QWORD PTR [rcx+-56]
+ xor r12, QWORD PTR [rcx+-48]
+ xor r13, QWORD PTR [rcx+-32]
+ xor r11, QWORD PTR [rcx+-24]
+ xor r15, QWORD PTR [rcx+-8]
+ xor r13, QWORD PTR [rcx]
+ xor r11, QWORD PTR [rcx+8]
+ xor r14, QWORD PTR [rcx+16]
+ xor r15, QWORD PTR [rcx+24]
+ xor r13, QWORD PTR [rcx+32]
+ xor r11, QWORD PTR [rcx+40]
+ xor r12, QWORD PTR [rcx+56]
+ xor r13, QWORD PTR [rcx+64]
+ xor r14, QWORD PTR [rcx+80]
+ xor r12, QWORD PTR [rcx+88]
+ xor r15, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+88]
+ mov r13, QWORD PTR [rcx+32]
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+-40]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+88], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+32], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+16], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+-40], r14
+ ; XOR in constant
+ xor rsi, 136
+ ; Row 1
+ mov r11, QWORD PTR [rcx+80]
+ mov r12, QWORD PTR [rcx+24]
+ mov r13, QWORD PTR [rcx+8]
+ mov r14, QWORD PTR [rcx+-48]
+ mov r15, QWORD PTR [rcx+-64]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+24], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+8], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-48], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-64], rdi
+ mov QWORD PTR [rcx+80], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+56]
+ mov r12, QWORD PTR [rcx]
+ mov r13, QWORD PTR [rcx+-56]
+ mov r14, QWORD PTR [rcx+-72]
+ mov r15, QWORD PTR [rcx+72]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-56], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-72], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+72], rdi
+ mov QWORD PTR [rcx+56], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+-8]
+ mov r12, QWORD PTR [rcx+-24]
+ mov r13, QWORD PTR [rcx+-80]
+ mov r14, QWORD PTR [rcx+64]
+ mov r15, QWORD PTR [rcx+48]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-24], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-80], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+64], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+48], rdi
+ mov QWORD PTR [rcx+-8], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+-32]
+ xor r9, QWORD PTR [rcx+-88]
+ xor r10, QWORD PTR [rcx+96]
+ xor rdx, QWORD PTR [rcx+40]
+ xor rax, QWORD PTR [rcx+-16]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+-32], r11
+ mov QWORD PTR [rcx+-88], r12
+ mov QWORD PTR [rcx+96], r13
+ mov QWORD PTR [rcx+40], r14
+ mov QWORD PTR [rcx+-16], r15
+ ; Round 10
+ xor r11, rsi
+ xor r13, QWORD PTR [rcx+-80]
+ xor r14, QWORD PTR [rcx+-72]
+ xor r15, QWORD PTR [rcx+-64]
+ xor r13, QWORD PTR [rcx+-56]
+ xor r14, QWORD PTR [rcx+-48]
+ xor r15, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-24]
+ xor r11, QWORD PTR [rcx+-8]
+ xor r12, QWORD PTR [rcx]
+ xor r13, QWORD PTR [rcx+8]
+ xor r14, QWORD PTR [rcx+16]
+ xor r12, QWORD PTR [rcx+24]
+ xor r13, QWORD PTR [rcx+32]
+ xor r15, QWORD PTR [rcx+48]
+ xor r11, QWORD PTR [rcx+56]
+ xor r14, QWORD PTR [rcx+64]
+ xor r15, QWORD PTR [rcx+72]
+ xor r11, QWORD PTR [rcx+80]
+ xor r12, QWORD PTR [rcx+88]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+24]
+ mov r13, QWORD PTR [rcx+-56]
+ mov r14, QWORD PTR [rcx+64]
+ mov r15, QWORD PTR [rcx+-16]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+24], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-56], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+64], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+-16], r14
+ ; XOR in constant
+ mov r15, 2147516425
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+16]
+ mov r12, QWORD PTR [rcx+-64]
+ mov r13, QWORD PTR [rcx+56]
+ mov r14, QWORD PTR [rcx+-24]
+ mov r15, QWORD PTR [rcx+96]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-64], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+56], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-24], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+96], rdi
+ mov QWORD PTR [rcx+16], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+88]
+ mov r12, QWORD PTR [rcx+8]
+ mov r13, QWORD PTR [rcx+-72]
+ mov r14, QWORD PTR [rcx+48]
+ mov r15, QWORD PTR [rcx+-32]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+8], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-72], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+48], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-32], rdi
+ mov QWORD PTR [rcx+88], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+-40]
+ mov r12, QWORD PTR [rcx+80]
+ mov r13, QWORD PTR [rcx]
+ mov r14, QWORD PTR [rcx+-80]
+ mov r15, QWORD PTR [rcx+40]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+80], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-80], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+40], rdi
+ mov QWORD PTR [rcx+-40], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+32]
+ xor r9, QWORD PTR [rcx+-48]
+ xor r10, QWORD PTR [rcx+72]
+ xor rdx, QWORD PTR [rcx+-8]
+ xor rax, QWORD PTR [rcx+-88]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+32], r11
+ mov QWORD PTR [rcx+-48], r12
+ mov QWORD PTR [rcx+72], r13
+ mov QWORD PTR [rcx+-8], r14
+ mov QWORD PTR [rcx+-88], r15
+ ; Round 11
+ xor r11, rsi
+ xor r14, QWORD PTR [rcx+-80]
+ xor r13, QWORD PTR [rcx+-72]
+ xor r12, QWORD PTR [rcx+-64]
+ xor r13, QWORD PTR [rcx+-56]
+ xor r11, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-32]
+ xor r14, QWORD PTR [rcx+-24]
+ xor r15, QWORD PTR [rcx+-16]
+ xor r13, QWORD PTR [rcx]
+ xor r12, QWORD PTR [rcx+8]
+ xor r11, QWORD PTR [rcx+16]
+ xor r12, QWORD PTR [rcx+24]
+ xor r15, QWORD PTR [rcx+40]
+ xor r14, QWORD PTR [rcx+48]
+ xor r13, QWORD PTR [rcx+56]
+ xor r14, QWORD PTR [rcx+64]
+ xor r12, QWORD PTR [rcx+80]
+ xor r11, QWORD PTR [rcx+88]
+ xor r15, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-64]
+ mov r13, QWORD PTR [rcx+-72]
+ mov r14, QWORD PTR [rcx+-80]
+ mov r15, QWORD PTR [rcx+-88]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-64], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-72], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-80], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+-88], r14
+ ; XOR in constant
+ mov r15, 2147483658
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+64]
+ mov r12, QWORD PTR [rcx+96]
+ mov r13, QWORD PTR [rcx+88]
+ mov r14, QWORD PTR [rcx+80]
+ mov r15, QWORD PTR [rcx+72]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+96], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+88], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+80], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+72], rdi
+ mov QWORD PTR [rcx+64], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+24]
+ mov r12, QWORD PTR [rcx+56]
+ mov r13, QWORD PTR [rcx+48]
+ mov r14, QWORD PTR [rcx+40]
+ mov r15, QWORD PTR [rcx+32]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+56], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+48], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+40], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+32], rdi
+ mov QWORD PTR [rcx+24], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+-16]
+ mov r12, QWORD PTR [rcx+16]
+ mov r13, QWORD PTR [rcx+8]
+ mov r14, QWORD PTR [rcx]
+ mov r15, QWORD PTR [rcx+-8]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+16], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+8], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-8], rdi
+ mov QWORD PTR [rcx+-16], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+-56]
+ xor r9, QWORD PTR [rcx+-24]
+ xor r10, QWORD PTR [rcx+-32]
+ xor rdx, QWORD PTR [rcx+-40]
+ xor rax, QWORD PTR [rcx+-48]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+-56], r11
+ mov QWORD PTR [rcx+-24], r12
+ mov QWORD PTR [rcx+-32], r13
+ mov QWORD PTR [rcx+-40], r14
+ mov QWORD PTR [rcx+-48], r15
+ ; Round 12
+ xor r11, rsi
+ xor r15, QWORD PTR [rcx+-88]
+ xor r14, QWORD PTR [rcx+-80]
+ xor r13, QWORD PTR [rcx+-72]
+ xor r12, QWORD PTR [rcx+-64]
+ xor r11, QWORD PTR [rcx+-16]
+ xor r15, QWORD PTR [rcx+-8]
+ xor r14, QWORD PTR [rcx]
+ xor r13, QWORD PTR [rcx+8]
+ xor r12, QWORD PTR [rcx+16]
+ xor r11, QWORD PTR [rcx+24]
+ xor r15, QWORD PTR [rcx+32]
+ xor r14, QWORD PTR [rcx+40]
+ xor r13, QWORD PTR [rcx+48]
+ xor r12, QWORD PTR [rcx+56]
+ xor r11, QWORD PTR [rcx+64]
+ xor r15, QWORD PTR [rcx+72]
+ xor r14, QWORD PTR [rcx+80]
+ xor r13, QWORD PTR [rcx+88]
+ xor r12, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+96]
+ mov r13, QWORD PTR [rcx+48]
+ mov r14, QWORD PTR [rcx]
+ mov r15, QWORD PTR [rcx+-48]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+96], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+48], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+-48], r14
+ ; XOR in constant
+ mov r15, 2147516555
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+-80]
+ mov r12, QWORD PTR [rcx+72]
+ mov r13, QWORD PTR [rcx+24]
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+-32]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+72], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+24], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+16], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-32], rdi
+ mov QWORD PTR [rcx+-80], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+-64]
+ mov r12, QWORD PTR [rcx+88]
+ mov r13, QWORD PTR [rcx+40]
+ mov r14, QWORD PTR [rcx+-8]
+ mov r15, QWORD PTR [rcx+-56]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+88], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+40], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-8], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-56], rdi
+ mov QWORD PTR [rcx+-64], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+-88]
+ mov r12, QWORD PTR [rcx+64]
+ mov r13, QWORD PTR [rcx+56]
+ mov r14, QWORD PTR [rcx+8]
+ mov r15, QWORD PTR [rcx+-40]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+64], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+56], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+8], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-40], rdi
+ mov QWORD PTR [rcx+-88], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+-72]
+ xor r9, QWORD PTR [rcx+80]
+ xor r10, QWORD PTR [rcx+32]
+ xor rdx, QWORD PTR [rcx+-16]
+ xor rax, QWORD PTR [rcx+-24]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+-72], r11
+ mov QWORD PTR [rcx+80], r12
+ mov QWORD PTR [rcx+32], r13
+ mov QWORD PTR [rcx+-16], r14
+ mov QWORD PTR [rcx+-24], r15
+ ; Round 13
+ xor r11, rsi
+ xor r11, QWORD PTR [rcx+-88]
+ xor r11, QWORD PTR [rcx+-80]
+ xor r11, QWORD PTR [rcx+-64]
+ xor r15, QWORD PTR [rcx+-56]
+ xor r15, QWORD PTR [rcx+-48]
+ xor r15, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-32]
+ xor r14, QWORD PTR [rcx+-8]
+ xor r14, QWORD PTR [rcx]
+ xor r14, QWORD PTR [rcx+8]
+ xor r14, QWORD PTR [rcx+16]
+ xor r13, QWORD PTR [rcx+24]
+ xor r13, QWORD PTR [rcx+40]
+ xor r13, QWORD PTR [rcx+48]
+ xor r13, QWORD PTR [rcx+56]
+ xor r12, QWORD PTR [rcx+64]
+ xor r12, QWORD PTR [rcx+72]
+ xor r12, QWORD PTR [rcx+88]
+ xor r12, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+72]
+ mov r13, QWORD PTR [rcx+40]
+ mov r14, QWORD PTR [rcx+8]
+ mov r15, QWORD PTR [rcx+-24]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+72], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+40], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+8], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+-24], r14
+ ; XOR in constant
+ mov r15, 9223372036854775947
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx]
+ mov r12, QWORD PTR [rcx+-32]
+ mov r13, QWORD PTR [rcx+-64]
+ mov r14, QWORD PTR [rcx+64]
+ mov r15, QWORD PTR [rcx+32]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-32], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-64], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+64], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+32], rdi
+ mov QWORD PTR [rcx], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+96]
+ mov r12, QWORD PTR [rcx+24]
+ mov r13, QWORD PTR [rcx+-8]
+ mov r14, QWORD PTR [rcx+-40]
+ mov r15, QWORD PTR [rcx+-72]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+24], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-8], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-40], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-72], rdi
+ mov QWORD PTR [rcx+96], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+-48]
+ mov r12, QWORD PTR [rcx+-80]
+ mov r13, QWORD PTR [rcx+88]
+ mov r14, QWORD PTR [rcx+56]
+ mov r15, QWORD PTR [rcx+-16]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-80], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+88], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+56], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-16], rdi
+ mov QWORD PTR [rcx+-48], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+48]
+ xor r9, QWORD PTR [rcx+16]
+ xor r10, QWORD PTR [rcx+-56]
+ xor rdx, QWORD PTR [rcx+-88]
+ xor rax, QWORD PTR [rcx+80]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+48], r11
+ mov QWORD PTR [rcx+16], r12
+ mov QWORD PTR [rcx+-56], r13
+ mov QWORD PTR [rcx+-88], r14
+ mov QWORD PTR [rcx+80], r15
+ ; Round 14
+ xor r11, rsi
+ xor r12, QWORD PTR [rcx+-80]
+ xor r15, QWORD PTR [rcx+-72]
+ xor r13, QWORD PTR [rcx+-64]
+ xor r11, QWORD PTR [rcx+-48]
+ xor r14, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-32]
+ xor r15, QWORD PTR [rcx+-24]
+ xor r15, QWORD PTR [rcx+-16]
+ xor r13, QWORD PTR [rcx+-8]
+ xor r11, QWORD PTR [rcx]
+ xor r14, QWORD PTR [rcx+8]
+ xor r12, QWORD PTR [rcx+24]
+ xor r15, QWORD PTR [rcx+32]
+ xor r13, QWORD PTR [rcx+40]
+ xor r14, QWORD PTR [rcx+56]
+ xor r14, QWORD PTR [rcx+64]
+ xor r12, QWORD PTR [rcx+72]
+ xor r13, QWORD PTR [rcx+88]
+ xor r11, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-32]
+ mov r13, QWORD PTR [rcx+-8]
+ mov r14, QWORD PTR [rcx+56]
+ mov r15, QWORD PTR [rcx+80]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-32], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-8], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+56], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+80], r14
+ ; XOR in constant
+ mov r15, 9223372036854808713
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+8]
+ mov r12, QWORD PTR [rcx+32]
+ mov r13, QWORD PTR [rcx+96]
+ mov r14, QWORD PTR [rcx+-80]
+ mov r15, QWORD PTR [rcx+-56]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+32], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+96], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-80], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-56], rdi
+ mov QWORD PTR [rcx+8], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+72]
+ mov r12, QWORD PTR [rcx+-64]
+ mov r13, QWORD PTR [rcx+-40]
+ mov r14, QWORD PTR [rcx+-16]
+ mov r15, QWORD PTR [rcx+48]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-64], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-40], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-16], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+48], rdi
+ mov QWORD PTR [rcx+72], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+-24]
+ mov r12, QWORD PTR [rcx]
+ mov r13, QWORD PTR [rcx+24]
+ mov r14, QWORD PTR [rcx+88]
+ mov r15, QWORD PTR [rcx+-88]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+24], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+88], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-88], rdi
+ mov QWORD PTR [rcx+-24], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+40]
+ xor r9, QWORD PTR [rcx+64]
+ xor r10, QWORD PTR [rcx+-72]
+ xor rdx, QWORD PTR [rcx+-48]
+ xor rax, QWORD PTR [rcx+16]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+40], r11
+ mov QWORD PTR [rcx+64], r12
+ mov QWORD PTR [rcx+-72], r13
+ mov QWORD PTR [rcx+-48], r14
+ mov QWORD PTR [rcx+16], r15
+ ; Round 15
+ xor r11, rsi
+ xor r15, QWORD PTR [rcx+-88]
+ xor r14, QWORD PTR [rcx+-80]
+ xor r12, QWORD PTR [rcx+-64]
+ xor r15, QWORD PTR [rcx+-56]
+ xor r13, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-32]
+ xor r11, QWORD PTR [rcx+-24]
+ xor r14, QWORD PTR [rcx+-16]
+ xor r13, QWORD PTR [rcx+-8]
+ xor r12, QWORD PTR [rcx]
+ xor r11, QWORD PTR [rcx+8]
+ xor r13, QWORD PTR [rcx+24]
+ xor r12, QWORD PTR [rcx+32]
+ xor r15, QWORD PTR [rcx+48]
+ xor r14, QWORD PTR [rcx+56]
+ xor r11, QWORD PTR [rcx+72]
+ xor r15, QWORD PTR [rcx+80]
+ xor r14, QWORD PTR [rcx+88]
+ xor r13, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+32]
+ mov r13, QWORD PTR [rcx+-40]
+ mov r14, QWORD PTR [rcx+88]
+ mov r15, QWORD PTR [rcx+16]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+32], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-40], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+88], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+16], r14
+ ; XOR in constant
+ mov r15, 9223372036854808579
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+56]
+ mov r12, QWORD PTR [rcx+-56]
+ mov r13, QWORD PTR [rcx+72]
+ mov r14, QWORD PTR [rcx]
+ mov r15, QWORD PTR [rcx+-72]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-56], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+72], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-72], rdi
+ mov QWORD PTR [rcx+56], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+-32]
+ mov r12, QWORD PTR [rcx+96]
+ mov r13, QWORD PTR [rcx+-16]
+ mov r14, QWORD PTR [rcx+-88]
+ mov r15, QWORD PTR [rcx+40]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+96], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-16], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-88], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+40], rdi
+ mov QWORD PTR [rcx+-32], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+80]
+ mov r12, QWORD PTR [rcx+8]
+ mov r13, QWORD PTR [rcx+-64]
+ mov r14, QWORD PTR [rcx+24]
+ mov r15, QWORD PTR [rcx+-48]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+8], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-64], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+24], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-48], rdi
+ mov QWORD PTR [rcx+80], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+-8]
+ xor r9, QWORD PTR [rcx+-80]
+ xor r10, QWORD PTR [rcx+48]
+ xor rdx, QWORD PTR [rcx+-24]
+ xor rax, QWORD PTR [rcx+64]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+-8], r11
+ mov QWORD PTR [rcx+-80], r12
+ mov QWORD PTR [rcx+48], r13
+ mov QWORD PTR [rcx+-24], r14
+ mov QWORD PTR [rcx+64], r15
+ ; Round 16
+ xor r11, rsi
+ xor r14, QWORD PTR [rcx+-88]
+ xor r15, QWORD PTR [rcx+-72]
+ xor r13, QWORD PTR [rcx+-64]
+ xor r12, QWORD PTR [rcx+-56]
+ xor r15, QWORD PTR [rcx+-48]
+ xor r13, QWORD PTR [rcx+-40]
+ xor r11, QWORD PTR [rcx+-32]
+ xor r13, QWORD PTR [rcx+-16]
+ xor r14, QWORD PTR [rcx]
+ xor r12, QWORD PTR [rcx+8]
+ xor r15, QWORD PTR [rcx+16]
+ xor r14, QWORD PTR [rcx+24]
+ xor r12, QWORD PTR [rcx+32]
+ xor r15, QWORD PTR [rcx+40]
+ xor r11, QWORD PTR [rcx+56]
+ xor r13, QWORD PTR [rcx+72]
+ xor r11, QWORD PTR [rcx+80]
+ xor r14, QWORD PTR [rcx+88]
+ xor r12, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-56]
+ mov r13, QWORD PTR [rcx+-16]
+ mov r14, QWORD PTR [rcx+24]
+ mov r15, QWORD PTR [rcx+64]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-56], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-16], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+24], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+64], r14
+ ; XOR in constant
+ mov r15, 9223372036854808578
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+88]
+ mov r12, QWORD PTR [rcx+-72]
+ mov r13, QWORD PTR [rcx+-32]
+ mov r14, QWORD PTR [rcx+8]
+ mov r15, QWORD PTR [rcx+48]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-72], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-32], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+8], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+48], rdi
+ mov QWORD PTR [rcx+88], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+32]
+ mov r12, QWORD PTR [rcx+72]
+ mov r13, QWORD PTR [rcx+-88]
+ mov r14, QWORD PTR [rcx+-48]
+ mov r15, QWORD PTR [rcx+-8]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+72], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-88], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-48], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-8], rdi
+ mov QWORD PTR [rcx+32], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+16]
+ mov r12, QWORD PTR [rcx+56]
+ mov r13, QWORD PTR [rcx+96]
+ mov r14, QWORD PTR [rcx+-64]
+ mov r15, QWORD PTR [rcx+-24]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+56], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+96], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-64], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-24], rdi
+ mov QWORD PTR [rcx+16], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+-40]
+ xor r9, QWORD PTR [rcx]
+ xor r10, QWORD PTR [rcx+40]
+ xor rdx, QWORD PTR [rcx+80]
+ xor rax, QWORD PTR [rcx+-80]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+-40], r11
+ mov QWORD PTR [rcx], r12
+ mov QWORD PTR [rcx+40], r13
+ mov QWORD PTR [rcx+80], r14
+ mov QWORD PTR [rcx+-80], r15
+ ; Round 17
+ xor r11, rsi
+ xor r13, QWORD PTR [rcx+-88]
+ xor r12, QWORD PTR [rcx+-72]
+ xor r14, QWORD PTR [rcx+-64]
+ xor r12, QWORD PTR [rcx+-56]
+ xor r14, QWORD PTR [rcx+-48]
+ xor r13, QWORD PTR [rcx+-32]
+ xor r15, QWORD PTR [rcx+-24]
+ xor r13, QWORD PTR [rcx+-16]
+ xor r15, QWORD PTR [rcx+-8]
+ xor r14, QWORD PTR [rcx+8]
+ xor r11, QWORD PTR [rcx+16]
+ xor r14, QWORD PTR [rcx+24]
+ xor r11, QWORD PTR [rcx+32]
+ xor r15, QWORD PTR [rcx+48]
+ xor r12, QWORD PTR [rcx+56]
+ xor r15, QWORD PTR [rcx+64]
+ xor r12, QWORD PTR [rcx+72]
+ xor r11, QWORD PTR [rcx+88]
+ xor r13, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-72]
+ mov r13, QWORD PTR [rcx+-88]
+ mov r14, QWORD PTR [rcx+-64]
+ mov r15, QWORD PTR [rcx+-80]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-72], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-88], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-64], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+-80], r14
+ ; XOR in constant
+ mov r15, 9223372036854775936
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+24]
+ mov r12, QWORD PTR [rcx+48]
+ mov r13, QWORD PTR [rcx+32]
+ mov r14, QWORD PTR [rcx+56]
+ mov r15, QWORD PTR [rcx+40]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+48], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+32], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+56], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+40], rdi
+ mov QWORD PTR [rcx+24], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+-56]
+ mov r12, QWORD PTR [rcx+-32]
+ mov r13, QWORD PTR [rcx+-48]
+ mov r14, QWORD PTR [rcx+-24]
+ mov r15, QWORD PTR [rcx+-40]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-32], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-48], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-24], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-40], rdi
+ mov QWORD PTR [rcx+-56], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+64]
+ mov r12, QWORD PTR [rcx+88]
+ mov r13, QWORD PTR [rcx+72]
+ mov r14, QWORD PTR [rcx+96]
+ mov r15, QWORD PTR [rcx+80]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+88], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+72], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+96], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+80], rdi
+ mov QWORD PTR [rcx+64], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+-16]
+ xor r9, QWORD PTR [rcx+8]
+ xor r10, QWORD PTR [rcx+-8]
+ xor rdx, QWORD PTR [rcx+16]
+ xor rax, QWORD PTR [rcx]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+-16], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+-8], r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx], r15
+ ; Round 18
+ xor r11, rsi
+ xor r13, QWORD PTR [rcx+-88]
+ xor r15, QWORD PTR [rcx+-80]
+ xor r12, QWORD PTR [rcx+-72]
+ xor r14, QWORD PTR [rcx+-64]
+ xor r11, QWORD PTR [rcx+-56]
+ xor r13, QWORD PTR [rcx+-48]
+ xor r15, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-32]
+ xor r14, QWORD PTR [rcx+-24]
+ xor r11, QWORD PTR [rcx+24]
+ xor r13, QWORD PTR [rcx+32]
+ xor r15, QWORD PTR [rcx+40]
+ xor r12, QWORD PTR [rcx+48]
+ xor r14, QWORD PTR [rcx+56]
+ xor r11, QWORD PTR [rcx+64]
+ xor r13, QWORD PTR [rcx+72]
+ xor r15, QWORD PTR [rcx+80]
+ xor r12, QWORD PTR [rcx+88]
+ xor r14, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+48]
+ mov r13, QWORD PTR [rcx+-48]
+ mov r14, QWORD PTR [rcx+96]
+ mov r15, QWORD PTR [rcx]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+48], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-48], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+96], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx], r14
+ ; XOR in constant
+ xor rsi, 32778
+ ; Row 1
+ mov r11, QWORD PTR [rcx+-64]
+ mov r12, QWORD PTR [rcx+40]
+ mov r13, QWORD PTR [rcx+-56]
+ mov r14, QWORD PTR [rcx+88]
+ mov r15, QWORD PTR [rcx+-8]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+40], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-56], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+88], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-8], rdi
+ mov QWORD PTR [rcx+-64], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+-72]
+ mov r12, QWORD PTR [rcx+32]
+ mov r13, QWORD PTR [rcx+-24]
+ mov r14, QWORD PTR [rcx+80]
+ mov r15, QWORD PTR [rcx+-16]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+32], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-24], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+80], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-16], rdi
+ mov QWORD PTR [rcx+-72], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+-80]
+ mov r12, QWORD PTR [rcx+24]
+ mov r13, QWORD PTR [rcx+-32]
+ mov r14, QWORD PTR [rcx+72]
+ mov r15, QWORD PTR [rcx+16]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+24], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-32], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+72], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+16], rdi
+ mov QWORD PTR [rcx+-80], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+-88]
+ xor r9, QWORD PTR [rcx+56]
+ xor r10, QWORD PTR [rcx+-40]
+ xor rdx, QWORD PTR [rcx+64]
+ xor rax, QWORD PTR [rcx+8]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+-88], r11
+ mov QWORD PTR [rcx+56], r12
+ mov QWORD PTR [rcx+-40], r13
+ mov QWORD PTR [rcx+64], r14
+ mov QWORD PTR [rcx+8], r15
+ ; Round 19
+ xor r11, rsi
+ xor r11, QWORD PTR [rcx+-80]
+ xor r11, QWORD PTR [rcx+-72]
+ xor r11, QWORD PTR [rcx+-64]
+ xor r13, QWORD PTR [rcx+-56]
+ xor r13, QWORD PTR [rcx+-48]
+ xor r13, QWORD PTR [rcx+-32]
+ xor r13, QWORD PTR [rcx+-24]
+ xor r15, QWORD PTR [rcx+-16]
+ xor r15, QWORD PTR [rcx+-8]
+ xor r15, QWORD PTR [rcx]
+ xor r15, QWORD PTR [rcx+16]
+ xor r12, QWORD PTR [rcx+24]
+ xor r12, QWORD PTR [rcx+32]
+ xor r12, QWORD PTR [rcx+40]
+ xor r12, QWORD PTR [rcx+48]
+ xor r14, QWORD PTR [rcx+72]
+ xor r14, QWORD PTR [rcx+80]
+ xor r14, QWORD PTR [rcx+88]
+ xor r14, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+40]
+ mov r13, QWORD PTR [rcx+-24]
+ mov r14, QWORD PTR [rcx+72]
+ mov r15, QWORD PTR [rcx+8]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+40], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-24], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+72], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+8], r14
+ ; XOR in constant
+ mov r15, 9223372039002259466
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+96]
+ mov r12, QWORD PTR [rcx+-8]
+ mov r13, QWORD PTR [rcx+-72]
+ mov r14, QWORD PTR [rcx+24]
+ mov r15, QWORD PTR [rcx+-40]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-8], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-72], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+24], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-40], rdi
+ mov QWORD PTR [rcx+96], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+48]
+ mov r12, QWORD PTR [rcx+-56]
+ mov r13, QWORD PTR [rcx+80]
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+-88]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-56], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+80], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+16], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-88], rdi
+ mov QWORD PTR [rcx+48], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx]
+ mov r12, QWORD PTR [rcx+-64]
+ mov r13, QWORD PTR [rcx+32]
+ mov r14, QWORD PTR [rcx+-32]
+ mov r15, QWORD PTR [rcx+64]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-64], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+32], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-32], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+64], rdi
+ mov QWORD PTR [rcx], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+-48]
+ xor r9, QWORD PTR [rcx+88]
+ xor r10, QWORD PTR [rcx+-16]
+ xor rdx, QWORD PTR [rcx+-80]
+ xor rax, QWORD PTR [rcx+56]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+-48], r11
+ mov QWORD PTR [rcx+88], r12
+ mov QWORD PTR [rcx+-16], r13
+ mov QWORD PTR [rcx+-80], r14
+ mov QWORD PTR [rcx+56], r15
+ ; Round 20
+ xor r11, rsi
+ xor r15, QWORD PTR [rcx+-88]
+ xor r13, QWORD PTR [rcx+-72]
+ xor r12, QWORD PTR [rcx+-64]
+ xor r12, QWORD PTR [rcx+-56]
+ xor r15, QWORD PTR [rcx+-40]
+ xor r14, QWORD PTR [rcx+-32]
+ xor r13, QWORD PTR [rcx+-24]
+ xor r12, QWORD PTR [rcx+-8]
+ xor r11, QWORD PTR [rcx]
+ xor r15, QWORD PTR [rcx+8]
+ xor r14, QWORD PTR [rcx+16]
+ xor r14, QWORD PTR [rcx+24]
+ xor r13, QWORD PTR [rcx+32]
+ xor r12, QWORD PTR [rcx+40]
+ xor r11, QWORD PTR [rcx+48]
+ xor r15, QWORD PTR [rcx+64]
+ xor r14, QWORD PTR [rcx+72]
+ xor r13, QWORD PTR [rcx+80]
+ xor r11, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-8]
+ mov r13, QWORD PTR [rcx+80]
+ mov r14, QWORD PTR [rcx+-32]
+ mov r15, QWORD PTR [rcx+56]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-8], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+80], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-32], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+56], r14
+ ; XOR in constant
+ mov r15, 9223372039002292353
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+72]
+ mov r12, QWORD PTR [rcx+-40]
+ mov r13, QWORD PTR [rcx+48]
+ mov r14, QWORD PTR [rcx+-64]
+ mov r15, QWORD PTR [rcx+-16]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-40], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+48], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-64], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-16], rdi
+ mov QWORD PTR [rcx+72], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+40]
+ mov r12, QWORD PTR [rcx+-72]
+ mov r13, QWORD PTR [rcx+16]
+ mov r14, QWORD PTR [rcx+64]
+ mov r15, QWORD PTR [rcx+-48]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-72], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+16], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+64], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-48], rdi
+ mov QWORD PTR [rcx+40], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+8]
+ mov r12, QWORD PTR [rcx+96]
+ mov r13, QWORD PTR [rcx+-56]
+ mov r14, QWORD PTR [rcx+32]
+ mov r15, QWORD PTR [rcx+-80]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+96], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-56], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+32], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-80], rdi
+ mov QWORD PTR [rcx+8], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+-24]
+ xor r9, QWORD PTR [rcx+24]
+ xor r10, QWORD PTR [rcx+-88]
+ xor rdx, QWORD PTR [rcx]
+ xor rax, QWORD PTR [rcx+88]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+-24], r11
+ mov QWORD PTR [rcx+24], r12
+ mov QWORD PTR [rcx+-88], r13
+ mov QWORD PTR [rcx], r14
+ mov QWORD PTR [rcx+88], r15
+ ; Round 21
+ xor r11, rsi
+ xor r15, QWORD PTR [rcx+-80]
+ xor r12, QWORD PTR [rcx+-72]
+ xor r14, QWORD PTR [rcx+-64]
+ xor r13, QWORD PTR [rcx+-56]
+ xor r15, QWORD PTR [rcx+-48]
+ xor r12, QWORD PTR [rcx+-40]
+ xor r14, QWORD PTR [rcx+-32]
+ xor r15, QWORD PTR [rcx+-16]
+ xor r12, QWORD PTR [rcx+-8]
+ xor r11, QWORD PTR [rcx+8]
+ xor r13, QWORD PTR [rcx+16]
+ xor r14, QWORD PTR [rcx+32]
+ xor r11, QWORD PTR [rcx+40]
+ xor r13, QWORD PTR [rcx+48]
+ xor r15, QWORD PTR [rcx+56]
+ xor r14, QWORD PTR [rcx+64]
+ xor r11, QWORD PTR [rcx+72]
+ xor r13, QWORD PTR [rcx+80]
+ xor r12, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-40]
+ mov r13, QWORD PTR [rcx+16]
+ mov r14, QWORD PTR [rcx+32]
+ mov r15, QWORD PTR [rcx+88]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-40], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+16], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+32], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+88], r14
+ ; XOR in constant
+ mov r15, 9223372036854808704
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+-32]
+ mov r12, QWORD PTR [rcx+-16]
+ mov r13, QWORD PTR [rcx+40]
+ mov r14, QWORD PTR [rcx+96]
+ mov r15, QWORD PTR [rcx+-88]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-16], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+40], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+96], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-88], rdi
+ mov QWORD PTR [rcx+-32], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+-8]
+ mov r12, QWORD PTR [rcx+48]
+ mov r13, QWORD PTR [rcx+64]
+ mov r14, QWORD PTR [rcx+-80]
+ mov r15, QWORD PTR [rcx+-24]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+48], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+64], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-80], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-24], rdi
+ mov QWORD PTR [rcx+-8], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+56]
+ mov r12, QWORD PTR [rcx+72]
+ mov r13, QWORD PTR [rcx+-72]
+ mov r14, QWORD PTR [rcx+-56]
+ mov r15, QWORD PTR [rcx]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+72], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-72], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-56], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+56], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+80]
+ xor r9, QWORD PTR [rcx+-64]
+ xor r10, QWORD PTR [rcx+-48]
+ xor rdx, QWORD PTR [rcx+8]
+ xor rax, QWORD PTR [rcx+24]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+80], r11
+ mov QWORD PTR [rcx+-64], r12
+ mov QWORD PTR [rcx+-48], r13
+ mov QWORD PTR [rcx+8], r14
+ mov QWORD PTR [rcx+24], r15
+ ; Round 22
+ xor r11, rsi
+ xor r15, QWORD PTR [rcx+-88]
+ xor r14, QWORD PTR [rcx+-80]
+ xor r13, QWORD PTR [rcx+-72]
+ xor r14, QWORD PTR [rcx+-56]
+ xor r12, QWORD PTR [rcx+-40]
+ xor r11, QWORD PTR [rcx+-32]
+ xor r15, QWORD PTR [rcx+-24]
+ xor r12, QWORD PTR [rcx+-16]
+ xor r11, QWORD PTR [rcx+-8]
+ xor r15, QWORD PTR [rcx]
+ xor r13, QWORD PTR [rcx+16]
+ xor r14, QWORD PTR [rcx+32]
+ xor r13, QWORD PTR [rcx+40]
+ xor r12, QWORD PTR [rcx+48]
+ xor r11, QWORD PTR [rcx+56]
+ xor r13, QWORD PTR [rcx+64]
+ xor r12, QWORD PTR [rcx+72]
+ xor r15, QWORD PTR [rcx+88]
+ xor r14, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-16]
+ mov r13, QWORD PTR [rcx+64]
+ mov r14, QWORD PTR [rcx+-56]
+ mov r15, QWORD PTR [rcx+24]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-16], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+64], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-56], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+24], r14
+ ; XOR in constant
+ mov r15, 2147483649
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+32]
+ mov r12, QWORD PTR [rcx+-88]
+ mov r13, QWORD PTR [rcx+-8]
+ mov r14, QWORD PTR [rcx+72]
+ mov r15, QWORD PTR [rcx+-48]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-88], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-8], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+72], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-48], rdi
+ mov QWORD PTR [rcx+32], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+-40]
+ mov r12, QWORD PTR [rcx+40]
+ mov r13, QWORD PTR [rcx+-80]
+ mov r14, QWORD PTR [rcx]
+ mov r15, QWORD PTR [rcx+80]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+40], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-80], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+80], rdi
+ mov QWORD PTR [rcx+-40], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+88]
+ mov r12, QWORD PTR [rcx+-32]
+ mov r13, QWORD PTR [rcx+48]
+ mov r14, QWORD PTR [rcx+-72]
+ mov r15, QWORD PTR [rcx+8]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-32], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+48], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-72], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+8], rdi
+ mov QWORD PTR [rcx+88], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+16]
+ xor r9, QWORD PTR [rcx+96]
+ xor r10, QWORD PTR [rcx+-24]
+ xor rdx, QWORD PTR [rcx+56]
+ xor rax, QWORD PTR [rcx+-64]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+16], r11
+ mov QWORD PTR [rcx+96], r12
+ mov QWORD PTR [rcx+-24], r13
+ mov QWORD PTR [rcx+56], r14
+ mov QWORD PTR [rcx+-64], r15
+ ; Round 23
+ xor r11, rsi
+ xor r12, QWORD PTR [rcx+-88]
+ xor r13, QWORD PTR [rcx+-80]
+ xor r14, QWORD PTR [rcx+-72]
+ xor r14, QWORD PTR [rcx+-56]
+ xor r15, QWORD PTR [rcx+-48]
+ xor r11, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-32]
+ xor r12, QWORD PTR [rcx+-16]
+ xor r13, QWORD PTR [rcx+-8]
+ xor r14, QWORD PTR [rcx]
+ xor r15, QWORD PTR [rcx+8]
+ xor r15, QWORD PTR [rcx+24]
+ xor r11, QWORD PTR [rcx+32]
+ xor r12, QWORD PTR [rcx+40]
+ xor r13, QWORD PTR [rcx+48]
+ xor r13, QWORD PTR [rcx+64]
+ xor r14, QWORD PTR [rcx+72]
+ xor r15, QWORD PTR [rcx+80]
+ xor r11, QWORD PTR [rcx+88]
+ ; Calc t[0..4]
+ rorx rdx, r12, 63
+ rorx rax, r13, 63
+ rorx r8, r14, 63
+ rorx r9, r15, 63
+ rorx r10, r11, 63
+ xor rdx, r15
+ xor rax, r11
+ xor r8, r12
+ xor r9, r13
+ xor r10, r14
+ ; Row Mix
+ ; Row 0
+ mov r11, rsi
+ mov r12, QWORD PTR [rcx+-88]
+ mov r13, QWORD PTR [rcx+-80]
+ mov r14, QWORD PTR [rcx+-72]
+ mov r15, QWORD PTR [rcx+-64]
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ rol r12, 44
+ rol r13, 43
+ rol r14, 21
+ rol r15, 14
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-88], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-80], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-72], rdi
+ andn r14, r11, r12
+ andn rsi, r12, r13
+ xor r14, r15
+ xor rsi, r11
+ mov QWORD PTR [rcx+-64], r14
+ ; XOR in constant
+ mov r15, 9223372039002292232
+ xor rsi, r15
+ ; Row 1
+ mov r11, QWORD PTR [rcx+-56]
+ mov r12, QWORD PTR [rcx+-48]
+ mov r13, QWORD PTR [rcx+-40]
+ mov r14, QWORD PTR [rcx+-32]
+ mov r15, QWORD PTR [rcx+-24]
+ xor r11, r9
+ xor r12, r10
+ xor r13, rdx
+ xor r14, rax
+ xor r15, r8
+ rol r11, 28
+ rol r12, 20
+ rol r13, 3
+ rol r14, 45
+ rol r15, 61
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-48], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+-40], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+-32], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+-24], rdi
+ mov QWORD PTR [rcx+-56], r14
+ ; Row 2
+ mov r11, QWORD PTR [rcx+-16]
+ mov r12, QWORD PTR [rcx+-8]
+ mov r13, QWORD PTR [rcx]
+ mov r14, QWORD PTR [rcx+8]
+ mov r15, QWORD PTR [rcx+16]
+ xor r11, rax
+ xor r12, r8
+ xor r13, r9
+ xor r14, r10
+ xor r15, rdx
+ rol r11, 1
+ rol r12, 6
+ rol r13, 25
+ rol r14, 8
+ rol r15, 18
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+-8], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+8], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+16], rdi
+ mov QWORD PTR [rcx+-16], r14
+ ; Row 3
+ mov r11, QWORD PTR [rcx+24]
+ mov r12, QWORD PTR [rcx+32]
+ mov r13, QWORD PTR [rcx+40]
+ mov r14, QWORD PTR [rcx+48]
+ mov r15, QWORD PTR [rcx+56]
+ xor r11, r10
+ xor r12, rdx
+ xor r13, rax
+ xor r14, r8
+ xor r15, r9
+ rol r11, 27
+ rol r12, 36
+ rol r13, 10
+ rol r14, 15
+ rol r15, 56
+ andn rdi, r13, r14
+ xor rdi, r12
+ mov QWORD PTR [rcx+32], rdi
+ andn rdi, r14, r15
+ xor rdi, r13
+ mov QWORD PTR [rcx+40], rdi
+ andn rdi, r15, r11
+ xor rdi, r14
+ mov QWORD PTR [rcx+48], rdi
+ andn rdi, r11, r12
+ andn r14, r12, r13
+ xor rdi, r15
+ xor r14, r11
+ mov QWORD PTR [rcx+56], rdi
+ mov QWORD PTR [rcx+24], r14
+ ; Row 4
+ xor r8, QWORD PTR [rcx+64]
+ xor r9, QWORD PTR [rcx+72]
+ xor r10, QWORD PTR [rcx+80]
+ xor rdx, QWORD PTR [rcx+88]
+ xor rax, QWORD PTR [rcx+96]
+ rorx r11, r8, 2
+ rorx r12, r9, 9
+ rorx r13, r10, 25
+ rorx r14, rdx, 23
+ rorx r15, rax, 62
+ andn rdx, r12, r13
+ andn rax, r13, r14
+ andn r8, r14, r15
+ andn r9, r15, r11
+ andn r10, r11, r12
+ xor r11, rdx
+ xor r12, rax
+ xor r13, r8
+ xor r14, r9
+ xor r15, r10
+ mov QWORD PTR [rcx+64], r11
+ mov QWORD PTR [rcx+72], r12
+ mov QWORD PTR [rcx+80], r13
+ mov QWORD PTR [rcx+88], r14
+ mov QWORD PTR [rcx+96], r15
+ mov QWORD PTR [rcx+-96], rsi
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+sha3_block_bmi2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+sha3_block_n_bmi2 PROC ; prologue: save scratch registers, cache state qword 0 in r9, bias the state pointer in rcx
+ push r12 ; r12-r15, rdi, rsi, rbx, rbp are written later in this routine (non-volatile in the Microsoft x64 convention)
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ push r9 ; stack copy of the incoming r9; NOTE(review): presumably reloaded after the rounds, outside this excerpt - confirm
+ mov rbp, r9 ; rbp = block size in bytes (compared against 136/168/144/104 by the dispatch that follows)
+ mov r9, QWORD PTR [rcx] ; r9 = state qword 0, kept in a register from here on
+ add rcx, 96 ; bias rcx so state qword i is at [rcx + 8*i - 96]; presumably keeps every displacement within a signed byte
+L_sha3_block_n_bmi2_start: ; choose the absorb path matching the block size in rbp; sizes not listed fall through to a 9-qword absorb
+ cmp rbp, 136
+ je L_sha3_block_n_bmi2_load_256 ; 136 bytes = 17 qwords
+ cmp rbp, 168
+ je L_sha3_block_n_bmi2_load_128 ; 168 bytes = 21 qwords
+ cmp rbp, 144
+ je L_sha3_block_n_bmi2_load_224 ; 144 bytes = 18 qwords
+ cmp rbp, 104
+ je L_sha3_block_n_bmi2_load_384 ; 104 bytes = 13 qwords
+ mov r14, QWORD PTR [rdx] ; fall-through path: load data qwords 0..8 (72 bytes) from [rdx]; NOTE(review): presumably the SHA3-512 rate - confirm
+ mov r15, QWORD PTR [rdx+8]
+ mov rdi, QWORD PTR [rdx+16]
+ mov rsi, QWORD PTR [rdx+24]
+ mov rbx, QWORD PTR [rdx+32]
+ mov rax, QWORD PTR [rdx+40]
+ mov r10, QWORD PTR [rdx+48]
+ mov r11, QWORD PTR [rdx+56]
+ mov r12, QWORD PTR [rdx+64]
+ xor r14, r9 ; state qword 0 is taken from r9, not from memory
+ xor r15, QWORD PTR [rcx+-88] ; XOR data into state qwords 1..8
+ xor rdi, QWORD PTR [rcx+-80]
+ xor rsi, QWORD PTR [rcx+-72]
+ xor rbx, QWORD PTR [rcx+-64]
+ xor rax, QWORD PTR [rcx+-56]
+ xor r10, QWORD PTR [rcx+-48]
+ xor r11, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-32]
+ mov r9, r14 ; r9 = updated state qword 0 (no store to memory here)
+ mov QWORD PTR [rcx+-88], r15 ; write updated state qwords 1..8 back
+ mov QWORD PTR [rcx+-80], rdi
+ mov QWORD PTR [rcx+-72], rsi
+ mov QWORD PTR [rcx+-64], rbx
+ mov QWORD PTR [rcx+-56], rax
+ mov QWORD PTR [rcx+-48], r10
+ mov QWORD PTR [rcx+-40], r11
+ mov QWORD PTR [rcx+-32], r12
+ jmp L_sha3_block_n_bmi2_rounds ; r14, r15, rdi, rsi, rbx still hold updated state qwords 0..4 for Round 0
+L_sha3_block_n_bmi2_load_128: ; 168-byte block: XOR 21 data qwords from [rdx] into state qwords 0..20
+ mov r14, QWORD PTR [rdx] ; data qwords 0..4
+ mov r15, QWORD PTR [rdx+8]
+ mov rdi, QWORD PTR [rdx+16]
+ mov rsi, QWORD PTR [rdx+24]
+ mov rbx, QWORD PTR [rdx+32]
+ xor r14, r9 ; state qword 0 is taken from r9, not from memory
+ xor r15, QWORD PTR [rcx+-88]
+ xor rdi, QWORD PTR [rcx+-80]
+ xor rsi, QWORD PTR [rcx+-72]
+ xor rbx, QWORD PTR [rcx+-64]
+ mov r9, r14 ; r9 = updated state qword 0 (no store to memory here)
+ mov QWORD PTR [rcx+-88], r15 ; write updated state qwords 1..4 back
+ mov QWORD PTR [rcx+-80], rdi
+ mov QWORD PTR [rcx+-72], rsi
+ mov QWORD PTR [rcx+-64], rbx
+ mov rax, QWORD PTR [rdx+40] ; data qwords 5..10
+ mov r10, QWORD PTR [rdx+48]
+ mov r11, QWORD PTR [rdx+56]
+ mov r12, QWORD PTR [rdx+64]
+ mov r13, QWORD PTR [rdx+72]
+ mov rbp, QWORD PTR [rdx+80] ; rbp reused as scratch: the block size it held is overwritten
+ xor rax, QWORD PTR [rcx+-56]
+ xor r10, QWORD PTR [rcx+-48]
+ xor r11, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-32]
+ xor r13, QWORD PTR [rcx+-24]
+ xor rbp, QWORD PTR [rcx+-16]
+ mov QWORD PTR [rcx+-56], rax ; write updated state qwords 5..10 back
+ mov QWORD PTR [rcx+-48], r10
+ mov QWORD PTR [rcx+-40], r11
+ mov QWORD PTR [rcx+-32], r12
+ mov QWORD PTR [rcx+-24], r13
+ mov QWORD PTR [rcx+-16], rbp
+ mov rax, QWORD PTR [rdx+88] ; data qwords 11..16
+ mov r10, QWORD PTR [rdx+96]
+ mov r11, QWORD PTR [rdx+104]
+ mov r12, QWORD PTR [rdx+112]
+ mov r13, QWORD PTR [rdx+120]
+ mov rbp, QWORD PTR [rdx+128]
+ xor rax, QWORD PTR [rcx+-8]
+ xor r10, QWORD PTR [rcx]
+ xor r11, QWORD PTR [rcx+8]
+ xor r12, QWORD PTR [rcx+16]
+ xor r13, QWORD PTR [rcx+24]
+ xor rbp, QWORD PTR [rcx+32]
+ mov QWORD PTR [rcx+-8], rax ; write updated state qwords 11..16 back
+ mov QWORD PTR [rcx], r10
+ mov QWORD PTR [rcx+8], r11
+ mov QWORD PTR [rcx+16], r12
+ mov QWORD PTR [rcx+24], r13
+ mov QWORD PTR [rcx+32], rbp
+ mov rax, QWORD PTR [rdx+136] ; data qwords 17..20
+ mov r10, QWORD PTR [rdx+144]
+ mov r11, QWORD PTR [rdx+152]
+ mov r12, QWORD PTR [rdx+160]
+ xor rax, QWORD PTR [rcx+40]
+ xor r10, QWORD PTR [rcx+48]
+ xor r11, QWORD PTR [rcx+56]
+ xor r12, QWORD PTR [rcx+64]
+ mov QWORD PTR [rcx+40], rax ; write updated state qwords 17..20 back
+ mov QWORD PTR [rcx+48], r10
+ mov QWORD PTR [rcx+56], r11
+ mov QWORD PTR [rcx+64], r12
+ jmp L_sha3_block_n_bmi2_rounds ; r14, r15, rdi, rsi, rbx still hold updated state qwords 0..4 for Round 0
+L_sha3_block_n_bmi2_load_224: ; 144-byte block: XOR 18 data qwords from [rdx] into state qwords 0..17
+ mov r14, QWORD PTR [rdx+40] ; first pass: data qwords 5..14 (r14..rbx are borrowed here and reloaded with qwords 0..4 afterwards)
+ mov r15, QWORD PTR [rdx+48]
+ mov rdi, QWORD PTR [rdx+56]
+ mov rsi, QWORD PTR [rdx+64]
+ mov rbx, QWORD PTR [rdx+72]
+ mov rax, QWORD PTR [rdx+80]
+ mov r10, QWORD PTR [rdx+88]
+ mov r11, QWORD PTR [rdx+96]
+ mov r12, QWORD PTR [rdx+104]
+ mov r13, QWORD PTR [rdx+112]
+ xor r14, QWORD PTR [rcx+-56] ; XOR into state qwords 5..14
+ xor r15, QWORD PTR [rcx+-48]
+ xor rdi, QWORD PTR [rcx+-40]
+ xor rsi, QWORD PTR [rcx+-32]
+ xor rbx, QWORD PTR [rcx+-24]
+ xor rax, QWORD PTR [rcx+-16]
+ xor r10, QWORD PTR [rcx+-8]
+ xor r11, QWORD PTR [rcx]
+ xor r12, QWORD PTR [rcx+8]
+ xor r13, QWORD PTR [rcx+16]
+ mov QWORD PTR [rcx+-56], r14 ; write updated state qwords 5..14 back
+ mov QWORD PTR [rcx+-48], r15
+ mov QWORD PTR [rcx+-40], rdi
+ mov QWORD PTR [rcx+-32], rsi
+ mov QWORD PTR [rcx+-24], rbx
+ mov QWORD PTR [rcx+-16], rax
+ mov QWORD PTR [rcx+-8], r10
+ mov QWORD PTR [rcx], r11
+ mov QWORD PTR [rcx+8], r12
+ mov QWORD PTR [rcx+16], r13
+ mov r14, QWORD PTR [rdx] ; second pass: data qwords 0..4 ...
+ mov r15, QWORD PTR [rdx+8]
+ mov rdi, QWORD PTR [rdx+16]
+ mov rsi, QWORD PTR [rdx+24]
+ mov rbx, QWORD PTR [rdx+32]
+ mov rax, QWORD PTR [rdx+120] ; ... and data qwords 15..17
+ mov r10, QWORD PTR [rdx+128]
+ mov r11, QWORD PTR [rdx+136]
+ xor r14, r9 ; state qword 0 is taken from r9, not from memory
+ xor r15, QWORD PTR [rcx+-88]
+ xor rdi, QWORD PTR [rcx+-80]
+ xor rsi, QWORD PTR [rcx+-72]
+ xor rbx, QWORD PTR [rcx+-64]
+ xor rax, QWORD PTR [rcx+24]
+ xor r10, QWORD PTR [rcx+32]
+ xor r11, QWORD PTR [rcx+40]
+ mov r9, r14 ; r9 = updated state qword 0 (no store to memory here)
+ mov QWORD PTR [rcx+-88], r15 ; write updated state qwords 1..4 back
+ mov QWORD PTR [rcx+-80], rdi
+ mov QWORD PTR [rcx+-72], rsi
+ mov QWORD PTR [rcx+-64], rbx
+ mov QWORD PTR [rcx+24], rax ; write updated state qwords 15..17 back
+ mov QWORD PTR [rcx+32], r10
+ mov QWORD PTR [rcx+40], r11
+ jmp L_sha3_block_n_bmi2_rounds ; r14, r15, rdi, rsi, rbx hold updated state qwords 0..4 for Round 0
+L_sha3_block_n_bmi2_load_384: ; 104-byte block: XOR 13 data qwords from [rdx] into state qwords 0..12
+ mov r14, QWORD PTR [rdx] ; data qwords 0..8
+ mov r15, QWORD PTR [rdx+8]
+ mov rdi, QWORD PTR [rdx+16]
+ mov rsi, QWORD PTR [rdx+24]
+ mov rbx, QWORD PTR [rdx+32]
+ mov rax, QWORD PTR [rdx+40]
+ mov r10, QWORD PTR [rdx+48]
+ mov r11, QWORD PTR [rdx+56]
+ mov r12, QWORD PTR [rdx+64]
+ xor r14, r9 ; state qword 0 is taken from r9, not from memory
+ xor r15, QWORD PTR [rcx+-88] ; XOR data into state qwords 1..8
+ xor rdi, QWORD PTR [rcx+-80]
+ xor rsi, QWORD PTR [rcx+-72]
+ xor rbx, QWORD PTR [rcx+-64]
+ xor rax, QWORD PTR [rcx+-56]
+ xor r10, QWORD PTR [rcx+-48]
+ xor r11, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-32]
+ mov r9, r14 ; r9 = updated state qword 0 (no store to memory here)
+ mov QWORD PTR [rcx+-88], r15 ; write updated state qwords 1..8 back
+ mov QWORD PTR [rcx+-80], rdi
+ mov QWORD PTR [rcx+-72], rsi
+ mov QWORD PTR [rcx+-64], rbx
+ mov QWORD PTR [rcx+-56], rax
+ mov QWORD PTR [rcx+-48], r10
+ mov QWORD PTR [rcx+-40], r11
+ mov QWORD PTR [rcx+-32], r12
+ mov rax, QWORD PTR [rdx+72] ; data qwords 9..12
+ mov r10, QWORD PTR [rdx+80]
+ mov r11, QWORD PTR [rdx+88]
+ mov r12, QWORD PTR [rdx+96]
+ xor rax, QWORD PTR [rcx+-24]
+ xor r10, QWORD PTR [rcx+-16]
+ xor r11, QWORD PTR [rcx+-8]
+ xor r12, QWORD PTR [rcx]
+ mov QWORD PTR [rcx+-24], rax ; write updated state qwords 9..12 back
+ mov QWORD PTR [rcx+-16], r10
+ mov QWORD PTR [rcx+-8], r11
+ mov QWORD PTR [rcx], r12
+ jmp L_sha3_block_n_bmi2_rounds ; r14, r15, rdi, rsi, rbx still hold updated state qwords 0..4 for Round 0
+L_sha3_block_n_bmi2_load_256: ; 136-byte block: XOR 17 data qwords from [rdx] into state qwords 0..16, then fall through into the rounds
+ mov r14, QWORD PTR [rdx] ; data qwords 0..10
+ mov r15, QWORD PTR [rdx+8]
+ mov rdi, QWORD PTR [rdx+16]
+ mov rsi, QWORD PTR [rdx+24]
+ mov rbx, QWORD PTR [rdx+32]
+ mov rax, QWORD PTR [rdx+40]
+ mov r10, QWORD PTR [rdx+48]
+ mov r11, QWORD PTR [rdx+56]
+ mov r12, QWORD PTR [rdx+64]
+ mov r13, QWORD PTR [rdx+72]
+ mov rbp, QWORD PTR [rdx+80] ; rbp reused as scratch: the block size it held is overwritten
+ xor r14, r9 ; state qword 0 is taken from r9, not from memory
+ xor r15, QWORD PTR [rcx+-88] ; XOR data into state qwords 1..10
+ xor rdi, QWORD PTR [rcx+-80]
+ xor rsi, QWORD PTR [rcx+-72]
+ xor rbx, QWORD PTR [rcx+-64]
+ xor rax, QWORD PTR [rcx+-56]
+ xor r10, QWORD PTR [rcx+-48]
+ xor r11, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx+-32]
+ xor r13, QWORD PTR [rcx+-24]
+ xor rbp, QWORD PTR [rcx+-16]
+ mov r9, r14 ; r9 = updated state qword 0 (no store to memory here)
+ mov QWORD PTR [rcx+-88], r15 ; write updated state qwords 1..10 back
+ mov QWORD PTR [rcx+-80], rdi
+ mov QWORD PTR [rcx+-72], rsi
+ mov QWORD PTR [rcx+-64], rbx
+ mov QWORD PTR [rcx+-56], rax
+ mov QWORD PTR [rcx+-48], r10
+ mov QWORD PTR [rcx+-40], r11
+ mov QWORD PTR [rcx+-32], r12
+ mov QWORD PTR [rcx+-24], r13
+ mov QWORD PTR [rcx+-16], rbp
+ mov rax, QWORD PTR [rdx+88] ; data qwords 11..16
+ mov r10, QWORD PTR [rdx+96]
+ mov r11, QWORD PTR [rdx+104]
+ mov r12, QWORD PTR [rdx+112]
+ mov r13, QWORD PTR [rdx+120]
+ mov rbp, QWORD PTR [rdx+128]
+ xor rax, QWORD PTR [rcx+-8]
+ xor r10, QWORD PTR [rcx]
+ xor r11, QWORD PTR [rcx+8]
+ xor r12, QWORD PTR [rcx+16]
+ xor r13, QWORD PTR [rcx+24]
+ xor rbp, QWORD PTR [rcx+32]
+ mov QWORD PTR [rcx+-8], rax ; write updated state qwords 11..16 back
+ mov QWORD PTR [rcx], r10
+ mov QWORD PTR [rcx+8], r11
+ mov QWORD PTR [rcx+16], r12
+ mov QWORD PTR [rcx+24], r13
+ mov QWORD PTR [rcx+32], rbp ; no jmp: execution continues at the rounds label with r14, r15, rdi, rsi, rbx = updated state qwords 0..4
+L_sha3_block_n_bmi2_rounds:
+ ; Round 0
+ xor r14, QWORD PTR [rcx+-56]
+ xor r15, QWORD PTR [rcx+-48]
+ xor rdi, QWORD PTR [rcx+-40]
+ xor rsi, QWORD PTR [rcx+-32]
+ xor rbx, QWORD PTR [rcx+-24]
+ xor r14, QWORD PTR [rcx+-16]
+ xor r15, QWORD PTR [rcx+-8]
+ xor rdi, QWORD PTR [rcx]
+ xor rsi, QWORD PTR [rcx+8]
+ xor rbx, QWORD PTR [rcx+16]
+ xor r14, QWORD PTR [rcx+24]
+ xor r15, QWORD PTR [rcx+32]
+ xor rdi, QWORD PTR [rcx+40]
+ xor rsi, QWORD PTR [rcx+48]
+ xor rbx, QWORD PTR [rcx+56]
+ xor r14, QWORD PTR [rcx+64]
+ xor r15, QWORD PTR [rcx+72]
+ xor rdi, QWORD PTR [rcx+80]
+ xor rsi, QWORD PTR [rcx+88]
+ xor rbx, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+-48]
+ mov rdi, QWORD PTR [rcx]
+ mov rsi, QWORD PTR [rcx+48]
+ mov rbx, QWORD PTR [rcx+96]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-48], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+48], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+96], rsi
+ ; XOR in constant
+ xor r9, 1
+ ; Row 1
+ mov r14, QWORD PTR [rcx+-72]
+ mov r15, QWORD PTR [rcx+-24]
+ mov rdi, QWORD PTR [rcx+-16]
+ mov rsi, QWORD PTR [rcx+32]
+ mov rbx, QWORD PTR [rcx+80]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-24], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-16], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+32], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+80], rbp
+ mov QWORD PTR [rcx+-72], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+-88]
+ mov r15, QWORD PTR [rcx+-40]
+ mov rdi, QWORD PTR [rcx+8]
+ mov rsi, QWORD PTR [rcx+56]
+ mov rbx, QWORD PTR [rcx+64]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-40], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+8], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+56], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+64], rbp
+ mov QWORD PTR [rcx+-88], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+-64]
+ mov r15, QWORD PTR [rcx+-56]
+ mov rdi, QWORD PTR [rcx+-8]
+ mov rsi, QWORD PTR [rcx+40]
+ mov rbx, QWORD PTR [rcx+88]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-56], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-8], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+40], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+88], rbp
+ mov QWORD PTR [rcx+-64], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+-80]
+ xor r12, QWORD PTR [rcx+-32]
+ xor r13, QWORD PTR [rcx+16]
+ xor rax, QWORD PTR [rcx+24]
+ xor r10, QWORD PTR [rcx+72]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+-80], r14
+ mov QWORD PTR [rcx+-32], r15
+ mov QWORD PTR [rcx+16], rdi
+ mov QWORD PTR [rcx+24], rsi
+ mov QWORD PTR [rcx+72], rbx
+ ; Round 1
+ xor r14, r9
+ xor r14, QWORD PTR [rcx+-88]
+ xor r14, QWORD PTR [rcx+-72]
+ xor r14, QWORD PTR [rcx+-64]
+ xor r15, QWORD PTR [rcx+-56]
+ xor r15, QWORD PTR [rcx+-48]
+ xor r15, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-24]
+ xor rdi, QWORD PTR [rcx+-16]
+ xor rdi, QWORD PTR [rcx+-8]
+ xor rdi, QWORD PTR [rcx]
+ xor rdi, QWORD PTR [rcx+8]
+ xor rsi, QWORD PTR [rcx+32]
+ xor rsi, QWORD PTR [rcx+40]
+ xor rsi, QWORD PTR [rcx+48]
+ xor rsi, QWORD PTR [rcx+56]
+ xor rbx, QWORD PTR [rcx+64]
+ xor rbx, QWORD PTR [rcx+80]
+ xor rbx, QWORD PTR [rcx+88]
+ xor rbx, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+-24]
+ mov rdi, QWORD PTR [rcx+8]
+ mov rsi, QWORD PTR [rcx+40]
+ mov rbx, QWORD PTR [rcx+72]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-24], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+8], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+40], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+72], rsi
+ ; XOR in constant
+ xor r9, 32898
+ ; Row 1
+ mov r14, QWORD PTR [rcx+48]
+ mov r15, QWORD PTR [rcx+80]
+ mov rdi, QWORD PTR [rcx+-88]
+ mov rsi, QWORD PTR [rcx+-56]
+ mov rbx, QWORD PTR [rcx+16]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+80], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-88], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-56], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+16], rbp
+ mov QWORD PTR [rcx+48], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+-48]
+ mov r15, QWORD PTR [rcx+-16]
+ mov rdi, QWORD PTR [rcx+56]
+ mov rsi, QWORD PTR [rcx+88]
+ mov rbx, QWORD PTR [rcx+-80]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-16], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+56], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+88], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-80], rbp
+ mov QWORD PTR [rcx+-48], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+96]
+ mov r15, QWORD PTR [rcx+-72]
+ mov rdi, QWORD PTR [rcx+-40]
+ mov rsi, QWORD PTR [rcx+-8]
+ mov rbx, QWORD PTR [rcx+24]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-72], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-40], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-8], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+24], rbp
+ mov QWORD PTR [rcx+96], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx]
+ xor r12, QWORD PTR [rcx+32]
+ xor r13, QWORD PTR [rcx+64]
+ xor rax, QWORD PTR [rcx+-64]
+ xor r10, QWORD PTR [rcx+-32]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx], r14
+ mov QWORD PTR [rcx+32], r15
+ mov QWORD PTR [rcx+64], rdi
+ mov QWORD PTR [rcx+-64], rsi
+ mov QWORD PTR [rcx+-32], rbx
+ ; Round 2
+ xor r14, r9
+ xor rdi, QWORD PTR [rcx+-88]
+ xor rbx, QWORD PTR [rcx+-80]
+ xor r15, QWORD PTR [rcx+-72]
+ xor rsi, QWORD PTR [rcx+-56]
+ xor r14, QWORD PTR [rcx+-48]
+ xor rdi, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-24]
+ xor r15, QWORD PTR [rcx+-16]
+ xor rsi, QWORD PTR [rcx+-8]
+ xor rdi, QWORD PTR [rcx+8]
+ xor rbx, QWORD PTR [rcx+16]
+ xor rbx, QWORD PTR [rcx+24]
+ xor rsi, QWORD PTR [rcx+40]
+ xor r14, QWORD PTR [rcx+48]
+ xor rdi, QWORD PTR [rcx+56]
+ xor rbx, QWORD PTR [rcx+72]
+ xor r15, QWORD PTR [rcx+80]
+ xor rsi, QWORD PTR [rcx+88]
+ xor r14, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+80]
+ mov rdi, QWORD PTR [rcx+56]
+ mov rsi, QWORD PTR [rcx+-8]
+ mov rbx, QWORD PTR [rcx+-32]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+80], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+56], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-8], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+-32], rsi
+ ; XOR in constant
+ mov rbx, 9223372036854808714
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+40]
+ mov r15, QWORD PTR [rcx+16]
+ mov rdi, QWORD PTR [rcx+-48]
+ mov rsi, QWORD PTR [rcx+-72]
+ mov rbx, QWORD PTR [rcx+64]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+16], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-48], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-72], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+64], rbp
+ mov QWORD PTR [rcx+40], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+-24]
+ mov r15, QWORD PTR [rcx+-88]
+ mov rdi, QWORD PTR [rcx+88]
+ mov rsi, QWORD PTR [rcx+24]
+ mov rbx, QWORD PTR [rcx]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-88], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+88], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+24], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx], rbp
+ mov QWORD PTR [rcx+-24], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+72]
+ mov r15, QWORD PTR [rcx+48]
+ mov rdi, QWORD PTR [rcx+-16]
+ mov rsi, QWORD PTR [rcx+-40]
+ mov rbx, QWORD PTR [rcx+-64]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+48], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-16], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-40], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-64], rbp
+ mov QWORD PTR [rcx+72], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+8]
+ xor r12, QWORD PTR [rcx+-56]
+ xor r13, QWORD PTR [rcx+-80]
+ xor rax, QWORD PTR [rcx+96]
+ xor r10, QWORD PTR [rcx+32]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+8], r14
+ mov QWORD PTR [rcx+-56], r15
+ mov QWORD PTR [rcx+-80], rdi
+ mov QWORD PTR [rcx+96], rsi
+ mov QWORD PTR [rcx+32], rbx
+ ; Round 3
+ xor r14, r9
+ xor r15, QWORD PTR [rcx+-88]
+ xor rsi, QWORD PTR [rcx+-72]
+ xor rbx, QWORD PTR [rcx+-64]
+ xor rdi, QWORD PTR [rcx+-48]
+ xor rsi, QWORD PTR [rcx+-40]
+ xor rbx, QWORD PTR [rcx+-32]
+ xor r14, QWORD PTR [rcx+-24]
+ xor rdi, QWORD PTR [rcx+-16]
+ xor rsi, QWORD PTR [rcx+-8]
+ xor rbx, QWORD PTR [rcx]
+ xor r15, QWORD PTR [rcx+16]
+ xor rsi, QWORD PTR [rcx+24]
+ xor r14, QWORD PTR [rcx+40]
+ xor r15, QWORD PTR [rcx+48]
+ xor rdi, QWORD PTR [rcx+56]
+ xor rbx, QWORD PTR [rcx+64]
+ xor r14, QWORD PTR [rcx+72]
+ xor r15, QWORD PTR [rcx+80]
+ xor rdi, QWORD PTR [rcx+88]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+16]
+ mov rdi, QWORD PTR [rcx+88]
+ mov rsi, QWORD PTR [rcx+-40]
+ mov rbx, QWORD PTR [rcx+32]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+16], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+88], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-40], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+32], rsi
+ ; XOR in constant
+ mov rbx, 9223372039002292224
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+-8]
+ mov r15, QWORD PTR [rcx+64]
+ mov rdi, QWORD PTR [rcx+-24]
+ mov rsi, QWORD PTR [rcx+48]
+ mov rbx, QWORD PTR [rcx+-80]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+64], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-24], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+48], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-80], rbp
+ mov QWORD PTR [rcx+-8], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+80]
+ mov r15, QWORD PTR [rcx+-48]
+ mov rdi, QWORD PTR [rcx+24]
+ mov rsi, QWORD PTR [rcx+-64]
+ mov rbx, QWORD PTR [rcx+8]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-48], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+24], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-64], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+8], rbp
+ mov QWORD PTR [rcx+80], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+-32]
+ mov r15, QWORD PTR [rcx+40]
+ mov rdi, QWORD PTR [rcx+-88]
+ mov rsi, QWORD PTR [rcx+-16]
+ mov rbx, QWORD PTR [rcx+96]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+40], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-88], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-16], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+96], rbp
+ mov QWORD PTR [rcx+-32], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+56]
+ xor r12, QWORD PTR [rcx+-72]
+ xor r13, QWORD PTR [rcx]
+ xor rax, QWORD PTR [rcx+72]
+ xor r10, QWORD PTR [rcx+-56]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+56], r14
+ mov QWORD PTR [rcx+-72], r15
+ mov QWORD PTR [rcx], rdi
+ mov QWORD PTR [rcx+72], rsi
+ mov QWORD PTR [rcx+-56], rbx
+ ; Round 4
+ xor r14, r9
+ xor rdi, QWORD PTR [rcx+-88]
+ xor rbx, QWORD PTR [rcx+-80]
+ xor rsi, QWORD PTR [rcx+-64]
+ xor r15, QWORD PTR [rcx+-48]
+ xor rsi, QWORD PTR [rcx+-40]
+ xor r14, QWORD PTR [rcx+-32]
+ xor rdi, QWORD PTR [rcx+-24]
+ xor rsi, QWORD PTR [rcx+-16]
+ xor r14, QWORD PTR [rcx+-8]
+ xor rbx, QWORD PTR [rcx+8]
+ xor r15, QWORD PTR [rcx+16]
+ xor rdi, QWORD PTR [rcx+24]
+ xor rbx, QWORD PTR [rcx+32]
+ xor r15, QWORD PTR [rcx+40]
+ xor rsi, QWORD PTR [rcx+48]
+ xor r15, QWORD PTR [rcx+64]
+ xor r14, QWORD PTR [rcx+80]
+ xor rdi, QWORD PTR [rcx+88]
+ xor rbx, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+64]
+ mov rdi, QWORD PTR [rcx+24]
+ mov rsi, QWORD PTR [rcx+-16]
+ mov rbx, QWORD PTR [rcx+-56]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+64], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+24], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-16], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+-56], rsi
+ ; XOR in constant
+ xor r9, 32907
+ ; Row 1
+ mov r14, QWORD PTR [rcx+-40]
+ mov r15, QWORD PTR [rcx+-80]
+ mov rdi, QWORD PTR [rcx+80]
+ mov rsi, QWORD PTR [rcx+40]
+ mov rbx, QWORD PTR [rcx]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-80], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+80], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+40], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx], rbp
+ mov QWORD PTR [rcx+-40], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+-24]
+ mov rdi, QWORD PTR [rcx+-64]
+ mov rsi, QWORD PTR [rcx+96]
+ mov rbx, QWORD PTR [rcx+56]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-24], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-64], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+96], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+56], rbp
+ mov QWORD PTR [rcx+16], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+32]
+ mov r15, QWORD PTR [rcx+-8]
+ mov rdi, QWORD PTR [rcx+-48]
+ mov rsi, QWORD PTR [rcx+-88]
+ mov rbx, QWORD PTR [rcx+72]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-8], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-48], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-88], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+72], rbp
+ mov QWORD PTR [rcx+32], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+88]
+ xor r12, QWORD PTR [rcx+48]
+ xor r13, QWORD PTR [rcx+8]
+ xor rax, QWORD PTR [rcx+-32]
+ xor r10, QWORD PTR [rcx+-72]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+88], r14
+ mov QWORD PTR [rcx+48], r15
+ mov QWORD PTR [rcx+8], rdi
+ mov QWORD PTR [rcx+-32], rsi
+ mov QWORD PTR [rcx+-72], rbx
+ ; Round 5
+ xor r14, r9
+ xor rsi, QWORD PTR [rcx+-88]
+ xor r15, QWORD PTR [rcx+-80]
+ xor rdi, QWORD PTR [rcx+-64]
+ xor rbx, QWORD PTR [rcx+-56]
+ xor rdi, QWORD PTR [rcx+-48]
+ xor r14, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-24]
+ xor rsi, QWORD PTR [rcx+-16]
+ xor r15, QWORD PTR [rcx+-8]
+ xor rbx, QWORD PTR [rcx]
+ xor r14, QWORD PTR [rcx+16]
+ xor rdi, QWORD PTR [rcx+24]
+ xor r14, QWORD PTR [rcx+32]
+ xor rsi, QWORD PTR [rcx+40]
+ xor rbx, QWORD PTR [rcx+56]
+ xor r15, QWORD PTR [rcx+64]
+ xor rbx, QWORD PTR [rcx+72]
+ xor rdi, QWORD PTR [rcx+80]
+ xor rsi, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+-80]
+ mov rdi, QWORD PTR [rcx+-64]
+ mov rsi, QWORD PTR [rcx+-88]
+ mov rbx, QWORD PTR [rcx+-72]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-80], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-64], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-88], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+-72], rsi
+ ; XOR in constant
+ mov rbx, 2147483649
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+-16]
+ mov r15, QWORD PTR [rcx]
+ mov rdi, QWORD PTR [rcx+16]
+ mov rsi, QWORD PTR [rcx+-8]
+ mov rbx, QWORD PTR [rcx+8]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+16], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-8], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+8], rbp
+ mov QWORD PTR [rcx+-16], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+64]
+ mov r15, QWORD PTR [rcx+80]
+ mov rdi, QWORD PTR [rcx+96]
+ mov rsi, QWORD PTR [rcx+72]
+ mov rbx, QWORD PTR [rcx+88]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+80], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+96], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+72], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+88], rbp
+ mov QWORD PTR [rcx+64], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+-56]
+ mov r15, QWORD PTR [rcx+-40]
+ mov rdi, QWORD PTR [rcx+-24]
+ mov rsi, QWORD PTR [rcx+-48]
+ mov rbx, QWORD PTR [rcx+-32]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-40], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-24], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-48], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-32], rbp
+ mov QWORD PTR [rcx+-56], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+24]
+ xor r12, QWORD PTR [rcx+40]
+ xor r13, QWORD PTR [rcx+56]
+ xor rax, QWORD PTR [rcx+32]
+ xor r10, QWORD PTR [rcx+48]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+24], r14
+ mov QWORD PTR [rcx+40], r15
+ mov QWORD PTR [rcx+56], rdi
+ mov QWORD PTR [rcx+32], rsi
+ mov QWORD PTR [rcx+48], rbx
+ ; Round 6
+ xor r14, r9
+ xor rsi, QWORD PTR [rcx+-88]
+ xor r15, QWORD PTR [rcx+-80]
+ xor rbx, QWORD PTR [rcx+-72]
+ xor rdi, QWORD PTR [rcx+-64]
+ xor r14, QWORD PTR [rcx+-56]
+ xor rsi, QWORD PTR [rcx+-48]
+ xor r15, QWORD PTR [rcx+-40]
+ xor rbx, QWORD PTR [rcx+-32]
+ xor rdi, QWORD PTR [rcx+-24]
+ xor r14, QWORD PTR [rcx+-16]
+ xor rsi, QWORD PTR [rcx+-8]
+ xor r15, QWORD PTR [rcx]
+ xor rbx, QWORD PTR [rcx+8]
+ xor rdi, QWORD PTR [rcx+16]
+ xor r14, QWORD PTR [rcx+64]
+ xor rsi, QWORD PTR [rcx+72]
+ xor r15, QWORD PTR [rcx+80]
+ xor rbx, QWORD PTR [rcx+88]
+ xor rdi, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx]
+ mov rdi, QWORD PTR [rcx+96]
+ mov rsi, QWORD PTR [rcx+-48]
+ mov rbx, QWORD PTR [rcx+48]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+96], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-48], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+48], rsi
+ ; XOR in constant
+ mov rbx, 9223372039002292353
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+-88]
+ mov r15, QWORD PTR [rcx+8]
+ mov rdi, QWORD PTR [rcx+64]
+ mov rsi, QWORD PTR [rcx+-40]
+ mov rbx, QWORD PTR [rcx+56]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+8], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+64], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-40], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+56], rbp
+ mov QWORD PTR [rcx+-88], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+-80]
+ mov r15, QWORD PTR [rcx+16]
+ mov rdi, QWORD PTR [rcx+72]
+ mov rsi, QWORD PTR [rcx+-32]
+ mov rbx, QWORD PTR [rcx+24]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+16], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+72], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-32], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+24], rbp
+ mov QWORD PTR [rcx+-80], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+-72]
+ mov r15, QWORD PTR [rcx+-16]
+ mov rdi, QWORD PTR [rcx+80]
+ mov rsi, QWORD PTR [rcx+-24]
+ mov rbx, QWORD PTR [rcx+32]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-16], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+80], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-24], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+32], rbp
+ mov QWORD PTR [rcx+-72], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+-64]
+ xor r12, QWORD PTR [rcx+-8]
+ xor r13, QWORD PTR [rcx+88]
+ xor rax, QWORD PTR [rcx+-56]
+ xor r10, QWORD PTR [rcx+40]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+-64], r14
+ mov QWORD PTR [rcx+-8], r15
+ mov QWORD PTR [rcx+88], rdi
+ mov QWORD PTR [rcx+-56], rsi
+ mov QWORD PTR [rcx+40], rbx
+ ; Round 7
+ xor r14, r9
+ xor r14, QWORD PTR [rcx+-88]
+ xor r14, QWORD PTR [rcx+-80]
+ xor r14, QWORD PTR [rcx+-72]
+ xor rsi, QWORD PTR [rcx+-48]
+ xor rsi, QWORD PTR [rcx+-40]
+ xor rsi, QWORD PTR [rcx+-32]
+ xor rsi, QWORD PTR [rcx+-24]
+ xor r15, QWORD PTR [rcx+-16]
+ xor r15, QWORD PTR [rcx]
+ xor r15, QWORD PTR [rcx+8]
+ xor r15, QWORD PTR [rcx+16]
+ xor rbx, QWORD PTR [rcx+24]
+ xor rbx, QWORD PTR [rcx+32]
+ xor rbx, QWORD PTR [rcx+48]
+ xor rbx, QWORD PTR [rcx+56]
+ xor rdi, QWORD PTR [rcx+64]
+ xor rdi, QWORD PTR [rcx+72]
+ xor rdi, QWORD PTR [rcx+80]
+ xor rdi, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+8]
+ mov rdi, QWORD PTR [rcx+72]
+ mov rsi, QWORD PTR [rcx+-24]
+ mov rbx, QWORD PTR [rcx+40]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+8], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+72], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-24], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+40], rsi
+ ; XOR in constant
+ mov rbx, 9223372036854808585
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+-48]
+ mov r15, QWORD PTR [rcx+56]
+ mov rdi, QWORD PTR [rcx+-80]
+ mov rsi, QWORD PTR [rcx+-16]
+ mov rbx, QWORD PTR [rcx+88]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+56], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-80], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-16], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+88], rbp
+ mov QWORD PTR [rcx+-48], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx]
+ mov r15, QWORD PTR [rcx+64]
+ mov rdi, QWORD PTR [rcx+-32]
+ mov rsi, QWORD PTR [rcx+32]
+ mov rbx, QWORD PTR [rcx+-64]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+64], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-32], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+32], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-64], rbp
+ mov QWORD PTR [rcx], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+48]
+ mov r15, QWORD PTR [rcx+-88]
+ mov rdi, QWORD PTR [rcx+16]
+ mov rsi, QWORD PTR [rcx+80]
+ mov rbx, QWORD PTR [rcx+-56]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-88], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+16], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+80], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-56], rbp
+ mov QWORD PTR [rcx+48], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+96]
+ xor r12, QWORD PTR [rcx+-40]
+ xor r13, QWORD PTR [rcx+24]
+ xor rax, QWORD PTR [rcx+-72]
+ xor r10, QWORD PTR [rcx+-8]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+96], r14
+ mov QWORD PTR [rcx+-40], r15
+ mov QWORD PTR [rcx+24], rdi
+ mov QWORD PTR [rcx+-72], rsi
+ mov QWORD PTR [rcx+-8], rbx
+ ; Round 8
+ xor r14, r9
+ xor r15, QWORD PTR [rcx+-88]
+ xor rdi, QWORD PTR [rcx+-80]
+ xor rbx, QWORD PTR [rcx+-64]
+ xor rbx, QWORD PTR [rcx+-56]
+ xor r14, QWORD PTR [rcx+-48]
+ xor rdi, QWORD PTR [rcx+-32]
+ xor rsi, QWORD PTR [rcx+-24]
+ xor rsi, QWORD PTR [rcx+-16]
+ xor r14, QWORD PTR [rcx]
+ xor r15, QWORD PTR [rcx+8]
+ xor rdi, QWORD PTR [rcx+16]
+ xor rsi, QWORD PTR [rcx+32]
+ xor rbx, QWORD PTR [rcx+40]
+ xor r14, QWORD PTR [rcx+48]
+ xor r15, QWORD PTR [rcx+56]
+ xor r15, QWORD PTR [rcx+64]
+ xor rdi, QWORD PTR [rcx+72]
+ xor rsi, QWORD PTR [rcx+80]
+ xor rbx, QWORD PTR [rcx+88]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+56]
+ mov rdi, QWORD PTR [rcx+-32]
+ mov rsi, QWORD PTR [rcx+80]
+ mov rbx, QWORD PTR [rcx+-8]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+56], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-32], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+80], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+-8], rsi
+ ; XOR in constant
+ xor r9, 138
+ ; Row 1
+ mov r14, QWORD PTR [rcx+-24]
+ mov r15, QWORD PTR [rcx+88]
+ mov rdi, QWORD PTR [rcx]
+ mov rsi, QWORD PTR [rcx+-88]
+ mov rbx, QWORD PTR [rcx+24]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+88], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-88], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+24], rbp
+ mov QWORD PTR [rcx+-24], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+8]
+ mov r15, QWORD PTR [rcx+-80]
+ mov rdi, QWORD PTR [rcx+32]
+ mov rsi, QWORD PTR [rcx+-56]
+ mov rbx, QWORD PTR [rcx+96]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-80], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+32], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-56], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+96], rbp
+ mov QWORD PTR [rcx+8], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+40]
+ mov r15, QWORD PTR [rcx+-48]
+ mov rdi, QWORD PTR [rcx+64]
+ mov rsi, QWORD PTR [rcx+16]
+ mov rbx, QWORD PTR [rcx+-72]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-48], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+64], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+16], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-72], rbp
+ mov QWORD PTR [rcx+40], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+72]
+ xor r12, QWORD PTR [rcx+-16]
+ xor r13, QWORD PTR [rcx+-64]
+ xor rax, QWORD PTR [rcx+48]
+ xor r10, QWORD PTR [rcx+-40]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+72], r14
+ mov QWORD PTR [rcx+-16], r15
+ mov QWORD PTR [rcx+-64], rdi
+ mov QWORD PTR [rcx+48], rsi
+ mov QWORD PTR [rcx+-40], rbx
+ ; Round 9
+ xor r14, r9
+ xor rsi, QWORD PTR [rcx+-88]
+ xor r15, QWORD PTR [rcx+-80]
+ xor rbx, QWORD PTR [rcx+-72]
+ xor rsi, QWORD PTR [rcx+-56]
+ xor r15, QWORD PTR [rcx+-48]
+ xor rdi, QWORD PTR [rcx+-32]
+ xor r14, QWORD PTR [rcx+-24]
+ xor rbx, QWORD PTR [rcx+-8]
+ xor rdi, QWORD PTR [rcx]
+ xor r14, QWORD PTR [rcx+8]
+ xor rsi, QWORD PTR [rcx+16]
+ xor rbx, QWORD PTR [rcx+24]
+ xor rdi, QWORD PTR [rcx+32]
+ xor r14, QWORD PTR [rcx+40]
+ xor r15, QWORD PTR [rcx+56]
+ xor rdi, QWORD PTR [rcx+64]
+ xor rsi, QWORD PTR [rcx+80]
+ xor r15, QWORD PTR [rcx+88]
+ xor rbx, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+88]
+ mov rdi, QWORD PTR [rcx+32]
+ mov rsi, QWORD PTR [rcx+16]
+ mov rbx, QWORD PTR [rcx+-40]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+88], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+32], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+16], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+-40], rsi
+ ; XOR in constant
+ xor r9, 136
+ ; Row 1
+ mov r14, QWORD PTR [rcx+80]
+ mov r15, QWORD PTR [rcx+24]
+ mov rdi, QWORD PTR [rcx+8]
+ mov rsi, QWORD PTR [rcx+-48]
+ mov rbx, QWORD PTR [rcx+-64]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+24], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+8], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-48], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-64], rbp
+ mov QWORD PTR [rcx+80], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+56]
+ mov r15, QWORD PTR [rcx]
+ mov rdi, QWORD PTR [rcx+-56]
+ mov rsi, QWORD PTR [rcx+-72]
+ mov rbx, QWORD PTR [rcx+72]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-56], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-72], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+72], rbp
+ mov QWORD PTR [rcx+56], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+-8]
+ mov r15, QWORD PTR [rcx+-24]
+ mov rdi, QWORD PTR [rcx+-80]
+ mov rsi, QWORD PTR [rcx+64]
+ mov rbx, QWORD PTR [rcx+48]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-24], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-80], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+64], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+48], rbp
+ mov QWORD PTR [rcx+-8], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+-32]
+ xor r12, QWORD PTR [rcx+-88]
+ xor r13, QWORD PTR [rcx+96]
+ xor rax, QWORD PTR [rcx+40]
+ xor r10, QWORD PTR [rcx+-16]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+-32], r14
+ mov QWORD PTR [rcx+-88], r15
+ mov QWORD PTR [rcx+96], rdi
+ mov QWORD PTR [rcx+40], rsi
+ mov QWORD PTR [rcx+-16], rbx
+ ; Round 10
+ xor r14, r9
+ xor rdi, QWORD PTR [rcx+-80]
+ xor rsi, QWORD PTR [rcx+-72]
+ xor rbx, QWORD PTR [rcx+-64]
+ xor rdi, QWORD PTR [rcx+-56]
+ xor rsi, QWORD PTR [rcx+-48]
+ xor rbx, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-24]
+ xor r14, QWORD PTR [rcx+-8]
+ xor r15, QWORD PTR [rcx]
+ xor rdi, QWORD PTR [rcx+8]
+ xor rsi, QWORD PTR [rcx+16]
+ xor r15, QWORD PTR [rcx+24]
+ xor rdi, QWORD PTR [rcx+32]
+ xor rbx, QWORD PTR [rcx+48]
+ xor r14, QWORD PTR [rcx+56]
+ xor rsi, QWORD PTR [rcx+64]
+ xor rbx, QWORD PTR [rcx+72]
+ xor r14, QWORD PTR [rcx+80]
+ xor r15, QWORD PTR [rcx+88]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+24]
+ mov rdi, QWORD PTR [rcx+-56]
+ mov rsi, QWORD PTR [rcx+64]
+ mov rbx, QWORD PTR [rcx+-16]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+24], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-56], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+64], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+-16], rsi
+ ; XOR in constant
+ mov rbx, 2147516425
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+-64]
+ mov rdi, QWORD PTR [rcx+56]
+ mov rsi, QWORD PTR [rcx+-24]
+ mov rbx, QWORD PTR [rcx+96]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-64], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+56], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-24], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+96], rbp
+ mov QWORD PTR [rcx+16], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+88]
+ mov r15, QWORD PTR [rcx+8]
+ mov rdi, QWORD PTR [rcx+-72]
+ mov rsi, QWORD PTR [rcx+48]
+ mov rbx, QWORD PTR [rcx+-32]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+8], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-72], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+48], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-32], rbp
+ mov QWORD PTR [rcx+88], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+-40]
+ mov r15, QWORD PTR [rcx+80]
+ mov rdi, QWORD PTR [rcx]
+ mov rsi, QWORD PTR [rcx+-80]
+ mov rbx, QWORD PTR [rcx+40]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+80], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-80], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+40], rbp
+ mov QWORD PTR [rcx+-40], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+32]
+ xor r12, QWORD PTR [rcx+-48]
+ xor r13, QWORD PTR [rcx+72]
+ xor rax, QWORD PTR [rcx+-8]
+ xor r10, QWORD PTR [rcx+-88]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+32], r14
+ mov QWORD PTR [rcx+-48], r15
+ mov QWORD PTR [rcx+72], rdi
+ mov QWORD PTR [rcx+-8], rsi
+ mov QWORD PTR [rcx+-88], rbx
+ ; Round 11
+ xor r14, r9
+ xor rsi, QWORD PTR [rcx+-80]
+ xor rdi, QWORD PTR [rcx+-72]
+ xor r15, QWORD PTR [rcx+-64]
+ xor rdi, QWORD PTR [rcx+-56]
+ xor r14, QWORD PTR [rcx+-40]
+ xor rbx, QWORD PTR [rcx+-32]
+ xor rsi, QWORD PTR [rcx+-24]
+ xor rbx, QWORD PTR [rcx+-16]
+ xor rdi, QWORD PTR [rcx]
+ xor r15, QWORD PTR [rcx+8]
+ xor r14, QWORD PTR [rcx+16]
+ xor r15, QWORD PTR [rcx+24]
+ xor rbx, QWORD PTR [rcx+40]
+ xor rsi, QWORD PTR [rcx+48]
+ xor rdi, QWORD PTR [rcx+56]
+ xor rsi, QWORD PTR [rcx+64]
+ xor r15, QWORD PTR [rcx+80]
+ xor r14, QWORD PTR [rcx+88]
+ xor rbx, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+-64]
+ mov rdi, QWORD PTR [rcx+-72]
+ mov rsi, QWORD PTR [rcx+-80]
+ mov rbx, QWORD PTR [rcx+-88]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-64], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-72], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-80], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+-88], rsi
+ ; XOR in constant
+ mov rbx, 2147483658
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+64]
+ mov r15, QWORD PTR [rcx+96]
+ mov rdi, QWORD PTR [rcx+88]
+ mov rsi, QWORD PTR [rcx+80]
+ mov rbx, QWORD PTR [rcx+72]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+96], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+88], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+80], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+72], rbp
+ mov QWORD PTR [rcx+64], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+24]
+ mov r15, QWORD PTR [rcx+56]
+ mov rdi, QWORD PTR [rcx+48]
+ mov rsi, QWORD PTR [rcx+40]
+ mov rbx, QWORD PTR [rcx+32]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+56], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+48], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+40], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+32], rbp
+ mov QWORD PTR [rcx+24], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+-16]
+ mov r15, QWORD PTR [rcx+16]
+ mov rdi, QWORD PTR [rcx+8]
+ mov rsi, QWORD PTR [rcx]
+ mov rbx, QWORD PTR [rcx+-8]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+16], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+8], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-8], rbp
+ mov QWORD PTR [rcx+-16], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+-56]
+ xor r12, QWORD PTR [rcx+-24]
+ xor r13, QWORD PTR [rcx+-32]
+ xor rax, QWORD PTR [rcx+-40]
+ xor r10, QWORD PTR [rcx+-48]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+-56], r14
+ mov QWORD PTR [rcx+-24], r15
+ mov QWORD PTR [rcx+-32], rdi
+ mov QWORD PTR [rcx+-40], rsi
+ mov QWORD PTR [rcx+-48], rbx
+ ; Round 12
+ xor r14, r9
+ xor rbx, QWORD PTR [rcx+-88]
+ xor rsi, QWORD PTR [rcx+-80]
+ xor rdi, QWORD PTR [rcx+-72]
+ xor r15, QWORD PTR [rcx+-64]
+ xor r14, QWORD PTR [rcx+-16]
+ xor rbx, QWORD PTR [rcx+-8]
+ xor rsi, QWORD PTR [rcx]
+ xor rdi, QWORD PTR [rcx+8]
+ xor r15, QWORD PTR [rcx+16]
+ xor r14, QWORD PTR [rcx+24]
+ xor rbx, QWORD PTR [rcx+32]
+ xor rsi, QWORD PTR [rcx+40]
+ xor rdi, QWORD PTR [rcx+48]
+ xor r15, QWORD PTR [rcx+56]
+ xor r14, QWORD PTR [rcx+64]
+ xor rbx, QWORD PTR [rcx+72]
+ xor rsi, QWORD PTR [rcx+80]
+ xor rdi, QWORD PTR [rcx+88]
+ xor r15, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+96]
+ mov rdi, QWORD PTR [rcx+48]
+ mov rsi, QWORD PTR [rcx]
+ mov rbx, QWORD PTR [rcx+-48]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+96], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+48], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+-48], rsi
+ ; XOR in constant
+ mov rbx, 2147516555
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+-80]
+ mov r15, QWORD PTR [rcx+72]
+ mov rdi, QWORD PTR [rcx+24]
+ mov rsi, QWORD PTR [rcx+16]
+ mov rbx, QWORD PTR [rcx+-32]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+72], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+24], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+16], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-32], rbp
+ mov QWORD PTR [rcx+-80], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+-64]
+ mov r15, QWORD PTR [rcx+88]
+ mov rdi, QWORD PTR [rcx+40]
+ mov rsi, QWORD PTR [rcx+-8]
+ mov rbx, QWORD PTR [rcx+-56]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+88], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+40], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-8], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-56], rbp
+ mov QWORD PTR [rcx+-64], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+-88]
+ mov r15, QWORD PTR [rcx+64]
+ mov rdi, QWORD PTR [rcx+56]
+ mov rsi, QWORD PTR [rcx+8]
+ mov rbx, QWORD PTR [rcx+-40]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+64], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+56], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+8], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-40], rbp
+ mov QWORD PTR [rcx+-88], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+-72]
+ xor r12, QWORD PTR [rcx+80]
+ xor r13, QWORD PTR [rcx+32]
+ xor rax, QWORD PTR [rcx+-16]
+ xor r10, QWORD PTR [rcx+-24]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+-72], r14
+ mov QWORD PTR [rcx+80], r15
+ mov QWORD PTR [rcx+32], rdi
+ mov QWORD PTR [rcx+-16], rsi
+ mov QWORD PTR [rcx+-24], rbx
+ ; Round 13
+ xor r14, r9
+ xor r14, QWORD PTR [rcx+-88]
+ xor r14, QWORD PTR [rcx+-80]
+ xor r14, QWORD PTR [rcx+-64]
+ xor rbx, QWORD PTR [rcx+-56]
+ xor rbx, QWORD PTR [rcx+-48]
+ xor rbx, QWORD PTR [rcx+-40]
+ xor rbx, QWORD PTR [rcx+-32]
+ xor rsi, QWORD PTR [rcx+-8]
+ xor rsi, QWORD PTR [rcx]
+ xor rsi, QWORD PTR [rcx+8]
+ xor rsi, QWORD PTR [rcx+16]
+ xor rdi, QWORD PTR [rcx+24]
+ xor rdi, QWORD PTR [rcx+40]
+ xor rdi, QWORD PTR [rcx+48]
+ xor rdi, QWORD PTR [rcx+56]
+ xor r15, QWORD PTR [rcx+64]
+ xor r15, QWORD PTR [rcx+72]
+ xor r15, QWORD PTR [rcx+88]
+ xor r15, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+72]
+ mov rdi, QWORD PTR [rcx+40]
+ mov rsi, QWORD PTR [rcx+8]
+ mov rbx, QWORD PTR [rcx+-24]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+72], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+40], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+8], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+-24], rsi
+ ; XOR in constant
+ mov rbx, 9223372036854775947
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx]
+ mov r15, QWORD PTR [rcx+-32]
+ mov rdi, QWORD PTR [rcx+-64]
+ mov rsi, QWORD PTR [rcx+64]
+ mov rbx, QWORD PTR [rcx+32]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-32], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-64], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+64], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+32], rbp
+ mov QWORD PTR [rcx], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+96]
+ mov r15, QWORD PTR [rcx+24]
+ mov rdi, QWORD PTR [rcx+-8]
+ mov rsi, QWORD PTR [rcx+-40]
+ mov rbx, QWORD PTR [rcx+-72]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+24], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-8], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-40], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-72], rbp
+ mov QWORD PTR [rcx+96], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+-48]
+ mov r15, QWORD PTR [rcx+-80]
+ mov rdi, QWORD PTR [rcx+88]
+ mov rsi, QWORD PTR [rcx+56]
+ mov rbx, QWORD PTR [rcx+-16]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-80], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+88], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+56], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-16], rbp
+ mov QWORD PTR [rcx+-48], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+48]
+ xor r12, QWORD PTR [rcx+16]
+ xor r13, QWORD PTR [rcx+-56]
+ xor rax, QWORD PTR [rcx+-88]
+ xor r10, QWORD PTR [rcx+80]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+48], r14
+ mov QWORD PTR [rcx+16], r15
+ mov QWORD PTR [rcx+-56], rdi
+ mov QWORD PTR [rcx+-88], rsi
+ mov QWORD PTR [rcx+80], rbx
+ ; Round 14
+ xor r14, r9
+ xor r15, QWORD PTR [rcx+-80]
+ xor rbx, QWORD PTR [rcx+-72]
+ xor rdi, QWORD PTR [rcx+-64]
+ xor r14, QWORD PTR [rcx+-48]
+ xor rsi, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-32]
+ xor rbx, QWORD PTR [rcx+-24]
+ xor rbx, QWORD PTR [rcx+-16]
+ xor rdi, QWORD PTR [rcx+-8]
+ xor r14, QWORD PTR [rcx]
+ xor rsi, QWORD PTR [rcx+8]
+ xor r15, QWORD PTR [rcx+24]
+ xor rbx, QWORD PTR [rcx+32]
+ xor rdi, QWORD PTR [rcx+40]
+ xor rsi, QWORD PTR [rcx+56]
+ xor rsi, QWORD PTR [rcx+64]
+ xor r15, QWORD PTR [rcx+72]
+ xor rdi, QWORD PTR [rcx+88]
+ xor r14, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+-32]
+ mov rdi, QWORD PTR [rcx+-8]
+ mov rsi, QWORD PTR [rcx+56]
+ mov rbx, QWORD PTR [rcx+80]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-32], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-8], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+56], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+80], rsi
+ ; XOR in constant
+ mov rbx, 9223372036854808713
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+8]
+ mov r15, QWORD PTR [rcx+32]
+ mov rdi, QWORD PTR [rcx+96]
+ mov rsi, QWORD PTR [rcx+-80]
+ mov rbx, QWORD PTR [rcx+-56]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+32], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+96], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-80], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-56], rbp
+ mov QWORD PTR [rcx+8], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+72]
+ mov r15, QWORD PTR [rcx+-64]
+ mov rdi, QWORD PTR [rcx+-40]
+ mov rsi, QWORD PTR [rcx+-16]
+ mov rbx, QWORD PTR [rcx+48]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-64], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-40], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-16], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+48], rbp
+ mov QWORD PTR [rcx+72], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+-24]
+ mov r15, QWORD PTR [rcx]
+ mov rdi, QWORD PTR [rcx+24]
+ mov rsi, QWORD PTR [rcx+88]
+ mov rbx, QWORD PTR [rcx+-88]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+24], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+88], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-88], rbp
+ mov QWORD PTR [rcx+-24], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+40]
+ xor r12, QWORD PTR [rcx+64]
+ xor r13, QWORD PTR [rcx+-72]
+ xor rax, QWORD PTR [rcx+-48]
+ xor r10, QWORD PTR [rcx+16]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+40], r14
+ mov QWORD PTR [rcx+64], r15
+ mov QWORD PTR [rcx+-72], rdi
+ mov QWORD PTR [rcx+-48], rsi
+ mov QWORD PTR [rcx+16], rbx
+ ; Round 15
+ xor r14, r9
+ xor rbx, QWORD PTR [rcx+-88]
+ xor rsi, QWORD PTR [rcx+-80]
+ xor r15, QWORD PTR [rcx+-64]
+ xor rbx, QWORD PTR [rcx+-56]
+ xor rdi, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-32]
+ xor r14, QWORD PTR [rcx+-24]
+ xor rsi, QWORD PTR [rcx+-16]
+ xor rdi, QWORD PTR [rcx+-8]
+ xor r15, QWORD PTR [rcx]
+ xor r14, QWORD PTR [rcx+8]
+ xor rdi, QWORD PTR [rcx+24]
+ xor r15, QWORD PTR [rcx+32]
+ xor rbx, QWORD PTR [rcx+48]
+ xor rsi, QWORD PTR [rcx+56]
+ xor r14, QWORD PTR [rcx+72]
+ xor rbx, QWORD PTR [rcx+80]
+ xor rsi, QWORD PTR [rcx+88]
+ xor rdi, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+32]
+ mov rdi, QWORD PTR [rcx+-40]
+ mov rsi, QWORD PTR [rcx+88]
+ mov rbx, QWORD PTR [rcx+16]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+32], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-40], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+88], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+16], rsi
+ ; XOR in constant
+ mov rbx, 9223372036854808579
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+56]
+ mov r15, QWORD PTR [rcx+-56]
+ mov rdi, QWORD PTR [rcx+72]
+ mov rsi, QWORD PTR [rcx]
+ mov rbx, QWORD PTR [rcx+-72]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-56], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+72], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-72], rbp
+ mov QWORD PTR [rcx+56], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+-32]
+ mov r15, QWORD PTR [rcx+96]
+ mov rdi, QWORD PTR [rcx+-16]
+ mov rsi, QWORD PTR [rcx+-88]
+ mov rbx, QWORD PTR [rcx+40]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+96], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-16], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-88], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+40], rbp
+ mov QWORD PTR [rcx+-32], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+80]
+ mov r15, QWORD PTR [rcx+8]
+ mov rdi, QWORD PTR [rcx+-64]
+ mov rsi, QWORD PTR [rcx+24]
+ mov rbx, QWORD PTR [rcx+-48]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+8], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-64], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+24], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-48], rbp
+ mov QWORD PTR [rcx+80], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+-8]
+ xor r12, QWORD PTR [rcx+-80]
+ xor r13, QWORD PTR [rcx+48]
+ xor rax, QWORD PTR [rcx+-24]
+ xor r10, QWORD PTR [rcx+64]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+-8], r14
+ mov QWORD PTR [rcx+-80], r15
+ mov QWORD PTR [rcx+48], rdi
+ mov QWORD PTR [rcx+-24], rsi
+ mov QWORD PTR [rcx+64], rbx
+ ; Round 16
+ xor r14, r9
+ xor rsi, QWORD PTR [rcx+-88]
+ xor rbx, QWORD PTR [rcx+-72]
+ xor rdi, QWORD PTR [rcx+-64]
+ xor r15, QWORD PTR [rcx+-56]
+ xor rbx, QWORD PTR [rcx+-48]
+ xor rdi, QWORD PTR [rcx+-40]
+ xor r14, QWORD PTR [rcx+-32]
+ xor rdi, QWORD PTR [rcx+-16]
+ xor rsi, QWORD PTR [rcx]
+ xor r15, QWORD PTR [rcx+8]
+ xor rbx, QWORD PTR [rcx+16]
+ xor rsi, QWORD PTR [rcx+24]
+ xor r15, QWORD PTR [rcx+32]
+ xor rbx, QWORD PTR [rcx+40]
+ xor r14, QWORD PTR [rcx+56]
+ xor rdi, QWORD PTR [rcx+72]
+ xor r14, QWORD PTR [rcx+80]
+ xor rsi, QWORD PTR [rcx+88]
+ xor r15, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+-56]
+ mov rdi, QWORD PTR [rcx+-16]
+ mov rsi, QWORD PTR [rcx+24]
+ mov rbx, QWORD PTR [rcx+64]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-56], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-16], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+24], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+64], rsi
+ ; XOR in constant
+ mov rbx, 9223372036854808578
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+88]
+ mov r15, QWORD PTR [rcx+-72]
+ mov rdi, QWORD PTR [rcx+-32]
+ mov rsi, QWORD PTR [rcx+8]
+ mov rbx, QWORD PTR [rcx+48]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-72], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-32], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+8], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+48], rbp
+ mov QWORD PTR [rcx+88], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+32]
+ mov r15, QWORD PTR [rcx+72]
+ mov rdi, QWORD PTR [rcx+-88]
+ mov rsi, QWORD PTR [rcx+-48]
+ mov rbx, QWORD PTR [rcx+-8]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+72], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-88], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-48], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-8], rbp
+ mov QWORD PTR [rcx+32], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+16]
+ mov r15, QWORD PTR [rcx+56]
+ mov rdi, QWORD PTR [rcx+96]
+ mov rsi, QWORD PTR [rcx+-64]
+ mov rbx, QWORD PTR [rcx+-24]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+56], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+96], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-64], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-24], rbp
+ mov QWORD PTR [rcx+16], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+-40]
+ xor r12, QWORD PTR [rcx]
+ xor r13, QWORD PTR [rcx+40]
+ xor rax, QWORD PTR [rcx+80]
+ xor r10, QWORD PTR [rcx+-80]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+-40], r14
+ mov QWORD PTR [rcx], r15
+ mov QWORD PTR [rcx+40], rdi
+ mov QWORD PTR [rcx+80], rsi
+ mov QWORD PTR [rcx+-80], rbx
+ ; Round 17
+ xor r14, r9
+ xor rdi, QWORD PTR [rcx+-88]
+ xor r15, QWORD PTR [rcx+-72]
+ xor rsi, QWORD PTR [rcx+-64]
+ xor r15, QWORD PTR [rcx+-56]
+ xor rsi, QWORD PTR [rcx+-48]
+ xor rdi, QWORD PTR [rcx+-32]
+ xor rbx, QWORD PTR [rcx+-24]
+ xor rdi, QWORD PTR [rcx+-16]
+ xor rbx, QWORD PTR [rcx+-8]
+ xor rsi, QWORD PTR [rcx+8]
+ xor r14, QWORD PTR [rcx+16]
+ xor rsi, QWORD PTR [rcx+24]
+ xor r14, QWORD PTR [rcx+32]
+ xor rbx, QWORD PTR [rcx+48]
+ xor r15, QWORD PTR [rcx+56]
+ xor rbx, QWORD PTR [rcx+64]
+ xor r15, QWORD PTR [rcx+72]
+ xor r14, QWORD PTR [rcx+88]
+ xor rdi, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+-72]
+ mov rdi, QWORD PTR [rcx+-88]
+ mov rsi, QWORD PTR [rcx+-64]
+ mov rbx, QWORD PTR [rcx+-80]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-72], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-88], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-64], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+-80], rsi
+ ; XOR in constant
+ mov rbx, 9223372036854775936
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+24]
+ mov r15, QWORD PTR [rcx+48]
+ mov rdi, QWORD PTR [rcx+32]
+ mov rsi, QWORD PTR [rcx+56]
+ mov rbx, QWORD PTR [rcx+40]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+48], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+32], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+56], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+40], rbp
+ mov QWORD PTR [rcx+24], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+-56]
+ mov r15, QWORD PTR [rcx+-32]
+ mov rdi, QWORD PTR [rcx+-48]
+ mov rsi, QWORD PTR [rcx+-24]
+ mov rbx, QWORD PTR [rcx+-40]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-32], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-48], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-24], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-40], rbp
+ mov QWORD PTR [rcx+-56], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+64]
+ mov r15, QWORD PTR [rcx+88]
+ mov rdi, QWORD PTR [rcx+72]
+ mov rsi, QWORD PTR [rcx+96]
+ mov rbx, QWORD PTR [rcx+80]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+88], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+72], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+96], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+80], rbp
+ mov QWORD PTR [rcx+64], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+-16]
+ xor r12, QWORD PTR [rcx+8]
+ xor r13, QWORD PTR [rcx+-8]
+ xor rax, QWORD PTR [rcx+16]
+ xor r10, QWORD PTR [rcx]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+-16], r14
+ mov QWORD PTR [rcx+8], r15
+ mov QWORD PTR [rcx+-8], rdi
+ mov QWORD PTR [rcx+16], rsi
+ mov QWORD PTR [rcx], rbx
+ ; Round 18
+ xor r14, r9
+ xor rdi, QWORD PTR [rcx+-88]
+ xor rbx, QWORD PTR [rcx+-80]
+ xor r15, QWORD PTR [rcx+-72]
+ xor rsi, QWORD PTR [rcx+-64]
+ xor r14, QWORD PTR [rcx+-56]
+ xor rdi, QWORD PTR [rcx+-48]
+ xor rbx, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-32]
+ xor rsi, QWORD PTR [rcx+-24]
+ xor r14, QWORD PTR [rcx+24]
+ xor rdi, QWORD PTR [rcx+32]
+ xor rbx, QWORD PTR [rcx+40]
+ xor r15, QWORD PTR [rcx+48]
+ xor rsi, QWORD PTR [rcx+56]
+ xor r14, QWORD PTR [rcx+64]
+ xor rdi, QWORD PTR [rcx+72]
+ xor rbx, QWORD PTR [rcx+80]
+ xor r15, QWORD PTR [rcx+88]
+ xor rsi, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+48]
+ mov rdi, QWORD PTR [rcx+-48]
+ mov rsi, QWORD PTR [rcx+96]
+ mov rbx, QWORD PTR [rcx]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+48], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-48], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+96], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx], rsi
+ ; XOR in constant
+ xor r9, 32778
+ ; Row 1
+ mov r14, QWORD PTR [rcx+-64]
+ mov r15, QWORD PTR [rcx+40]
+ mov rdi, QWORD PTR [rcx+-56]
+ mov rsi, QWORD PTR [rcx+88]
+ mov rbx, QWORD PTR [rcx+-8]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+40], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-56], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+88], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-8], rbp
+ mov QWORD PTR [rcx+-64], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+-72]
+ mov r15, QWORD PTR [rcx+32]
+ mov rdi, QWORD PTR [rcx+-24]
+ mov rsi, QWORD PTR [rcx+80]
+ mov rbx, QWORD PTR [rcx+-16]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+32], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-24], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+80], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-16], rbp
+ mov QWORD PTR [rcx+-72], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+-80]
+ mov r15, QWORD PTR [rcx+24]
+ mov rdi, QWORD PTR [rcx+-32]
+ mov rsi, QWORD PTR [rcx+72]
+ mov rbx, QWORD PTR [rcx+16]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+24], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-32], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+72], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+16], rbp
+ mov QWORD PTR [rcx+-80], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+-88]
+ xor r12, QWORD PTR [rcx+56]
+ xor r13, QWORD PTR [rcx+-40]
+ xor rax, QWORD PTR [rcx+64]
+ xor r10, QWORD PTR [rcx+8]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+-88], r14
+ mov QWORD PTR [rcx+56], r15
+ mov QWORD PTR [rcx+-40], rdi
+ mov QWORD PTR [rcx+64], rsi
+ mov QWORD PTR [rcx+8], rbx
+ ; Round 19
+ xor r14, r9
+ xor r14, QWORD PTR [rcx+-80]
+ xor r14, QWORD PTR [rcx+-72]
+ xor r14, QWORD PTR [rcx+-64]
+ xor rdi, QWORD PTR [rcx+-56]
+ xor rdi, QWORD PTR [rcx+-48]
+ xor rdi, QWORD PTR [rcx+-32]
+ xor rdi, QWORD PTR [rcx+-24]
+ xor rbx, QWORD PTR [rcx+-16]
+ xor rbx, QWORD PTR [rcx+-8]
+ xor rbx, QWORD PTR [rcx]
+ xor rbx, QWORD PTR [rcx+16]
+ xor r15, QWORD PTR [rcx+24]
+ xor r15, QWORD PTR [rcx+32]
+ xor r15, QWORD PTR [rcx+40]
+ xor r15, QWORD PTR [rcx+48]
+ xor rsi, QWORD PTR [rcx+72]
+ xor rsi, QWORD PTR [rcx+80]
+ xor rsi, QWORD PTR [rcx+88]
+ xor rsi, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+40]
+ mov rdi, QWORD PTR [rcx+-24]
+ mov rsi, QWORD PTR [rcx+72]
+ mov rbx, QWORD PTR [rcx+8]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+40], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-24], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+72], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+8], rsi
+ ; XOR in constant
+ mov rbx, 9223372039002259466
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+96]
+ mov r15, QWORD PTR [rcx+-8]
+ mov rdi, QWORD PTR [rcx+-72]
+ mov rsi, QWORD PTR [rcx+24]
+ mov rbx, QWORD PTR [rcx+-40]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-8], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-72], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+24], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-40], rbp
+ mov QWORD PTR [rcx+96], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+48]
+ mov r15, QWORD PTR [rcx+-56]
+ mov rdi, QWORD PTR [rcx+80]
+ mov rsi, QWORD PTR [rcx+16]
+ mov rbx, QWORD PTR [rcx+-88]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-56], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+80], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+16], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-88], rbp
+ mov QWORD PTR [rcx+48], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx]
+ mov r15, QWORD PTR [rcx+-64]
+ mov rdi, QWORD PTR [rcx+32]
+ mov rsi, QWORD PTR [rcx+-32]
+ mov rbx, QWORD PTR [rcx+64]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-64], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+32], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-32], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+64], rbp
+ mov QWORD PTR [rcx], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+-48]
+ xor r12, QWORD PTR [rcx+88]
+ xor r13, QWORD PTR [rcx+-16]
+ xor rax, QWORD PTR [rcx+-80]
+ xor r10, QWORD PTR [rcx+56]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+-48], r14
+ mov QWORD PTR [rcx+88], r15
+ mov QWORD PTR [rcx+-16], rdi
+ mov QWORD PTR [rcx+-80], rsi
+ mov QWORD PTR [rcx+56], rbx
+ ; Round 20
+ xor r14, r9
+ xor rbx, QWORD PTR [rcx+-88]
+ xor rdi, QWORD PTR [rcx+-72]
+ xor r15, QWORD PTR [rcx+-64]
+ xor r15, QWORD PTR [rcx+-56]
+ xor rbx, QWORD PTR [rcx+-40]
+ xor rsi, QWORD PTR [rcx+-32]
+ xor rdi, QWORD PTR [rcx+-24]
+ xor r15, QWORD PTR [rcx+-8]
+ xor r14, QWORD PTR [rcx]
+ xor rbx, QWORD PTR [rcx+8]
+ xor rsi, QWORD PTR [rcx+16]
+ xor rsi, QWORD PTR [rcx+24]
+ xor rdi, QWORD PTR [rcx+32]
+ xor r15, QWORD PTR [rcx+40]
+ xor r14, QWORD PTR [rcx+48]
+ xor rbx, QWORD PTR [rcx+64]
+ xor rsi, QWORD PTR [rcx+72]
+ xor rdi, QWORD PTR [rcx+80]
+ xor r14, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+-8]
+ mov rdi, QWORD PTR [rcx+80]
+ mov rsi, QWORD PTR [rcx+-32]
+ mov rbx, QWORD PTR [rcx+56]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-8], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+80], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-32], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+56], rsi
+ ; XOR in constant
+ mov rbx, 9223372039002292353
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+72]
+ mov r15, QWORD PTR [rcx+-40]
+ mov rdi, QWORD PTR [rcx+48]
+ mov rsi, QWORD PTR [rcx+-64]
+ mov rbx, QWORD PTR [rcx+-16]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-40], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+48], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-64], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-16], rbp
+ mov QWORD PTR [rcx+72], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+40]
+ mov r15, QWORD PTR [rcx+-72]
+ mov rdi, QWORD PTR [rcx+16]
+ mov rsi, QWORD PTR [rcx+64]
+ mov rbx, QWORD PTR [rcx+-48]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-72], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+16], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+64], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-48], rbp
+ mov QWORD PTR [rcx+40], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+8]
+ mov r15, QWORD PTR [rcx+96]
+ mov rdi, QWORD PTR [rcx+-56]
+ mov rsi, QWORD PTR [rcx+32]
+ mov rbx, QWORD PTR [rcx+-80]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+96], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-56], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+32], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-80], rbp
+ mov QWORD PTR [rcx+8], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+-24]
+ xor r12, QWORD PTR [rcx+24]
+ xor r13, QWORD PTR [rcx+-88]
+ xor rax, QWORD PTR [rcx]
+ xor r10, QWORD PTR [rcx+88]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+-24], r14
+ mov QWORD PTR [rcx+24], r15
+ mov QWORD PTR [rcx+-88], rdi
+ mov QWORD PTR [rcx], rsi
+ mov QWORD PTR [rcx+88], rbx
+ ; Round 21
+ xor r14, r9
+ xor rbx, QWORD PTR [rcx+-80]
+ xor r15, QWORD PTR [rcx+-72]
+ xor rsi, QWORD PTR [rcx+-64]
+ xor rdi, QWORD PTR [rcx+-56]
+ xor rbx, QWORD PTR [rcx+-48]
+ xor r15, QWORD PTR [rcx+-40]
+ xor rsi, QWORD PTR [rcx+-32]
+ xor rbx, QWORD PTR [rcx+-16]
+ xor r15, QWORD PTR [rcx+-8]
+ xor r14, QWORD PTR [rcx+8]
+ xor rdi, QWORD PTR [rcx+16]
+ xor rsi, QWORD PTR [rcx+32]
+ xor r14, QWORD PTR [rcx+40]
+ xor rdi, QWORD PTR [rcx+48]
+ xor rbx, QWORD PTR [rcx+56]
+ xor rsi, QWORD PTR [rcx+64]
+ xor r14, QWORD PTR [rcx+72]
+ xor rdi, QWORD PTR [rcx+80]
+ xor r15, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+-40]
+ mov rdi, QWORD PTR [rcx+16]
+ mov rsi, QWORD PTR [rcx+32]
+ mov rbx, QWORD PTR [rcx+88]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-40], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+16], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+32], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+88], rsi
+ ; XOR in constant
+ mov rbx, 9223372036854808704
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+-32]
+ mov r15, QWORD PTR [rcx+-16]
+ mov rdi, QWORD PTR [rcx+40]
+ mov rsi, QWORD PTR [rcx+96]
+ mov rbx, QWORD PTR [rcx+-88]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-16], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+40], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+96], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-88], rbp
+ mov QWORD PTR [rcx+-32], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+-8]
+ mov r15, QWORD PTR [rcx+48]
+ mov rdi, QWORD PTR [rcx+64]
+ mov rsi, QWORD PTR [rcx+-80]
+ mov rbx, QWORD PTR [rcx+-24]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+48], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+64], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-80], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-24], rbp
+ mov QWORD PTR [rcx+-8], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+56]
+ mov r15, QWORD PTR [rcx+72]
+ mov rdi, QWORD PTR [rcx+-72]
+ mov rsi, QWORD PTR [rcx+-56]
+ mov rbx, QWORD PTR [rcx]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+72], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-72], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-56], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx], rbp
+ mov QWORD PTR [rcx+56], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+80]
+ xor r12, QWORD PTR [rcx+-64]
+ xor r13, QWORD PTR [rcx+-48]
+ xor rax, QWORD PTR [rcx+8]
+ xor r10, QWORD PTR [rcx+24]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+80], r14
+ mov QWORD PTR [rcx+-64], r15
+ mov QWORD PTR [rcx+-48], rdi
+ mov QWORD PTR [rcx+8], rsi
+ mov QWORD PTR [rcx+24], rbx
+ ; Round 22
+ xor r14, r9
+ xor rbx, QWORD PTR [rcx+-88]
+ xor rsi, QWORD PTR [rcx+-80]
+ xor rdi, QWORD PTR [rcx+-72]
+ xor rsi, QWORD PTR [rcx+-56]
+ xor r15, QWORD PTR [rcx+-40]
+ xor r14, QWORD PTR [rcx+-32]
+ xor rbx, QWORD PTR [rcx+-24]
+ xor r15, QWORD PTR [rcx+-16]
+ xor r14, QWORD PTR [rcx+-8]
+ xor rbx, QWORD PTR [rcx]
+ xor rdi, QWORD PTR [rcx+16]
+ xor rsi, QWORD PTR [rcx+32]
+ xor rdi, QWORD PTR [rcx+40]
+ xor r15, QWORD PTR [rcx+48]
+ xor r14, QWORD PTR [rcx+56]
+ xor rdi, QWORD PTR [rcx+64]
+ xor r15, QWORD PTR [rcx+72]
+ xor rbx, QWORD PTR [rcx+88]
+ xor rsi, QWORD PTR [rcx+96]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+-16]
+ mov rdi, QWORD PTR [rcx+64]
+ mov rsi, QWORD PTR [rcx+-56]
+ mov rbx, QWORD PTR [rcx+24]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-16], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+64], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-56], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+24], rsi
+ ; XOR in constant
+ mov rbx, 2147483649
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+32]
+ mov r15, QWORD PTR [rcx+-88]
+ mov rdi, QWORD PTR [rcx+-8]
+ mov rsi, QWORD PTR [rcx+72]
+ mov rbx, QWORD PTR [rcx+-48]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-88], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-8], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+72], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-48], rbp
+ mov QWORD PTR [rcx+32], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+-40]
+ mov r15, QWORD PTR [rcx+40]
+ mov rdi, QWORD PTR [rcx+-80]
+ mov rsi, QWORD PTR [rcx]
+ mov rbx, QWORD PTR [rcx+80]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+40], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-80], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+80], rbp
+ mov QWORD PTR [rcx+-40], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+88]
+ mov r15, QWORD PTR [rcx+-32]
+ mov rdi, QWORD PTR [rcx+48]
+ mov rsi, QWORD PTR [rcx+-72]
+ mov rbx, QWORD PTR [rcx+8]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-32], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+48], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-72], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+8], rbp
+ mov QWORD PTR [rcx+88], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+16]
+ xor r12, QWORD PTR [rcx+96]
+ xor r13, QWORD PTR [rcx+-24]
+ xor rax, QWORD PTR [rcx+56]
+ xor r10, QWORD PTR [rcx+-64]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+16], r14
+ mov QWORD PTR [rcx+96], r15
+ mov QWORD PTR [rcx+-24], rdi
+ mov QWORD PTR [rcx+56], rsi
+ mov QWORD PTR [rcx+-64], rbx
+ ; Round 23
+ xor r14, r9
+ xor r15, QWORD PTR [rcx+-88]
+ xor rdi, QWORD PTR [rcx+-80]
+ xor rsi, QWORD PTR [rcx+-72]
+ xor rsi, QWORD PTR [rcx+-56]
+ xor rbx, QWORD PTR [rcx+-48]
+ xor r14, QWORD PTR [rcx+-40]
+ xor r15, QWORD PTR [rcx+-32]
+ xor r15, QWORD PTR [rcx+-16]
+ xor rdi, QWORD PTR [rcx+-8]
+ xor rsi, QWORD PTR [rcx]
+ xor rbx, QWORD PTR [rcx+8]
+ xor rbx, QWORD PTR [rcx+24]
+ xor r14, QWORD PTR [rcx+32]
+ xor r15, QWORD PTR [rcx+40]
+ xor rdi, QWORD PTR [rcx+48]
+ xor rdi, QWORD PTR [rcx+64]
+ xor rsi, QWORD PTR [rcx+72]
+ xor rbx, QWORD PTR [rcx+80]
+ xor r14, QWORD PTR [rcx+88]
+ ; Calc t[0..4]
+ rorx rax, r15, 63
+ rorx r10, rdi, 63
+ rorx r11, rsi, 63
+ rorx r12, rbx, 63
+ rorx r13, r14, 63
+ xor rax, rbx
+ xor r10, r14
+ xor r11, r15
+ xor r12, rdi
+ xor r13, rsi
+ ; Row Mix
+ ; Row 0
+ mov r14, r9
+ mov r15, QWORD PTR [rcx+-88]
+ mov rdi, QWORD PTR [rcx+-80]
+ mov rsi, QWORD PTR [rcx+-72]
+ mov rbx, QWORD PTR [rcx+-64]
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ rol r15, 44
+ rol rdi, 43
+ rol rsi, 21
+ rol rbx, 14
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-88], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-80], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-72], rbp
+ andn rsi, r14, r15
+ andn r9, r15, rdi
+ xor rsi, rbx
+ xor r9, r14
+ mov QWORD PTR [rcx+-64], rsi
+ ; XOR in constant
+ mov rbx, 9223372039002292232
+ xor r9, rbx
+ ; Row 1
+ mov r14, QWORD PTR [rcx+-56]
+ mov r15, QWORD PTR [rcx+-48]
+ mov rdi, QWORD PTR [rcx+-40]
+ mov rsi, QWORD PTR [rcx+-32]
+ mov rbx, QWORD PTR [rcx+-24]
+ xor r14, r12
+ xor r15, r13
+ xor rdi, rax
+ xor rsi, r10
+ xor rbx, r11
+ rol r14, 28
+ rol r15, 20
+ rol rdi, 3
+ rol rsi, 45
+ rol rbx, 61
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-48], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+-40], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+-32], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+-24], rbp
+ mov QWORD PTR [rcx+-56], rsi
+ ; Row 2
+ mov r14, QWORD PTR [rcx+-16]
+ mov r15, QWORD PTR [rcx+-8]
+ mov rdi, QWORD PTR [rcx]
+ mov rsi, QWORD PTR [rcx+8]
+ mov rbx, QWORD PTR [rcx+16]
+ xor r14, r10
+ xor r15, r11
+ xor rdi, r12
+ xor rsi, r13
+ xor rbx, rax
+ rol r14, 1
+ rol r15, 6
+ rol rdi, 25
+ rol rsi, 8
+ rol rbx, 18
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+-8], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+8], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+16], rbp
+ mov QWORD PTR [rcx+-16], rsi
+ ; Row 3
+ mov r14, QWORD PTR [rcx+24]
+ mov r15, QWORD PTR [rcx+32]
+ mov rdi, QWORD PTR [rcx+40]
+ mov rsi, QWORD PTR [rcx+48]
+ mov rbx, QWORD PTR [rcx+56]
+ xor r14, r13
+ xor r15, rax
+ xor rdi, r10
+ xor rsi, r11
+ xor rbx, r12
+ rol r14, 27
+ rol r15, 36
+ rol rdi, 10
+ rol rsi, 15
+ rol rbx, 56
+ andn rbp, rdi, rsi
+ xor rbp, r15
+ mov QWORD PTR [rcx+32], rbp
+ andn rbp, rsi, rbx
+ xor rbp, rdi
+ mov QWORD PTR [rcx+40], rbp
+ andn rbp, rbx, r14
+ xor rbp, rsi
+ mov QWORD PTR [rcx+48], rbp
+ andn rbp, r14, r15
+ andn rsi, r15, rdi
+ xor rbp, rbx
+ xor rsi, r14
+ mov QWORD PTR [rcx+56], rbp
+ mov QWORD PTR [rcx+24], rsi
+ ; Row 4
+ xor r11, QWORD PTR [rcx+64]
+ xor r12, QWORD PTR [rcx+72]
+ xor r13, QWORD PTR [rcx+80]
+ xor rax, QWORD PTR [rcx+88]
+ xor r10, QWORD PTR [rcx+96]
+ rorx r14, r11, 2
+ rorx r15, r12, 9
+ rorx rdi, r13, 25
+ rorx rsi, rax, 23
+ rorx rbx, r10, 62
+ andn rax, r15, rdi
+ andn r10, rdi, rsi
+ andn r11, rsi, rbx
+ andn r12, rbx, r14
+ andn r13, r14, r15
+ xor r14, rax
+ xor r15, r10
+ xor rdi, r11
+ xor rsi, r12
+ xor rbx, r13
+ mov QWORD PTR [rcx+64], r14
+ mov QWORD PTR [rcx+72], r15
+ mov QWORD PTR [rcx+80], rdi
+ mov QWORD PTR [rcx+88], rsi
+ mov QWORD PTR [rcx+96], rbx
+ add rdx, QWORD PTR [rsp]
+ sub r8d, 1
+ mov rbp, QWORD PTR [rsp]
+ jg L_sha3_block_n_bmi2_start
+ mov QWORD PTR [rcx+-96], r9
+ pop rbp
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+sha3_block_n_bmi2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; holds the per-lane left-shift counts used by sha3_block_avx2's "Shuffle - Rotate" step
+ALIGN 16 ; 16-byte alignment only; sha3_block_avx2 reads the table with unaligned vmovdqu loads
+L_sha3_block_avx2_rotl QWORD 0000000000000001h, 000000000000003eh ; bytes 0-31: vpsllvq counts for ymm1
+ QWORD 000000000000001ch, 000000000000001bh ; each count here plus the same slot of L_sha3_block_avx2_rotr is 64
+ QWORD 000000000000002ch, 0000000000000006h ; bytes 32-63: vpsllvq counts for ymm2
+ QWORD 0000000000000037h, 0000000000000014h
+ QWORD 000000000000000ah, 000000000000002bh ; bytes 64-95: vpsllvq counts for ymm3
+ QWORD 0000000000000019h, 0000000000000027h
+ QWORD 000000000000002dh, 000000000000000fh ; bytes 96-127: vpsllvq counts for ymm4
+ QWORD 0000000000000015h, 0000000000000008h
+ QWORD 0000000000000024h, 0000000000000003h ; bytes 128-159: vpsllvq counts for ymm5
+ QWORD 0000000000000029h, 0000000000000012h
+ QWORD 0000000000000002h, 000000000000003dh ; bytes 160-191: vpsllvq counts for ymm6
+ QWORD 0000000000000038h, 000000000000000eh
+ptr_L_sha3_block_avx2_rotl QWORD L_sha3_block_avx2_rotl ; address of the table; sha3_block_avx2 loads it into rax and adds 64
+_DATA ENDS
+_DATA SEGMENT ; holds the per-lane right-shift counts used by sha3_block_avx2's "Shuffle - Rotate" step
+ALIGN 16 ; 16-byte alignment only; sha3_block_avx2 reads the table with unaligned vmovdqu loads
+L_sha3_block_avx2_rotr QWORD 000000000000003fh, 0000000000000002h ; bytes 0-31: vpsrlvq counts for ymm1
+ QWORD 0000000000000024h, 0000000000000025h ; each count is 64 minus the same slot of L_sha3_block_avx2_rotl, so shl|shr forms a rotate
+ QWORD 0000000000000014h, 000000000000003ah ; bytes 32-63: vpsrlvq counts for ymm2
+ QWORD 0000000000000009h, 000000000000002ch
+ QWORD 0000000000000036h, 0000000000000015h ; bytes 64-95: vpsrlvq counts for ymm3
+ QWORD 0000000000000027h, 0000000000000019h
+ QWORD 0000000000000013h, 0000000000000031h ; bytes 96-127: vpsrlvq counts for ymm4
+ QWORD 000000000000002bh, 0000000000000038h
+ QWORD 000000000000001ch, 000000000000003dh ; bytes 128-159: vpsrlvq counts for ymm5
+ QWORD 0000000000000017h, 000000000000002eh
+ QWORD 000000000000003eh, 0000000000000003h ; bytes 160-191: vpsrlvq counts for ymm6
+ QWORD 0000000000000008h, 0000000000000032h
+ptr_L_sha3_block_avx2_rotr QWORD L_sha3_block_avx2_rotr ; address of the table; sha3_block_avx2 loads it into r8 and adds 64
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+sha3_block_avx2 PROC ; in: rcx = pointer to 25 qwords (200 bytes) of state; runs 24 rounds and stores the result back to [rcx]; uses rax, rdx, r8, r9 as scratch
+ sub rsp, 160 ; 10 x 16-byte save slots for xmm6-xmm15
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm15 are saved here and reloaded before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ mov rdx, QWORD PTR [ptr_L_sha3_avx2_r] ; rdx = cursor into the round-constant table (defined outside this chunk); advanced 32 bytes per round
+ mov rax, QWORD PTR [ptr_L_sha3_block_avx2_rotl] ; rax = left-shift count table
+ add rax, 64 ; bias by 64 so the six 32-byte rows are addressed as -64..+96
+ mov r8, QWORD PTR [ptr_L_sha3_block_avx2_rotr] ; r8 = right-shift count table
+ add r8, 64 ; same -64..+96 bias as rax
+ mov r9, 24 ; round counter
+ vpbroadcastq ymm0, QWORD PTR [rcx] ; ymm0 = state qword 0 replicated in all four lanes
+ vmovdqu ymm1, YMMWORD PTR [rcx+8] ; ymm1 = qwords 1-4
+ vmovdqu ymm2, YMMWORD PTR [rcx+40] ; ymm2 = qwords 5-8
+ vmovdqu ymm3, YMMWORD PTR [rcx+72] ; ymm3 = qwords 9-12
+ vmovdqu ymm4, YMMWORD PTR [rcx+104] ; ymm4 = qwords 13-16
+ vmovdqu ymm5, YMMWORD PTR [rcx+136] ; ymm5 = qwords 17-20
+ vmovdqu ymm6, YMMWORD PTR [rcx+168] ; ymm6 = qwords 21-24
+ vpermq ymm7, ymm2, 57 ; ymm7 = q6,q7,q8,q5
+ vpermq ymm8, ymm3, 30 ; ymm8 = q11,q12,q10,q9
+ vpermq ymm9, ymm4, 75 ; ymm9 = q16,q15,q13,q14
+ vpermq ymm10, ymm5, 147 ; ymm10 = q20,q17,q18,q19
+ vpblendd ymm11, ymm2, ymm3, 12 ; ymm11 = q5,q10,q7,q8
+ vpblendd ymm12, ymm4, ymm5, 192 ; ymm12 = q13,q14,q15,q20
+ vpblendd ymm2, ymm7, ymm8, 192 ; ymm2 = q6,q7,q8,q9
+ vpblendd ymm3, ymm8, ymm9, 240 ; ymm3 = q11,q12,q13,q14
+ vpblendd ymm4, ymm10, ymm9, 3 ; ymm4 = q16,q17,q18,q19
+ vpblendd ymm5, ymm11, ymm12, 240 ; ymm5 = q5,q10,q15,q20 (loop layout: ymm1-4,ymm6 hold qwords 5y+1..5y+4, ymm5 holds qwords 5,10,15,20)
+L_sha3_block_avx2_start: ; top of the per-round loop (r9 iterations)
+ ; Calc b[0..4] - XOR of the five state qwords sharing each index mod 5
+ vpshufd ymm7, ymm5, 238 ; ymm7 = q10,q10,q20,q20
+ vpxor ymm15, ymm1, ymm2
+ vpxor ymm14, ymm5, ymm7 ; lane 0 = q5^q10, lane 2 = q15^q20
+ vpxor ymm12, ymm3, ymm4
+ vpermq ymm7, ymm14, 170 ; broadcast lane 2 (q15^q20)
+ vpxor ymm14, ymm14, ymm0
+ vpxor ymm14, ymm14, ymm7 ; lane 0 = q0^q5^q10^q15^q20 = b[0]
+ vpxor ymm15, ymm15, ymm6
+ vpxor ymm15, ymm15, ymm12 ; ymm15 = b[1],b[2],b[3],b[4]
+ vpermq ymm14, ymm14, 0 ; ymm14 = b[0] in every lane
+ ; XOR in b[x+4]
+ vpermq ymm7, ymm15, 147 ; b4,b1,b2,b3
+ vpermq ymm9, ymm15, 57 ; b2,b3,b4,b1
+ vpermq ymm10, ymm15, 0 ; b1 in every lane
+ vpermq ymm15, ymm15, 255 ; b4 in every lane
+ vpblendd ymm9, ymm9, ymm14, 192 ; ymm9 = b2,b3,b4,b0
+ vpblendd ymm14, ymm7, ymm14, 3 ; ymm14 = b0,b1,b2,b3
+ ; Rotate left 1
+ vpsrlq ymm8, ymm10, 63
+ vpaddq ymm10, ymm10, ymm10 ; x+x is x shl 1
+ vpsrlq ymm7, ymm9, 63
+ vpaddq ymm9, ymm9, ymm9
+ vpor ymm10, ymm10, ymm8 ; ymm10 = rotl(b1, 1)
+ vpor ymm9, ymm9, ymm7 ; ymm9 = rotl(b2,b3,b4,b0 by 1)
+ vpxor ymm10, ymm10, ymm15 ; value for index 0: rotl(b1,1) ^ b4
+ vpxor ymm9, ymm9, ymm14 ; values for indices 1-4: rotl(b[x+1],1) ^ b[x-1]
+ ; XOR in ROTL64(b[x+1])
+ vpxor ymm0, ymm0, ymm10 ; ymm0 and ymm5 hold the index-0 qwords
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm9
+ vpxor ymm3, ymm3, ymm9
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm9
+ ; Shuffle - Rotate (ymm0 is not rotated here)
+ vmovdqu ymm7, YMMWORD PTR [r8+-64] ; right-shift counts, rows 0-2
+ vmovdqu ymm9, YMMWORD PTR [r8+-32]
+ vmovdqu ymm11, YMMWORD PTR [r8]
+ vmovdqu ymm8, YMMWORD PTR [rax+-64] ; left-shift counts, rows 0-2
+ vmovdqu ymm10, YMMWORD PTR [rax+-32]
+ vmovdqu ymm12, YMMWORD PTR [rax]
+ vpsrlvq ymm7, ymm1, ymm7
+ vpsrlvq ymm9, ymm2, ymm9
+ vpsrlvq ymm11, ymm3, ymm11
+ vpsllvq ymm1, ymm1, ymm8
+ vpsllvq ymm2, ymm2, ymm10
+ vpsllvq ymm3, ymm3, ymm12
+ vpor ymm1, ymm1, ymm7 ; left-shifted OR right-shifted = per-lane rotate
+ vpor ymm2, ymm2, ymm9
+ vpor ymm3, ymm3, ymm11
+ vmovdqu ymm7, YMMWORD PTR [r8+32] ; right-shift counts, rows 3-5
+ vmovdqu ymm9, YMMWORD PTR [r8+64]
+ vmovdqu ymm11, YMMWORD PTR [r8+96]
+ vmovdqu ymm8, YMMWORD PTR [rax+32] ; left-shift counts, rows 3-5
+ vmovdqu ymm10, YMMWORD PTR [rax+64]
+ vmovdqu ymm12, YMMWORD PTR [rax+96]
+ vpsrlvq ymm7, ymm4, ymm7
+ vpsrlvq ymm9, ymm5, ymm9
+ vpsrlvq ymm11, ymm6, ymm11
+ vpsllvq ymm4, ymm4, ymm8
+ vpsllvq ymm5, ymm5, ymm10
+ vpsllvq ymm6, ymm6, ymm12
+ vpor ymm4, ymm4, ymm7
+ vpor ymm5, ymm5, ymm9
+ vpor ymm6, ymm6, ymm11
+ ; Row Mix - each output is a ^ (~b & c) over neighbouring values
+ vpermq ymm12, ymm2, 0 ; broadcast lane 0 of ymm2
+ vpermq ymm13, ymm3, 85 ; broadcast lane 1 of ymm3
+ vpermq ymm14, ymm4, 170 ; broadcast lane 2 of ymm4
+ vpermq ymm15, ymm6, 255 ; broadcast lane 3 of ymm6
+ vpandn ymm7, ymm13, ymm14 ; vpandn d,a,b computes (~a) & b
+ vpandn ymm8, ymm14, ymm15
+ vpandn ymm9, ymm15, ymm0
+ vpandn ymm10, ymm0, ymm12
+ vpandn ymm11, ymm12, ymm13
+ vpxor ymm12, ymm12, ymm7
+ vpxor ymm13, ymm13, ymm8
+ vpxor ymm14, ymm14, ymm9
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm0, ymm0, ymm11 ; new qword 0, still replicated in every lane
+ vpermq ymm7, ymm5, 141
+ vpblendd ymm10, ymm12, ymm13, 12
+ vpermq ymm11, ymm1, 114
+ vpblendd ymm9, ymm14, ymm15, 192
+ vpermq ymm12, ymm2, 135
+ vpblendd ymm1, ymm10, ymm9, 240 ; new ymm1 = lanes 0..3 taken from ymm12..ymm15 respectively
+ vpermq ymm13, ymm3, 201
+ vpermq ymm14, ymm4, 156
+ vpermq ymm15, ymm6, 45
+ vpblendd ymm12, ymm12, ymm7, 48 ; insert one lane of the permuted ymm5 into each of ymm12-ymm15
+ vpblendd ymm13, ymm13, ymm7, 3
+ vpblendd ymm14, ymm14, ymm7, 192
+ vpblendd ymm15, ymm15, ymm7, 12
+ vpandn ymm5, ymm12, ymm13
+ vpandn ymm7, ymm13, ymm14
+ vpandn ymm2, ymm14, ymm15
+ vpandn ymm3, ymm15, ymm11
+ vpandn ymm4, ymm11, ymm12
+ vpxor ymm5, ymm11, ymm5 ; new ymm5
+ vpxor ymm12, ymm12, ymm7
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vpxor ymm15, ymm15, ymm4
+ vperm2i128 ymm3, ymm12, ymm14, 32 ; low 128-bit halves of ymm12 and ymm14
+ vperm2i128 ymm7, ymm13, ymm15, 32 ; low halves of ymm13 and ymm15
+ vperm2i128 ymm6, ymm12, ymm14, 49 ; high halves of ymm12 and ymm14
+ vperm2i128 ymm8, ymm13, ymm15, 49 ; high halves of ymm13 and ymm15
+ vpunpcklqdq ymm2, ymm3, ymm7 ; 4x4 qword transpose of ymm12-ymm15: ymm2 = their lane 0
+ vpunpckhqdq ymm3, ymm3, ymm7 ; ymm3 = their lane 1
+ vpunpcklqdq ymm4, ymm6, ymm8 ; ymm4 = their lane 2
+ vpunpckhqdq ymm6, ymm6, ymm8 ; ymm6 = their lane 3
+ vpxor ymm0, ymm0, [rdx] ; XOR this round's 32-byte constant entry into qword 0
+ add rdx, 32 ; next round's constant entry
+ sub r9, 1
+ jnz L_sha3_block_avx2_start ; loop until all 24 rounds are done
+ vpermq ymm7, ymm2, 147 ; undo the loop layout: rebuild linear qwords 5-20 in ymm2-ymm5
+ vpermq ymm8, ymm3, 78
+ vpermq ymm9, ymm4, 57
+ vpblendd ymm2, ymm7, ymm5, 3 ; ymm2 = q5,q6,q7,q8
+ vpblendd ymm3, ymm8, ymm7, 3
+ vpblendd ymm3, ymm3, ymm5, 12 ; ymm3 = q9,q10,q11,q12
+ vpblendd ymm4, ymm8, ymm9, 192
+ vpblendd ymm4, ymm4, ymm5, 48 ; ymm4 = q13,q14,q15,q16
+ vpblendd ymm5, ymm9, ymm5, 192 ; ymm5 = q17,q18,q19,q20
+ vmovq QWORD PTR [rcx], xmm0 ; store qword 0 (low lane of the broadcast register)
+ vmovdqu YMMWORD PTR [rcx+8], ymm1
+ vmovdqu YMMWORD PTR [rcx+40], ymm2
+ vmovdqu YMMWORD PTR [rcx+72], ymm3
+ vmovdqu YMMWORD PTR [rcx+104], ymm4
+ vmovdqu YMMWORD PTR [rcx+136], ymm5
+ vmovdqu YMMWORD PTR [rcx+168], ymm6
+ vzeroupper ; zero the upper halves of all ymm registers
+ vmovdqu xmm6, OWORD PTR [rsp] ; reload the xmm6-xmm15 values saved on entry
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160 ; release the save area
+ ret
+sha3_block_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; left-shift counts for the per-lane 64-bit rotates in sha3_block_n_avx2
+ALIGN 16 ; the procedure reads this table with unaligned 32-byte loads (vmovdqu)
+L_sha3_block_n_avx2_rotl QWORD 0000000000000001h, 000000000000003eh ; 1, 62  - row 0 (table+0), applied to ymm1 lanes 0-1
+ QWORD 000000000000001ch, 000000000000001bh ; 28, 27 - row 0, ymm1 lanes 2-3
+ QWORD 000000000000002ch, 0000000000000006h ; 44, 6  - row 1 (table+32), ymm2 lanes 0-1
+ QWORD 0000000000000037h, 0000000000000014h ; 55, 20 - row 1, ymm2 lanes 2-3
+ QWORD 000000000000000ah, 000000000000002bh ; 10, 43 - row 2 (table+64), ymm3 lanes 0-1
+ QWORD 0000000000000019h, 0000000000000027h ; 25, 39 - row 2, ymm3 lanes 2-3
+ QWORD 000000000000002dh, 000000000000000fh ; 45, 15 - row 3 (table+96), ymm4 lanes 0-1
+ QWORD 0000000000000015h, 0000000000000008h ; 21, 8  - row 3, ymm4 lanes 2-3
+ QWORD 0000000000000024h, 0000000000000003h ; 36, 3  - row 4 (table+128), ymm5 lanes 0-1
+ QWORD 0000000000000029h, 0000000000000012h ; 41, 18 - row 4, ymm5 lanes 2-3
+ QWORD 0000000000000002h, 000000000000003dh ; 2, 61  - row 5 (table+160), ymm6 lanes 0-1
+ QWORD 0000000000000038h, 000000000000000eh ; 56, 14 - row 5, ymm6 lanes 2-3
+ptr_L_sha3_block_n_avx2_rotl QWORD L_sha3_block_n_avx2_rotl ; address of the table; sha3_block_n_avx2 loads it into r10
+_DATA ENDS
+_DATA SEGMENT ; right-shift counts paired with L_sha3_block_n_avx2_rotl; every entry equals 64 minus the matching rotl entry
+ALIGN 16 ; the procedure reads this table with unaligned 32-byte loads (vmovdqu)
+L_sha3_block_n_avx2_rotr QWORD 000000000000003fh, 0000000000000002h ; 63, 2  - row 0 (table+0), applied to ymm1 lanes 0-1
+ QWORD 0000000000000024h, 0000000000000025h ; 36, 37 - row 0, ymm1 lanes 2-3
+ QWORD 0000000000000014h, 000000000000003ah ; 20, 58 - row 1 (table+32), ymm2 lanes 0-1
+ QWORD 0000000000000009h, 000000000000002ch ; 9, 44  - row 1, ymm2 lanes 2-3
+ QWORD 0000000000000036h, 0000000000000015h ; 54, 21 - row 2 (table+64), ymm3 lanes 0-1
+ QWORD 0000000000000027h, 0000000000000019h ; 39, 25 - row 2, ymm3 lanes 2-3
+ QWORD 0000000000000013h, 0000000000000031h ; 19, 49 - row 3 (table+96), ymm4 lanes 0-1
+ QWORD 000000000000002bh, 0000000000000038h ; 43, 56 - row 3, ymm4 lanes 2-3
+ QWORD 000000000000001ch, 000000000000003dh ; 28, 61 - row 4 (table+128), ymm5 lanes 0-1
+ QWORD 0000000000000017h, 000000000000002eh ; 23, 46 - row 4, ymm5 lanes 2-3
+ QWORD 000000000000003eh, 0000000000000003h ; 62, 3  - row 5 (table+160), ymm6 lanes 0-1
+ QWORD 0000000000000008h, 0000000000000032h ; 8, 50  - row 5, ymm6 lanes 2-3
+ptr_L_sha3_block_n_avx2_rotr QWORD L_sha3_block_n_avx2_rotr ; address of the table; sha3_block_n_avx2 loads it into r11
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+sha3_block_n_avx2 PROC ; in: rcx = 25-qword state, rdx = input data, r8d = number of blocks, r9 = bytes per block; XORs each block into the state and runs 24 rounds per block
+ push r12 ; r12 is used as the round counter and restored before ret
+ sub rsp, 160 ; 10 x 16-byte save slots for xmm6-xmm15
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm15 are saved here and reloaded before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ mov rax, QWORD PTR [ptr_L_sha3_avx2_r] ; rax = cursor into the round-constant table (defined outside this chunk)
+ mov r10, QWORD PTR [ptr_L_sha3_block_n_avx2_rotl] ; r10 = left-shift count table
+ add r10, 64 ; bias by 64 so the six 32-byte rows are addressed as -64..+96
+ mov r11, QWORD PTR [ptr_L_sha3_block_n_avx2_rotr] ; r11 = right-shift count table
+ add r11, 64 ; same -64..+96 bias as r10
+ vpbroadcastq ymm0, QWORD PTR [rcx] ; ymm0 = state qword 0 replicated in all four lanes
+ vmovdqu ymm1, YMMWORD PTR [rcx+8] ; ymm1 = qwords 1-4
+ vmovdqu ymm2, YMMWORD PTR [rcx+40] ; ymm2 = qwords 5-8
+ vmovdqu ymm3, YMMWORD PTR [rcx+72] ; ymm3 = qwords 9-12
+ vmovdqu ymm4, YMMWORD PTR [rcx+104] ; ymm4 = qwords 13-16
+ vmovdqu ymm5, YMMWORD PTR [rcx+136] ; ymm5 = qwords 17-20
+ vmovdqu ymm6, YMMWORD PTR [rcx+168] ; ymm6 = qwords 21-24
+ mov r12, 24 ; round counter for the first block
+ cmp r9, 136 ; first block: state is still in linear order, so data is XORed in unpermuted
+ je L_sha3_block_n_avx2_load_256_1
+ cmp r9, 168
+ je L_sha3_block_n_avx2_load_128_1
+ cmp r9, 144
+ je L_sha3_block_n_avx2_load_224_1
+ cmp r9, 104
+ je L_sha3_block_n_avx2_load_384_1
+ vpbroadcastq ymm7, QWORD PTR [rdx] ; fall-through for any other r9: absorbs 72 bytes (qwords 0-8). NOTE(review): r9 is not checked to be 72 here - confirm callers only pass the five handled sizes
+ vmovdqu ymm8, YMMWORD PTR [rdx+8]
+ vmovdqu ymm9, YMMWORD PTR [rdx+40]
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm8
+ vpxor ymm2, ymm2, ymm9
+ jmp L_sha3_block_n_avx2_start_1
+L_sha3_block_n_avx2_load_128_1: ; r9 = 168: absorb qwords 0-20
+ vpbroadcastq ymm7, QWORD PTR [rdx]
+ vmovdqu ymm8, YMMWORD PTR [rdx+8]
+ vmovdqu ymm9, YMMWORD PTR [rdx+40]
+ vmovdqu ymm10, YMMWORD PTR [rdx+72]
+ vmovdqu ymm11, YMMWORD PTR [rdx+104]
+ vmovdqu ymm12, YMMWORD PTR [rdx+136]
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm8
+ vpxor ymm2, ymm2, ymm9
+ vpxor ymm3, ymm3, ymm10
+ vpxor ymm4, ymm4, ymm11
+ vpxor ymm5, ymm5, ymm12
+ jmp L_sha3_block_n_avx2_start_1
+L_sha3_block_n_avx2_load_224_1: ; r9 = 144: absorb qwords 0-17
+ vpxor ymm12, ymm12, ymm12 ; zero ymm12 so only its low qword carries data after the vmovq below
+ vpbroadcastq ymm7, QWORD PTR [rdx]
+ vmovdqu ymm8, YMMWORD PTR [rdx+8]
+ vmovdqu ymm9, YMMWORD PTR [rdx+40]
+ vmovdqu ymm10, YMMWORD PTR [rdx+72]
+ vmovdqu ymm11, YMMWORD PTR [rdx+104]
+ vmovq xmm12, QWORD PTR [rdx+136] ; single qword 17; upper lanes stay zero
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm8
+ vpxor ymm2, ymm2, ymm9
+ vpxor ymm3, ymm3, ymm10
+ vpxor ymm4, ymm4, ymm11
+ vpxor ymm5, ymm5, ymm12
+ jmp L_sha3_block_n_avx2_start_1
+L_sha3_block_n_avx2_load_384_1: ; r9 = 104: absorb qwords 0-12
+ vpbroadcastq ymm7, QWORD PTR [rdx]
+ vmovdqu ymm8, YMMWORD PTR [rdx+8]
+ vmovdqu ymm9, YMMWORD PTR [rdx+40]
+ vmovdqu ymm10, YMMWORD PTR [rdx+72]
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm8
+ vpxor ymm2, ymm2, ymm9
+ vpxor ymm3, ymm3, ymm10
+ jmp L_sha3_block_n_avx2_start_1
+L_sha3_block_n_avx2_load_256_1: ; r9 = 136: absorb qwords 0-16
+ vpbroadcastq ymm7, QWORD PTR [rdx]
+ vmovdqu ymm8, YMMWORD PTR [rdx+8]
+ vmovdqu ymm9, YMMWORD PTR [rdx+40]
+ vmovdqu ymm10, YMMWORD PTR [rdx+72]
+ vmovdqu ymm11, YMMWORD PTR [rdx+104]
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm8
+ vpxor ymm2, ymm2, ymm9
+ vpxor ymm3, ymm3, ymm10
+ vpxor ymm4, ymm4, ymm11
+L_sha3_block_n_avx2_start_1: ; convert ymm2-ymm5 from linear order to the layout used by the round loop
+ vpermq ymm7, ymm2, 57 ; ymm7 = q6,q7,q8,q5
+ vpermq ymm8, ymm3, 30 ; ymm8 = q11,q12,q10,q9
+ vpermq ymm9, ymm4, 75 ; ymm9 = q16,q15,q13,q14
+ vpermq ymm10, ymm5, 147 ; ymm10 = q20,q17,q18,q19
+ vpblendd ymm11, ymm2, ymm3, 12 ; ymm11 = q5,q10,q7,q8
+ vpblendd ymm12, ymm4, ymm5, 192 ; ymm12 = q13,q14,q15,q20
+ vpblendd ymm2, ymm7, ymm8, 192 ; ymm2 = q6,q7,q8,q9
+ vpblendd ymm3, ymm8, ymm9, 240 ; ymm3 = q11,q12,q13,q14
+ vpblendd ymm4, ymm10, ymm9, 3 ; ymm4 = q16,q17,q18,q19
+ vpblendd ymm5, ymm11, ymm12, 240 ; ymm5 = q5,q10,q15,q20
+ jmp L_sha3_block_n_avx2_rounds ; skip the loaders below, which are for later blocks
+L_sha3_block_n_avx2_start: ; blocks after the first: state is already in loop layout, so the data is permuted to match before the XOR
+ mov r12, 24 ; reset the round counter
+ cmp r9, 136
+ je L_sha3_block_n_avx2_load_256
+ cmp r9, 168
+ je L_sha3_block_n_avx2_load_128
+ cmp r9, 144
+ je L_sha3_block_n_avx2_load_224
+ cmp r9, 104
+ je L_sha3_block_n_avx2_load_384
+ vpbroadcastq ymm7, QWORD PTR [rdx] ; fall-through for any other r9: absorbs 72 bytes (data qwords d0-d8)
+ vmovdqu ymm8, YMMWORD PTR [rdx+8]
+ vmovdqu ymm9, YMMWORD PTR [rdx+40]
+ vpxor ymm12, ymm12, ymm12 ; zero source for the blends below
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm8
+ vpermq ymm7, ymm9, 57 ; d6,d7,d8,d5
+ vpblendd ymm15, ymm9, ymm12, 252 ; d5,0,0,0
+ vpblendd ymm7, ymm7, ymm12, 192 ; d6,d7,d8,0
+ vpxor ymm2, ymm2, ymm7
+ vpxor ymm5, ymm5, ymm15
+ jmp L_sha3_block_n_avx2_rounds
+L_sha3_block_n_avx2_load_128: ; r9 = 168: absorb data qwords d0-d20
+ vpbroadcastq ymm7, QWORD PTR [rdx]
+ vmovdqu ymm8, YMMWORD PTR [rdx+8]
+ vmovdqu ymm9, YMMWORD PTR [rdx+40]
+ vmovdqu ymm10, YMMWORD PTR [rdx+72]
+ vmovdqu ymm11, YMMWORD PTR [rdx+104]
+ vmovdqu ymm12, YMMWORD PTR [rdx+136]
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm8
+ vpermq ymm7, ymm9, 57 ; same permute/blend immediates as at L_sha3_block_n_avx2_start_1, applied to the data
+ vpermq ymm8, ymm10, 30
+ vpermq ymm13, ymm11, 75
+ vpermq ymm14, ymm12, 147
+ vpblendd ymm15, ymm9, ymm10, 12
+ vpblendd ymm11, ymm11, ymm12, 192
+ vpblendd ymm7, ymm7, ymm8, 192 ; d6,d7,d8,d9
+ vpblendd ymm8, ymm8, ymm13, 240 ; d11,d12,d13,d14
+ vpblendd ymm13, ymm14, ymm13, 3 ; d16,d17,d18,d19
+ vpblendd ymm11, ymm15, ymm11, 240 ; d5,d10,d15,d20
+ vpxor ymm2, ymm2, ymm7
+ vpxor ymm3, ymm3, ymm8
+ vpxor ymm4, ymm4, ymm13
+ vpxor ymm5, ymm5, ymm11
+ jmp L_sha3_block_n_avx2_rounds
+L_sha3_block_n_avx2_load_224: ; r9 = 144: absorb data qwords d0-d17
+ vpxor ymm12, ymm12, ymm12 ; zero ymm12 so only its low qword carries data after the vmovq below
+ vpbroadcastq ymm7, QWORD PTR [rdx]
+ vmovdqu ymm8, YMMWORD PTR [rdx+8]
+ vmovdqu ymm9, YMMWORD PTR [rdx+40]
+ vmovdqu ymm10, YMMWORD PTR [rdx+72]
+ vmovdqu ymm11, YMMWORD PTR [rdx+104]
+ vmovq xmm12, QWORD PTR [rdx+136] ; ymm12 = d17,0,0,0
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm8
+ vpermq ymm7, ymm9, 57
+ vpermq ymm8, ymm10, 30
+ vpermq ymm13, ymm11, 75
+ vpermq ymm14, ymm12, 147 ; 0,d17,0,0
+ vpblendd ymm15, ymm9, ymm10, 12
+ vpblendd ymm11, ymm11, ymm12, 192
+ vpblendd ymm7, ymm7, ymm8, 192 ; d6,d7,d8,d9
+ vpblendd ymm8, ymm8, ymm13, 240 ; d11,d12,d13,d14
+ vpblendd ymm13, ymm14, ymm13, 3 ; d16,d17,0,0
+ vpblendd ymm11, ymm15, ymm11, 240 ; d5,d10,d15,0
+ vpxor ymm2, ymm2, ymm7
+ vpxor ymm3, ymm3, ymm8
+ vpxor ymm4, ymm4, ymm13
+ vpxor ymm5, ymm5, ymm11
+ jmp L_sha3_block_n_avx2_rounds
+L_sha3_block_n_avx2_load_384: ; r9 = 104: absorb data qwords d0-d12
+ vpbroadcastq ymm7, QWORD PTR [rdx]
+ vmovdqu ymm8, YMMWORD PTR [rdx+8]
+ vmovdqu ymm9, YMMWORD PTR [rdx+40]
+ vmovdqu ymm10, YMMWORD PTR [rdx+72]
+ vpxor ymm12, ymm12, ymm12 ; zero source for the blends below
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm8
+ vpermq ymm7, ymm9, 57
+ vpermq ymm8, ymm10, 30
+ vpblendd ymm13, ymm10, ymm12, 243 ; 0,d10,0,0
+ vpblendd ymm15, ymm9, ymm13, 252 ; d5,d10,0,0
+ vpblendd ymm7, ymm7, ymm8, 192 ; d6,d7,d8,d9
+ vpblendd ymm8, ymm8, ymm12, 240 ; d11,d12,0,0
+ vpxor ymm2, ymm2, ymm7
+ vpxor ymm3, ymm3, ymm8
+ vpxor ymm5, ymm5, ymm15
+ jmp L_sha3_block_n_avx2_rounds
+L_sha3_block_n_avx2_load_256: ; r9 = 136: absorb data qwords d0-d16
+ vpbroadcastq ymm7, QWORD PTR [rdx]
+ vmovdqu ymm8, YMMWORD PTR [rdx+8]
+ vmovdqu ymm9, YMMWORD PTR [rdx+40]
+ vmovdqu ymm10, YMMWORD PTR [rdx+72]
+ vmovdqu ymm11, YMMWORD PTR [rdx+104]
+ vpxor ymm12, ymm12, ymm12 ; zero source for the blends below
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm8
+ vpermq ymm7, ymm9, 57
+ vpermq ymm8, ymm10, 30
+ vpermq ymm13, ymm11, 75 ; d16,d15,d13,d14
+ vpblendd ymm15, ymm9, ymm10, 12
+ vpblendd ymm11, ymm11, ymm12, 207 ; 0,0,d15,0
+ vpblendd ymm7, ymm7, ymm8, 192 ; d6,d7,d8,d9
+ vpblendd ymm8, ymm8, ymm13, 240 ; d11,d12,d13,d14
+ vpblendd ymm13, ymm13, ymm12, 252 ; d16,0,0,0
+ vpblendd ymm11, ymm15, ymm11, 240 ; d5,d10,d15,0
+ vpxor ymm2, ymm2, ymm7
+ vpxor ymm3, ymm3, ymm8
+ vpxor ymm4, ymm4, ymm13
+ vpxor ymm5, ymm5, ymm11
+L_sha3_block_n_avx2_rounds: ; top of the per-round loop (r12 iterations per block)
+ ; Calc b[0..4] - XOR of the five state qwords sharing each index mod 5
+ vpshufd ymm7, ymm5, 238 ; ymm7 = q10,q10,q20,q20
+ vpxor ymm15, ymm1, ymm2
+ vpxor ymm14, ymm5, ymm7 ; lane 0 = q5^q10, lane 2 = q15^q20
+ vpxor ymm12, ymm3, ymm4
+ vpermq ymm7, ymm14, 170 ; broadcast lane 2 (q15^q20)
+ vpxor ymm14, ymm14, ymm0
+ vpxor ymm14, ymm14, ymm7 ; lane 0 = q0^q5^q10^q15^q20 = b[0]
+ vpxor ymm15, ymm15, ymm6
+ vpxor ymm15, ymm15, ymm12 ; ymm15 = b[1],b[2],b[3],b[4]
+ vpermq ymm14, ymm14, 0 ; ymm14 = b[0] in every lane
+ ; XOR in b[x+4]
+ vpermq ymm7, ymm15, 147 ; b4,b1,b2,b3
+ vpermq ymm9, ymm15, 57 ; b2,b3,b4,b1
+ vpermq ymm10, ymm15, 0 ; b1 in every lane
+ vpermq ymm15, ymm15, 255 ; b4 in every lane
+ vpblendd ymm9, ymm9, ymm14, 192 ; ymm9 = b2,b3,b4,b0
+ vpblendd ymm14, ymm7, ymm14, 3 ; ymm14 = b0,b1,b2,b3
+ ; Rotate left 1
+ vpsrlq ymm8, ymm10, 63
+ vpaddq ymm10, ymm10, ymm10 ; x+x is x shl 1
+ vpsrlq ymm7, ymm9, 63
+ vpaddq ymm9, ymm9, ymm9
+ vpor ymm10, ymm10, ymm8 ; ymm10 = rotl(b1, 1)
+ vpor ymm9, ymm9, ymm7 ; ymm9 = rotl(b2,b3,b4,b0 by 1)
+ vpxor ymm10, ymm10, ymm15 ; value for index 0: rotl(b1,1) ^ b4
+ vpxor ymm9, ymm9, ymm14 ; values for indices 1-4: rotl(b[x+1],1) ^ b[x-1]
+ ; XOR in ROTL64(b[x+1])
+ vpxor ymm0, ymm0, ymm10 ; ymm0 and ymm5 hold the index-0 qwords
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm9
+ vpxor ymm3, ymm3, ymm9
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm9
+ ; Shuffle - Rotate (ymm0 is not rotated here)
+ vmovdqu ymm7, YMMWORD PTR [r11+-64] ; right-shift counts, rows 0-2
+ vmovdqu ymm9, YMMWORD PTR [r11+-32]
+ vmovdqu ymm11, YMMWORD PTR [r11]
+ vmovdqu ymm8, YMMWORD PTR [r10+-64] ; left-shift counts, rows 0-2
+ vmovdqu ymm10, YMMWORD PTR [r10+-32]
+ vmovdqu ymm12, YMMWORD PTR [r10]
+ vpsrlvq ymm7, ymm1, ymm7
+ vpsrlvq ymm9, ymm2, ymm9
+ vpsrlvq ymm11, ymm3, ymm11
+ vpsllvq ymm1, ymm1, ymm8
+ vpsllvq ymm2, ymm2, ymm10
+ vpsllvq ymm3, ymm3, ymm12
+ vpor ymm1, ymm1, ymm7 ; left-shifted OR right-shifted = per-lane rotate
+ vpor ymm2, ymm2, ymm9
+ vpor ymm3, ymm3, ymm11
+ vmovdqu ymm7, YMMWORD PTR [r11+32] ; right-shift counts, rows 3-5
+ vmovdqu ymm9, YMMWORD PTR [r11+64]
+ vmovdqu ymm11, YMMWORD PTR [r11+96]
+ vmovdqu ymm8, YMMWORD PTR [r10+32] ; left-shift counts, rows 3-5
+ vmovdqu ymm10, YMMWORD PTR [r10+64]
+ vmovdqu ymm12, YMMWORD PTR [r10+96]
+ vpsrlvq ymm7, ymm4, ymm7
+ vpsrlvq ymm9, ymm5, ymm9
+ vpsrlvq ymm11, ymm6, ymm11
+ vpsllvq ymm4, ymm4, ymm8
+ vpsllvq ymm5, ymm5, ymm10
+ vpsllvq ymm6, ymm6, ymm12
+ vpor ymm4, ymm4, ymm7
+ vpor ymm5, ymm5, ymm9
+ vpor ymm6, ymm6, ymm11
+ ; Row Mix - each output is a ^ (~b & c) over neighbouring values
+ vpermq ymm12, ymm2, 0 ; broadcast lane 0 of ymm2
+ vpermq ymm13, ymm3, 85 ; broadcast lane 1 of ymm3
+ vpermq ymm14, ymm4, 170 ; broadcast lane 2 of ymm4
+ vpermq ymm15, ymm6, 255 ; broadcast lane 3 of ymm6
+ vpandn ymm7, ymm13, ymm14 ; vpandn d,a,b computes (~a) & b
+ vpandn ymm8, ymm14, ymm15
+ vpandn ymm9, ymm15, ymm0
+ vpandn ymm10, ymm0, ymm12
+ vpandn ymm11, ymm12, ymm13
+ vpxor ymm12, ymm12, ymm7
+ vpxor ymm13, ymm13, ymm8
+ vpxor ymm14, ymm14, ymm9
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm0, ymm0, ymm11 ; new qword 0, still replicated in every lane
+ vpermq ymm7, ymm5, 141
+ vpblendd ymm10, ymm12, ymm13, 12
+ vpermq ymm11, ymm1, 114
+ vpblendd ymm9, ymm14, ymm15, 192
+ vpermq ymm12, ymm2, 135
+ vpblendd ymm1, ymm10, ymm9, 240 ; new ymm1 = lanes 0..3 taken from ymm12..ymm15 respectively
+ vpermq ymm13, ymm3, 201
+ vpermq ymm14, ymm4, 156
+ vpermq ymm15, ymm6, 45
+ vpblendd ymm12, ymm12, ymm7, 48 ; insert one lane of the permuted ymm5 into each of ymm12-ymm15
+ vpblendd ymm13, ymm13, ymm7, 3
+ vpblendd ymm14, ymm14, ymm7, 192
+ vpblendd ymm15, ymm15, ymm7, 12
+ vpandn ymm5, ymm12, ymm13
+ vpandn ymm7, ymm13, ymm14
+ vpandn ymm2, ymm14, ymm15
+ vpandn ymm3, ymm15, ymm11
+ vpandn ymm4, ymm11, ymm12
+ vpxor ymm5, ymm11, ymm5 ; new ymm5
+ vpxor ymm12, ymm12, ymm7
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vpxor ymm15, ymm15, ymm4
+ vperm2i128 ymm3, ymm12, ymm14, 32 ; low 128-bit halves of ymm12 and ymm14
+ vperm2i128 ymm7, ymm13, ymm15, 32 ; low halves of ymm13 and ymm15
+ vperm2i128 ymm6, ymm12, ymm14, 49 ; high halves of ymm12 and ymm14
+ vperm2i128 ymm8, ymm13, ymm15, 49 ; high halves of ymm13 and ymm15
+ vpunpcklqdq ymm2, ymm3, ymm7 ; 4x4 qword transpose of ymm12-ymm15: ymm2 = their lane 0
+ vpunpckhqdq ymm3, ymm3, ymm7 ; ymm3 = their lane 1
+ vpunpcklqdq ymm4, ymm6, ymm8 ; ymm4 = their lane 2
+ vpunpckhqdq ymm6, ymm6, ymm8 ; ymm6 = their lane 3
+ vpxor ymm0, ymm0, [rax] ; XOR this round's 32-byte constant entry into qword 0
+ add rax, 32 ; next round's constant entry
+ sub r12, 1
+ jnz L_sha3_block_n_avx2_rounds ; loop until all 24 rounds are done
+ sub rax, 768 ; rewind the constant cursor (24 rounds x 32 bytes) for the next block
+ add rdx, r9 ; advance the data pointer by one block
+ sub r8d, 1 ; NOTE(review): count is only tested here, after a block was processed - callers must pass r8d of at least 1
+ jnz L_sha3_block_n_avx2_start
+ vpermq ymm7, ymm2, 147 ; undo the loop layout: rebuild linear qwords 5-20 in ymm2-ymm5
+ vpermq ymm8, ymm3, 78
+ vpermq ymm9, ymm4, 57
+ vpblendd ymm2, ymm7, ymm5, 3 ; ymm2 = q5,q6,q7,q8
+ vpblendd ymm3, ymm8, ymm7, 3
+ vpblendd ymm3, ymm3, ymm5, 12 ; ymm3 = q9,q10,q11,q12
+ vpblendd ymm4, ymm8, ymm9, 192
+ vpblendd ymm4, ymm4, ymm5, 48 ; ymm4 = q13,q14,q15,q16
+ vpblendd ymm5, ymm9, ymm5, 192 ; ymm5 = q17,q18,q19,q20
+ vmovq QWORD PTR [rcx], xmm0 ; store qword 0 (low lane of the broadcast register)
+ vmovdqu YMMWORD PTR [rcx+8], ymm1
+ vmovdqu YMMWORD PTR [rcx+40], ymm2
+ vmovdqu YMMWORD PTR [rcx+72], ymm3
+ vmovdqu YMMWORD PTR [rcx+104], ymm4
+ vmovdqu YMMWORD PTR [rcx+136], ymm5
+ vmovdqu YMMWORD PTR [rcx+168], ymm6
+ vzeroupper ; zero the upper halves of all ymm registers
+ vmovdqu xmm6, OWORD PTR [rsp] ; reload the xmm6-xmm15 values saved on entry
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160 ; release the save area
+ pop r12
+ ret
+sha3_block_n_avx2 ENDP
+_TEXT ENDS
+wc_masm_cond_0 = 0 ; assembly-time flag, default off; tested by the IF that follows this group
+IFDEF WOLFSSL_HAVE_MLKEM ; any one of the three symbols below turns the flag on (logical OR)
+wc_masm_cond_0 = 1
+ENDIF
+IFDEF WOLFSSL_HAVE_MLDSA
+wc_masm_cond_0 = 1
+ENDIF
+IFDEF WOLFSSL_HAVE_SLHDSA
+wc_masm_cond_0 = 1
+ENDIF
+IF wc_masm_cond_0
+_TEXT SEGMENT READONLY PARA
+sha3_blocksx4_avx2 PROC
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ mov rdx, QWORD PTR [ptr_L_sha3_x4_avx2_r]
+ vmovdqu ymm15, YMMWORD PTR [rcx]
+ mov rax, rcx
+ mov r8, rcx
+ add rcx, 128
+ add rax, 384
+ add r8, 640
+ ; Round 0
+ ; Calc b[0..4]
+ vmovdqu ymm11, YMMWORD PTR [rcx+-96]
+ vmovdqu ymm12, YMMWORD PTR [rcx+-64]
+ vmovdqu ymm13, YMMWORD PTR [rcx+-32]
+ vmovdqu ymm14, YMMWORD PTR [rcx]
+ vpxor ymm10, ymm15, [rcx+32]
+ vpxor ymm11, ymm11, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+96]
+ vpxor ymm13, ymm13, [rcx+128]
+ vpxor ymm14, ymm14, [rax+-96]
+ vpxor ymm10, ymm10, [rax+-64]
+ vpxor ymm11, ymm11, [rax+-32]
+ vpxor ymm12, ymm12, [rax]
+ vpxor ymm13, ymm13, [rax+32]
+ vpxor ymm14, ymm14, [rax+64]
+ vpxor ymm10, ymm10, [rax+96]
+ vpxor ymm11, ymm11, [rax+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+64]
+ vpxor ymm12, ymm7, [rax]
+ vpxor ymm13, ymm8, [r8+-64]
+ vpxor ymm14, ymm9, [r8+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [rax], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+-32]
+ vpxor ymm11, ymm9, [rax+-96]
+ vpxor ymm12, ymm5, [rax+-64]
+ vpxor ymm13, ymm6, [rax+128]
+ vpxor ymm14, ymm7, [r8+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [rax+-96], ymm1
+ vmovdqu YMMWORD PTR [rax+-64], ymm2
+ vmovdqu YMMWORD PTR [rax+128], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+-96]
+ vpxor ymm11, ymm7, [rcx+96]
+ vpxor ymm12, ymm8, [rax+32]
+ vpxor ymm13, ymm9, [r8+-32]
+ vpxor ymm14, ymm5, [r8]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [rax+32], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx]
+ vpxor ymm11, ymm5, [rcx+32]
+ vpxor ymm12, ymm6, [rax+-32]
+ vpxor ymm13, ymm7, [r8+-96]
+ vpxor ymm14, ymm8, [r8+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rax+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+-64]
+ vpxor ymm11, ymm8, [rcx+128]
+ vpxor ymm12, ymm9, [rax+64]
+ vpxor ymm13, ymm5, [rax+96]
+ vpxor ymm14, ymm6, [r8+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rax+64], ymm2
+ vmovdqu YMMWORD PTR [rax+96], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Round 1
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm11, ymm1, [rcx+32]
+ vpxor ymm11, ymm11, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm11, ymm11, [rax+-96]
+ vpxor ymm12, ymm2, [rax+-64]
+ vpxor ymm12, ymm12, [rax+-32]
+ vpxor ymm12, ymm12, [rax]
+ vpxor ymm12, ymm12, [rax+32]
+ vpxor ymm13, ymm3, [rax+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm14, ymm4, [r8]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rax+-96]
+ vpxor ymm12, ymm7, [rax+32]
+ vpxor ymm13, ymm8, [r8+-96]
+ vpxor ymm14, ymm9, [r8+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+32]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-96], ymm1
+ vmovdqu YMMWORD PTR [rax+32], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+-64]
+ vpxor ymm11, ymm9, [r8+64]
+ vpxor ymm12, ymm5, [rcx+-96]
+ vpxor ymm13, ymm6, [rcx+32]
+ vpxor ymm14, ymm7, [rax+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [rax+64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+64]
+ vpxor ymm11, ymm7, [rax+-64]
+ vpxor ymm12, ymm8, [r8+-32]
+ vpxor ymm13, ymm9, [r8+96]
+ vpxor ymm14, ymm5, [rcx+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [rax+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+128]
+ vpxor ymm11, ymm5, [rcx+-32]
+ vpxor ymm12, ymm6, [rcx+96]
+ vpxor ymm13, ymm7, [rax+-32]
+ vpxor ymm14, ymm8, [rax+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [rax+-32], ymm3
+ vmovdqu YMMWORD PTR [rax+96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rax]
+ vpxor ymm11, ymm8, [rax+128]
+ vpxor ymm12, ymm9, [r8]
+ vpxor ymm13, ymm5, [rcx]
+ vpxor ymm14, ymm6, [rcx+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax], ymm0
+ vmovdqu YMMWORD PTR [rax+128], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Round 2
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+96]
+ vpxor ymm11, ymm11, [rax+-96]
+ vpxor ymm11, ymm11, [rax+-64]
+ vpxor ymm13, ymm13, [rax+-32]
+ vpxor ymm12, ymm12, [rax+32]
+ vpxor ymm14, ymm14, [rax+64]
+ vpxor ymm14, ymm14, [rax+96]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm10, ymm10, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm10, ymm10, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+64]
+ vpxor ymm12, ymm7, [r8+-32]
+ vpxor ymm13, ymm8, [rax+-32]
+ vpxor ymm14, ymm9, [rcx+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+64]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [rax+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+-96]
+ vpxor ymm11, ymm9, [rax+64]
+ vpxor ymm12, ymm5, [rcx+64]
+ vpxor ymm13, ymm6, [rcx+-32]
+ vpxor ymm14, ymm7, [r8]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [rax+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rax+-96]
+ vpxor ymm11, ymm7, [rcx+-96]
+ vpxor ymm12, ymm8, [r8+96]
+ vpxor ymm13, ymm9, [rax+96]
+ vpxor ymm14, ymm5, [rax]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [rax+96], ymm3
+ vmovdqu YMMWORD PTR [rax], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+32]
+ vpxor ymm11, ymm5, [r8+-64]
+ vpxor ymm12, ymm6, [rax+-64]
+ vpxor ymm13, ymm7, [rcx+96]
+ vpxor ymm14, ymm8, [rcx]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [rax+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rax+32]
+ vpxor ymm11, ymm8, [rcx+32]
+ vpxor ymm12, ymm9, [rcx+-64]
+ vpxor ymm13, ymm5, [r8+128]
+ vpxor ymm14, ymm6, [rax+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [rax+128], ymm4
+ ; Round 3
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm12, ymm2, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm10, ymm10, [rax+-96]
+ vpxor ymm12, ymm12, [rax+-64]
+ vpxor ymm13, ymm13, [rax+-32]
+ vpxor ymm14, ymm14, [rax]
+ vpxor ymm11, ymm11, [rax+64]
+ vpxor ymm13, ymm13, [rax+96]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm12, ymm12, [r8+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rax+64]
+ vpxor ymm12, ymm7, [r8+96]
+ vpxor ymm13, ymm8, [rcx+96]
+ vpxor ymm14, ymm9, [rax+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+96]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+64], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rax+128], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rax+-32]
+ vpxor ymm11, ymm9, [r8]
+ vpxor ymm12, ymm5, [rax+-96]
+ vpxor ymm13, ymm6, [r8+-64]
+ vpxor ymm14, ymm7, [rcx+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-32], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [rax+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+64]
+ vpxor ymm11, ymm7, [rcx+64]
+ vpxor ymm12, ymm8, [rax+96]
+ vpxor ymm13, ymm9, [rcx]
+ vpxor ymm14, ymm5, [rax+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [rax+96], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [rax+32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+128]
+ vpxor ymm11, ymm5, [r8+-96]
+ vpxor ymm12, ymm6, [rcx+-96]
+ vpxor ymm13, ymm7, [rax+-64]
+ vpxor ymm14, ymm8, [r8+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rax+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+-32]
+ vpxor ymm11, ymm8, [rcx+-32]
+ vpxor ymm12, ymm9, [rax]
+ vpxor ymm13, ymm5, [r8+32]
+ vpxor ymm14, ymm6, [rcx+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rax], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Round 4
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm11, ymm1, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm12, ymm12, [rax+-96]
+ vpxor ymm13, ymm13, [rax+-64]
+ vpxor ymm10, ymm10, [rax+-32]
+ vpxor ymm14, ymm14, [rax+32]
+ vpxor ymm11, ymm11, [rax+64]
+ vpxor ymm12, ymm12, [rax+96]
+ vpxor ymm14, ymm14, [rax+128]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8]
+ vpxor ymm12, ymm7, [rax+96]
+ vpxor ymm13, ymm8, [rax+-64]
+ vpxor ymm14, ymm9, [rcx+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+128]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [rax+96], ymm2
+ vmovdqu YMMWORD PTR [rax+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+96]
+ vpxor ymm11, ymm9, [rcx+-64]
+ vpxor ymm12, ymm5, [r8+64]
+ vpxor ymm13, ymm6, [r8+-96]
+ vpxor ymm14, ymm7, [rax]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [rax], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rax+64]
+ vpxor ymm11, ymm7, [rax+-96]
+ vpxor ymm12, ymm8, [rcx]
+ vpxor ymm13, ymm9, [r8+128]
+ vpxor ymm14, ymm5, [r8+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+64], ymm0
+ vmovdqu YMMWORD PTR [rax+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rax+128]
+ vpxor ymm11, ymm5, [rax+-32]
+ vpxor ymm12, ymm6, [rcx+64]
+ vpxor ymm13, ymm7, [rcx+-96]
+ vpxor ymm14, ymm8, [r8+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+128], ymm0
+ vmovdqu YMMWORD PTR [rax+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+96]
+ vpxor ymm11, ymm8, [r8+-64]
+ vpxor ymm12, ymm9, [rax+32]
+ vpxor ymm13, ymm5, [rcx+128]
+ vpxor ymm14, ymm6, [rcx+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [rax+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Round 5
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm14, ymm4, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm11, ymm11, [rax+-96]
+ vpxor ymm13, ymm13, [rax+-64]
+ vpxor ymm11, ymm11, [rax+-32]
+ vpxor ymm14, ymm14, [rax]
+ vpxor ymm10, ymm10, [rax+64]
+ vpxor ymm12, ymm12, [rax+96]
+ vpxor ymm10, ymm10, [rax+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm13, ymm13, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-64]
+ vpxor ymm12, ymm7, [rcx]
+ vpxor ymm13, ymm8, [rcx+-96]
+ vpxor ymm14, ymm9, [rcx+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+160]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rax+-64]
+ vpxor ymm11, ymm9, [rax]
+ vpxor ymm12, ymm5, [rax+64]
+ vpxor ymm13, ymm6, [rax+-32]
+ vpxor ymm14, ymm7, [rax+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-64], ymm0
+ vmovdqu YMMWORD PTR [rax], ymm1
+ vmovdqu YMMWORD PTR [rax+64], ymm2
+ vmovdqu YMMWORD PTR [rax+-32], ymm3
+ vmovdqu YMMWORD PTR [rax+32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8]
+ vpxor ymm11, ymm7, [r8+64]
+ vpxor ymm12, ymm8, [r8+128]
+ vpxor ymm13, ymm9, [r8+32]
+ vpxor ymm14, ymm5, [r8+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+32]
+ vpxor ymm11, ymm5, [rcx+96]
+ vpxor ymm12, ymm6, [rax+-96]
+ vpxor ymm13, ymm7, [rcx+64]
+ vpxor ymm14, ymm8, [rcx+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [rax+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rax+96]
+ vpxor ymm11, ymm8, [r8+-96]
+ vpxor ymm12, ymm9, [r8+-32]
+ vpxor ymm13, ymm5, [rax+128]
+ vpxor ymm14, ymm6, [r8+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+96], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [rax+128], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Round 6
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm10, ymm10, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm12, ymm12, [rax+-96]
+ vpxor ymm10, ymm10, [rax+-64]
+ vpxor ymm13, ymm13, [rax+-32]
+ vpxor ymm11, ymm11, [rax]
+ vpxor ymm14, ymm14, [rax+32]
+ vpxor ymm12, ymm12, [rax+64]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rax]
+ vpxor ymm12, ymm7, [r8+128]
+ vpxor ymm13, ymm8, [rcx+64]
+ vpxor ymm14, ymm9, [r8+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+192]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+-96]
+ vpxor ymm11, ymm9, [rax+32]
+ vpxor ymm12, ymm5, [r8]
+ vpxor ymm13, ymm6, [rcx+96]
+ vpxor ymm14, ymm7, [r8+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [rax+32], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+-64]
+ vpxor ymm11, ymm7, [rax+64]
+ vpxor ymm12, ymm8, [r8+32]
+ vpxor ymm13, ymm9, [rcx+128]
+ vpxor ymm14, ymm5, [rax+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [rax+64], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [rax+96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+-32]
+ vpxor ymm11, ymm5, [rax+-64]
+ vpxor ymm12, ymm6, [r8+64]
+ vpxor ymm13, ymm7, [rax+-96]
+ vpxor ymm14, ymm8, [rax+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [rax+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [rax+-96], ymm3
+ vmovdqu YMMWORD PTR [rax+128], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx]
+ vpxor ymm11, ymm8, [rax+-32]
+ vpxor ymm12, ymm9, [r8+96]
+ vpxor ymm13, ymm5, [rcx+32]
+ vpxor ymm14, ymm6, [r8+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rax+-32], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Round 7
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm13, ymm13, [rcx+128]
+ vpxor ymm13, ymm13, [rax+-96]
+ vpxor ymm11, ymm1, [rax+-64]
+ vpxor ymm11, ymm11, [rax]
+ vpxor ymm11, ymm11, [rax+32]
+ vpxor ymm11, ymm11, [rax+64]
+ vpxor ymm14, ymm4, [rax+96]
+ vpxor ymm14, ymm14, [rax+128]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm12, ymm2, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm12, ymm12, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rax+32]
+ vpxor ymm12, ymm7, [r8+32]
+ vpxor ymm13, ymm8, [rax+-96]
+ vpxor ymm14, ymm9, [r8+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+224]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+32], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [rax+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+64]
+ vpxor ymm11, ymm9, [r8+-32]
+ vpxor ymm12, ymm5, [rcx+-64]
+ vpxor ymm13, ymm6, [rax+-64]
+ vpxor ymm14, ymm7, [r8+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [rax+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rax]
+ vpxor ymm11, ymm7, [r8]
+ vpxor ymm12, ymm8, [rcx+128]
+ vpxor ymm13, ymm9, [rax+128]
+ vpxor ymm14, ymm5, [rcx]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [rax+128], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+-64]
+ vpxor ymm11, ymm5, [rcx+-96]
+ vpxor ymm12, ymm6, [rax+64]
+ vpxor ymm13, ymm7, [r8+64]
+ vpxor ymm14, ymm8, [rcx+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [rax+64], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+128]
+ vpxor ymm11, ymm8, [rcx+96]
+ vpxor ymm12, ymm9, [rax+96]
+ vpxor ymm13, ymm5, [rcx+-32]
+ vpxor ymm14, ymm6, [rax+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [rax+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [rax+-32], ymm4
+ ; Round 8: column XORs (b), mix values (t), then five row passes; output word 0 stays in ymm15, the other 24 are stored via rcx/rax/r8. NOTE(review): rotate amounts look like the Keccak-f[1600] rho offsets - confirm against the procedure header
+ ; Calc b[0..4]: ymm10..ymm14 each become the XOR of five values; ymm10 starts from ymm0 ^ ymm15, ymm11..ymm14 from ymm1..ymm4 (left by the preceding Row 4), the other 19 inputs are YMMWORD loads
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm14, ymm14, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm13, ymm3, [rax+-96]
+ vpxor ymm13, ymm13, [rax+-64]
+ vpxor ymm10, ymm10, [rax]
+ vpxor ymm11, ymm11, [rax+32]
+ vpxor ymm12, ymm12, [rax+64]
+ vpxor ymm13, ymm13, [rax+128]
+ vpxor ymm14, ymm14, [r8+-96]
+ vpxor ymm10, ymm10, [r8+-64]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm14, ymm14, [r8+96]
+ ; Calc t[0..4]: ymm5 = ymm14 ^ rol1(ymm11), ymm6 = ymm10 ^ rol1(ymm12), ymm7 = ymm11 ^ rol1(ymm13), ymm8 = ymm12 ^ rol1(ymm14), ymm9 = ymm13 ^ rol1(ymm10); rol1(x) = (x shifted right 63) OR (x + x) per 64-bit element
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix: each row XORs ymm5..ymm9 into its inputs, rotates every 64-bit element left, forms out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]) with indices mod 5, and stores to the same addresses it loaded from
+ ; Row 0: ymm10 = ymm5 ^ ymm15 is not rotated; ymm11..ymm14 rotate left by 44, 43, 21, 14; out[0] is kept in ymm15 instead of being stored
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-32]
+ vpxor ymm12, ymm7, [rcx+128]
+ vpxor ymm13, ymm8, [r8+64]
+ vpxor ymm14, ymm9, [rax+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant: the YMMWORD at [rdx+256] (offset 32 * 8 for round 8) is applied to ymm15 only
+ vpxor ymm15, ymm15, [rdx+256]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rax+-32], ymm4
+ ; Row 1: rotate left by 28, 20, 3, 45, 61 (each built as vpsrlq by 64-n, vpsllq by n, vpor)
+ vpxor ymm10, ymm8, [rax+-96]
+ vpxor ymm11, ymm9, [r8+96]
+ vpxor ymm12, ymm5, [rax]
+ vpxor ymm13, ymm6, [rcx+-96]
+ vpxor ymm14, ymm7, [rax+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-96], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rax], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [rax+96], ymm4
+ ; Row 2: rotate left by 1, 6, 25, 8, 18 (the 1-bit left shift is done as vpaddq ymm10, ymm10, ymm10)
+ vpxor ymm10, ymm6, [rax+32]
+ vpxor ymm11, ymm7, [rcx+-64]
+ vpxor ymm12, ymm8, [rax+128]
+ vpxor ymm13, ymm9, [rcx+32]
+ vpxor ymm14, ymm5, [r8+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [rax+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 3: rotate left by 27, 36, 10, 15, 56
+ vpxor ymm10, ymm9, [r8+-96]
+ vpxor ymm11, ymm5, [rcx+64]
+ vpxor ymm12, ymm6, [r8]
+ vpxor ymm13, ymm7, [rax+64]
+ vpxor ymm14, ymm8, [rcx+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [rax+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 4: rotate left by 62, 55, 39, 41, 2; besides being stored, ymm0..ymm4 are read directly by the next round's Calc b
+ vpxor ymm10, ymm7, [r8+32]
+ vpxor ymm11, ymm8, [rax+-64]
+ vpxor ymm12, ymm9, [rcx]
+ vpxor ymm13, ymm5, [r8+-64]
+ vpxor ymm14, ymm6, [rcx+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [rax+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Round 9: column XORs (b), mix values (t), then five row passes; output word 0 stays in ymm15, the other 24 are stored via rcx/rax/r8. NOTE(review): rotate amounts look like the Keccak-f[1600] rho offsets - confirm against the procedure header
+ ; Calc b[0..4]: ymm10..ymm14 each become the XOR of five values; ymm10 starts from ymm0 ^ ymm15, ymm11..ymm14 from ymm1..ymm4 (left by the preceding Row 4), the other 19 inputs are YMMWORD loads
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm11, ymm11, [rcx+64]
+ vpxor ymm12, ymm2, [rcx+128]
+ vpxor ymm10, ymm10, [rax+-96]
+ vpxor ymm14, ymm14, [rax+-32]
+ vpxor ymm12, ymm12, [rax]
+ vpxor ymm10, ymm10, [rax+32]
+ vpxor ymm13, ymm13, [rax+64]
+ vpxor ymm14, ymm14, [rax+96]
+ vpxor ymm12, ymm12, [rax+128]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ ; Calc t[0..4]: ymm5 = ymm14 ^ rol1(ymm11), ymm6 = ymm10 ^ rol1(ymm12), ymm7 = ymm11 ^ rol1(ymm13), ymm8 = ymm12 ^ rol1(ymm14), ymm9 = ymm13 ^ rol1(ymm10); rol1(x) = (x shifted right 63) OR (x + x) per 64-bit element
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix: each row XORs ymm5..ymm9 into its inputs, rotates every 64-bit element left, forms out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]) with indices mod 5, and stores to the same addresses it loaded from
+ ; Row 0: ymm10 = ymm5 ^ ymm15 is not rotated; ymm11..ymm14 rotate left by 44, 43, 21, 14; out[0] is kept in ymm15 instead of being stored
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+96]
+ vpxor ymm12, ymm7, [rax+128]
+ vpxor ymm13, ymm8, [rax+64]
+ vpxor ymm14, ymm9, [rcx+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant: the YMMWORD at [rdx+288] (offset 32 * 9 for round 9) is applied to ymm15 only
+ vpxor ymm15, ymm15, [rdx+288]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rax+128], ymm2
+ vmovdqu YMMWORD PTR [rax+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 1: rotate left by 28, 20, 3, 45, 61 (each built as vpsrlq by 64-n, vpsllq by n, vpor)
+ vpxor ymm10, ymm8, [r8+64]
+ vpxor ymm11, ymm9, [rax+96]
+ vpxor ymm12, ymm5, [rax+32]
+ vpxor ymm13, ymm6, [rcx+64]
+ vpxor ymm14, ymm7, [rcx]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [rax+96], ymm1
+ vmovdqu YMMWORD PTR [rax+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 2: rotate left by 1, 6, 25, 8, 18 (the 1-bit left shift is done as vpaddq ymm10, ymm10, ymm10)
+ vpxor ymm10, ymm6, [r8+-32]
+ vpxor ymm11, ymm7, [rax]
+ vpxor ymm12, ymm8, [rcx+32]
+ vpxor ymm13, ymm9, [rcx+-32]
+ vpxor ymm14, ymm5, [r8+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [rax], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 3: rotate left by 27, 36, 10, 15, 56
+ vpxor ymm10, ymm9, [rax+-32]
+ vpxor ymm11, ymm5, [rax+-96]
+ vpxor ymm12, ymm6, [rcx+-64]
+ vpxor ymm13, ymm7, [r8]
+ vpxor ymm14, ymm8, [r8+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-32], ymm0
+ vmovdqu YMMWORD PTR [rax+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 4: rotate left by 62, 55, 39, 41, 2; besides being stored, ymm0..ymm4 are read directly by the next round's Calc b
+ vpxor ymm10, ymm7, [rcx+128]
+ vpxor ymm11, ymm8, [rcx+-96]
+ vpxor ymm12, ymm9, [r8+128]
+ vpxor ymm13, ymm5, [r8+-96]
+ vpxor ymm14, ymm6, [rax+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [rax+-64], ymm4
+ ; Round 10: column XORs (b), mix values (t), then five row passes; output word 0 stays in ymm15, the other 24 are stored via rcx/rax/r8. NOTE(review): rotate amounts look like the Keccak-f[1600] rho offsets - confirm against the procedure header
+ ; Calc b[0..4]: ymm10..ymm14 each become the XOR of five values; ymm10 starts from ymm0 ^ ymm15, ymm11..ymm14 from ymm1..ymm4 (left by the preceding Row 4), the other 19 inputs are YMMWORD loads
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm12, ymm12, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm11, ymm1, [rax+-96]
+ vpxor ymm10, ymm10, [rax+-32]
+ vpxor ymm11, ymm11, [rax]
+ vpxor ymm12, ymm12, [rax+32]
+ vpxor ymm13, ymm13, [rax+64]
+ vpxor ymm11, ymm11, [rax+96]
+ vpxor ymm12, ymm12, [rax+128]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm11, ymm11, [r8+96]
+ ; Calc t[0..4]: ymm5 = ymm14 ^ rol1(ymm11), ymm6 = ymm10 ^ rol1(ymm12), ymm7 = ymm11 ^ rol1(ymm13), ymm8 = ymm12 ^ rol1(ymm14), ymm9 = ymm13 ^ rol1(ymm10); rol1(x) = (x shifted right 63) OR (x + x) per 64-bit element
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix: each row XORs ymm5..ymm9 into its inputs, rotates every 64-bit element left, forms out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]) with indices mod 5, and stores to the same addresses it loaded from
+ ; Row 0: ymm10 = ymm5 ^ ymm15 is not rotated; ymm11..ymm14 rotate left by 44, 43, 21, 14; out[0] is kept in ymm15 instead of being stored
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rax+96]
+ vpxor ymm12, ymm7, [rcx+32]
+ vpxor ymm13, ymm8, [r8]
+ vpxor ymm14, ymm9, [rax+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant: the YMMWORD at [rdx+320] (offset 32 * 10 for round 10) is applied to ymm15 only
+ vpxor ymm15, ymm15, [rdx+320]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [rax+-64], ymm4
+ ; Row 1: rotate left by 28, 20, 3, 45, 61 (each built as vpsrlq by 64-n, vpsllq by n, vpor)
+ vpxor ymm10, ymm8, [rax+64]
+ vpxor ymm11, ymm9, [rcx]
+ vpxor ymm12, ymm5, [r8+-32]
+ vpxor ymm13, ymm6, [rax+-96]
+ vpxor ymm14, ymm7, [r8+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+64], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [rax+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 2: rotate left by 1, 6, 25, 8, 18 (the 1-bit left shift is done as vpaddq ymm10, ymm10, ymm10)
+ vpxor ymm10, ymm6, [r8+96]
+ vpxor ymm11, ymm7, [rax+32]
+ vpxor ymm12, ymm8, [rcx+-32]
+ vpxor ymm13, ymm9, [r8+-64]
+ vpxor ymm14, ymm5, [rcx+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [rax+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 3: rotate left by 27, 36, 10, 15, 56
+ vpxor ymm10, ymm9, [rcx+96]
+ vpxor ymm11, ymm5, [r8+64]
+ vpxor ymm12, ymm6, [rax]
+ vpxor ymm13, ymm7, [rcx+-64]
+ vpxor ymm14, ymm8, [r8+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [rax], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 4: rotate left by 62, 55, 39, 41, 2; besides being stored, ymm0..ymm4 are read directly by the next round's Calc b
+ vpxor ymm10, ymm7, [rax+128]
+ vpxor ymm11, ymm8, [rcx+64]
+ vpxor ymm12, ymm9, [r8+32]
+ vpxor ymm13, ymm5, [rax+-32]
+ vpxor ymm14, ymm6, [rcx+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [rax+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Round 11: column XORs (b), mix values (t), then five row passes; output word 0 stays in ymm15, the other 24 are stored via rcx/rax/r8. NOTE(review): rotate amounts look like the Keccak-f[1600] rho offsets - confirm against the procedure header
+ ; Calc b[0..4]: ymm10..ymm14 each become the XOR of five values; ymm10 starts from ymm0 ^ ymm15, ymm11..ymm14 from ymm1..ymm4 (left by the preceding Row 4), the other 19 inputs are YMMWORD loads
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm12, ymm12, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm14, ymm4, [rcx+128]
+ vpxor ymm13, ymm13, [rax+-96]
+ vpxor ymm14, ymm14, [rax+-64]
+ vpxor ymm12, ymm12, [rax]
+ vpxor ymm11, ymm11, [rax+32]
+ vpxor ymm10, ymm10, [rax+64]
+ vpxor ymm11, ymm11, [rax+96]
+ vpxor ymm14, ymm14, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm10, ymm10, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ ; Calc t[0..4]: ymm5 = ymm14 ^ rol1(ymm11), ymm6 = ymm10 ^ rol1(ymm12), ymm7 = ymm11 ^ rol1(ymm13), ymm8 = ymm12 ^ rol1(ymm14), ymm9 = ymm13 ^ rol1(ymm10); rol1(x) = (x shifted right 63) OR (x + x) per 64-bit element
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix: each row XORs ymm5..ymm9 into its inputs, rotates every 64-bit element left, forms out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]) with indices mod 5, and stores to the same addresses it loaded from
+ ; Row 0: ymm10 = ymm5 ^ ymm15 is not rotated; ymm11..ymm14 rotate left by 44, 43, 21, 14; out[0] is kept in ymm15 instead of being stored
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx]
+ vpxor ymm12, ymm7, [rcx+-32]
+ vpxor ymm13, ymm8, [rcx+-64]
+ vpxor ymm14, ymm9, [rcx+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant: the YMMWORD at [rdx+352] (offset 32 * 11 for round 11) is applied to ymm15 only
+ vpxor ymm15, ymm15, [rdx+352]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 1: rotate left by 28, 20, 3, 45, 61 (each built as vpsrlq by 64-n, vpsllq by n, vpor)
+ vpxor ymm10, ymm8, [r8]
+ vpxor ymm11, ymm9, [r8+128]
+ vpxor ymm12, ymm5, [r8+96]
+ vpxor ymm13, ymm6, [r8+64]
+ vpxor ymm14, ymm7, [r8+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 2: rotate left by 1, 6, 25, 8, 18 (the 1-bit left shift is done as vpaddq ymm10, ymm10, ymm10)
+ vpxor ymm10, ymm6, [rax+96]
+ vpxor ymm11, ymm7, [r8+-32]
+ vpxor ymm12, ymm8, [r8+-64]
+ vpxor ymm13, ymm9, [r8+-96]
+ vpxor ymm14, ymm5, [rax+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+96], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [rax+128], ymm4
+ ; Row 3: rotate left by 27, 36, 10, 15, 56
+ vpxor ymm10, ymm9, [rax+-64]
+ vpxor ymm11, ymm5, [rax+64]
+ vpxor ymm12, ymm6, [rax+32]
+ vpxor ymm13, ymm7, [rax]
+ vpxor ymm14, ymm8, [rax+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-64], ymm0
+ vmovdqu YMMWORD PTR [rax+64], ymm1
+ vmovdqu YMMWORD PTR [rax+32], ymm2
+ vmovdqu YMMWORD PTR [rax], ymm3
+ vmovdqu YMMWORD PTR [rax+-32], ymm4
+ ; Row 4: rotate left by 62, 55, 39, 41, 2; besides being stored, ymm0..ymm4 are read directly by the next round's Calc b
+ vpxor ymm10, ymm7, [rcx+32]
+ vpxor ymm11, ymm8, [rax+-96]
+ vpxor ymm12, ymm9, [rcx+128]
+ vpxor ymm13, ymm5, [rcx+96]
+ vpxor ymm14, ymm6, [rcx+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rax+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Round 12: column XORs (b), mix values (t), then five row passes; output word 0 stays in ymm15, the other 24 are stored via rcx/rax/r8. NOTE(review): rotate amounts look like the Keccak-f[1600] rho offsets - confirm against the procedure header
+ ; Calc b[0..4]: ymm10..ymm14 each become the XOR of five values; ymm10 starts from ymm0 ^ ymm15, ymm11..ymm14 from ymm1..ymm4 (left by the preceding Row 4), the other 19 inputs are YMMWORD loads
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm10, ymm10, [rax+-64]
+ vpxor ymm14, ymm14, [rax+-32]
+ vpxor ymm13, ymm13, [rax]
+ vpxor ymm12, ymm12, [rax+32]
+ vpxor ymm11, ymm11, [rax+64]
+ vpxor ymm10, ymm10, [rax+96]
+ vpxor ymm14, ymm14, [rax+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ ; Calc t[0..4]: ymm5 = ymm14 ^ rol1(ymm11), ymm6 = ymm10 ^ rol1(ymm12), ymm7 = ymm11 ^ rol1(ymm13), ymm8 = ymm12 ^ rol1(ymm14), ymm9 = ymm13 ^ rol1(ymm10); rol1(x) = (x shifted right 63) OR (x + x) per 64-bit element
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix: each row XORs ymm5..ymm9 into its inputs, rotates every 64-bit element left, forms out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]) with indices mod 5, and stores to the same addresses it loaded from
+ ; Row 0: ymm10 = ymm5 ^ ymm15 is not rotated; ymm11..ymm14 rotate left by 44, 43, 21, 14; out[0] is kept in ymm15 instead of being stored
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+128]
+ vpxor ymm12, ymm7, [r8+-64]
+ vpxor ymm13, ymm8, [rax]
+ vpxor ymm14, ymm9, [rcx+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant: the YMMWORD at [rdx+384] (offset 32 * 12 for round 12) is applied to ymm15 only
+ vpxor ymm15, ymm15, [rdx+384]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rax], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 1: rotate left by 28, 20, 3, 45, 61 (each built as vpsrlq by 64-n, vpsllq by n, vpor)
+ vpxor ymm10, ymm8, [rcx+-64]
+ vpxor ymm11, ymm9, [r8+32]
+ vpxor ymm12, ymm5, [rax+96]
+ vpxor ymm13, ymm6, [rax+64]
+ vpxor ymm14, ymm7, [rcx+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [rax+96], ymm2
+ vmovdqu YMMWORD PTR [rax+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 2: rotate left by 1, 6, 25, 8, 18 (the 1-bit left shift is done as vpaddq ymm10, ymm10, ymm10)
+ vpxor ymm10, ymm6, [rcx]
+ vpxor ymm11, ymm7, [r8+96]
+ vpxor ymm12, ymm8, [r8+-96]
+ vpxor ymm13, ymm9, [rax+-32]
+ vpxor ymm14, ymm5, [rcx+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [rax+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 3: rotate left by 27, 36, 10, 15, 56
+ vpxor ymm10, ymm9, [rcx+-96]
+ vpxor ymm11, ymm5, [r8]
+ vpxor ymm12, ymm6, [r8+-32]
+ vpxor ymm13, ymm7, [rax+32]
+ vpxor ymm14, ymm8, [rcx+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [rax+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 4: rotate left by 62, 55, 39, 41, 2; besides being stored, ymm0..ymm4 are read directly by the next round's Calc b
+ vpxor ymm10, ymm7, [rcx+-32]
+ vpxor ymm11, ymm8, [r8+64]
+ vpxor ymm12, ymm9, [rax+128]
+ vpxor ymm13, ymm5, [rax+-64]
+ vpxor ymm14, ymm6, [rax+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [rax+128], ymm2
+ vmovdqu YMMWORD PTR [rax+-64], ymm3
+ vmovdqu YMMWORD PTR [rax+-96], ymm4
+ ; Round 13
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm14, ymm4, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm13, ymm3, [rax+-32]
+ vpxor ymm13, ymm13, [rax]
+ vpxor ymm13, ymm13, [rax+32]
+ vpxor ymm13, ymm13, [rax+64]
+ vpxor ymm12, ymm2, [rax+96]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm11, ymm1, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+32]
+ vpxor ymm12, ymm7, [r8+-96]
+ vpxor ymm13, ymm8, [rax+32]
+ vpxor ymm14, ymm9, [rax+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+416]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [rax+32], ymm3
+ vmovdqu YMMWORD PTR [rax+-96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rax]
+ vpxor ymm11, ymm9, [rcx+128]
+ vpxor ymm12, ymm5, [rcx]
+ vpxor ymm13, ymm6, [r8]
+ vpxor ymm14, ymm7, [rax+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [rax+128], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+128]
+ vpxor ymm11, ymm7, [rax+96]
+ vpxor ymm12, ymm8, [rax+-32]
+ vpxor ymm13, ymm9, [rcx+96]
+ vpxor ymm14, ymm5, [rcx+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [rax+96], ymm1
+ vmovdqu YMMWORD PTR [rax+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+64]
+ vpxor ymm11, ymm5, [rcx+-64]
+ vpxor ymm12, ymm6, [r8+96]
+ vpxor ymm13, ymm7, [r8+-32]
+ vpxor ymm14, ymm8, [rax+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [rax+-64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+-64]
+ vpxor ymm11, ymm8, [rax+64]
+ vpxor ymm12, ymm9, [rcx+32]
+ vpxor ymm13, ymm5, [rcx+-96]
+ vpxor ymm14, ymm6, [r8+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [rax+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Round 14
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm13, ymm3, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm14, ymm14, [rax+-96]
+ vpxor ymm14, ymm14, [rax+-64]
+ vpxor ymm12, ymm12, [rax+-32]
+ vpxor ymm10, ymm10, [rax]
+ vpxor ymm13, ymm13, [rax+32]
+ vpxor ymm11, ymm11, [rax+96]
+ vpxor ymm14, ymm14, [rax+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm10, ymm10, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+128]
+ vpxor ymm12, ymm7, [rax+-32]
+ vpxor ymm13, ymm8, [r8+-32]
+ vpxor ymm14, ymm9, [r8+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+448]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rax+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rax+32]
+ vpxor ymm11, ymm9, [rax+128]
+ vpxor ymm12, ymm5, [r8+128]
+ vpxor ymm13, ymm6, [rcx+-64]
+ vpxor ymm14, ymm7, [rcx+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+32], ymm0
+ vmovdqu YMMWORD PTR [rax+128], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+32]
+ vpxor ymm11, ymm7, [rcx]
+ vpxor ymm12, ymm8, [rcx+96]
+ vpxor ymm13, ymm9, [rax+-64]
+ vpxor ymm14, ymm5, [r8+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [rax+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rax+-96]
+ vpxor ymm11, ymm5, [rax]
+ vpxor ymm12, ymm6, [rax+96]
+ vpxor ymm13, ymm7, [r8+96]
+ vpxor ymm14, ymm8, [rcx+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-96], ymm0
+ vmovdqu YMMWORD PTR [rax], ymm1
+ vmovdqu YMMWORD PTR [rax+96], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+-96]
+ vpxor ymm11, ymm8, [r8]
+ vpxor ymm12, ymm9, [rcx+-32]
+ vpxor ymm13, ymm5, [rcx+64]
+ vpxor ymm14, ymm6, [rax+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [rax+64], ymm4
+ ; Round 15
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm14, ymm14, [rcx+32]
+ vpxor ymm12, ymm2, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm10, ymm10, [rax+-96]
+ vpxor ymm13, ymm13, [rax+-64]
+ vpxor ymm12, ymm12, [rax+-32]
+ vpxor ymm11, ymm11, [rax]
+ vpxor ymm10, ymm10, [rax+32]
+ vpxor ymm12, ymm12, [rax+96]
+ vpxor ymm11, ymm11, [rax+128]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rax+128]
+ vpxor ymm12, ymm7, [rcx+96]
+ vpxor ymm13, ymm8, [r8+96]
+ vpxor ymm14, ymm9, [rax+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+480]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [rax+64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+-32]
+ vpxor ymm11, ymm9, [rcx+32]
+ vpxor ymm12, ymm5, [r8+32]
+ vpxor ymm13, ymm6, [rax]
+ vpxor ymm14, ymm7, [rcx+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [rax], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+128]
+ vpxor ymm11, ymm7, [r8+128]
+ vpxor ymm12, ymm8, [rax+-64]
+ vpxor ymm13, ymm9, [rcx+-96]
+ vpxor ymm14, ymm5, [r8+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [rax+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+64]
+ vpxor ymm11, ymm5, [rax+32]
+ vpxor ymm12, ymm6, [rcx]
+ vpxor ymm13, ymm7, [rax+96]
+ vpxor ymm14, ymm8, [rcx+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [rax+32], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [rax+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rax+-32]
+ vpxor ymm11, ymm8, [rcx+-64]
+ vpxor ymm12, ymm9, [r8+-64]
+ vpxor ymm13, ymm5, [rax+-96]
+ vpxor ymm14, ymm6, [r8]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rax+-96], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Round 16
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm11, ymm1, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm12, ymm12, [rax+-64]
+ vpxor ymm13, ymm13, [rax]
+ vpxor ymm11, ymm11, [rax+32]
+ vpxor ymm14, ymm14, [rax+64]
+ vpxor ymm13, ymm13, [rax+96]
+ vpxor ymm11, ymm11, [rax+128]
+ vpxor ymm14, ymm14, [r8+-96]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+32]
+ vpxor ymm12, ymm7, [rax+-64]
+ vpxor ymm13, ymm8, [rax+96]
+ vpxor ymm14, ymm9, [r8]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+512]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rax+-64], ymm2
+ vmovdqu YMMWORD PTR [rax+96], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+96]
+ vpxor ymm11, ymm9, [rcx+-32]
+ vpxor ymm12, ymm5, [rcx+128]
+ vpxor ymm13, ymm6, [rax+32]
+ vpxor ymm14, ymm7, [r8+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [rax+32], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rax+128]
+ vpxor ymm11, ymm7, [r8+32]
+ vpxor ymm12, ymm8, [rcx+-96]
+ vpxor ymm13, ymm9, [rcx+64]
+ vpxor ymm14, ymm5, [rax+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+128], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [rax+-32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rax+64]
+ vpxor ymm11, ymm5, [r8+-32]
+ vpxor ymm12, ymm6, [r8+128]
+ vpxor ymm13, ymm7, [rcx]
+ vpxor ymm14, ymm8, [rax+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+64], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [rax+-96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+96]
+ vpxor ymm11, ymm8, [rax]
+ vpxor ymm12, ymm9, [r8+-96]
+ vpxor ymm13, ymm5, [r8+64]
+ vpxor ymm14, ymm6, [rcx+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [rax], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Round 17
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm11, ymm11, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm14, ymm4, [rax+-96]
+ vpxor ymm12, ymm12, [rax+-64]
+ vpxor ymm14, ymm14, [rax+-32]
+ vpxor ymm13, ymm13, [rax+32]
+ vpxor ymm10, ymm10, [rax+64]
+ vpxor ymm13, ymm13, [rax+96]
+ vpxor ymm10, ymm10, [rax+128]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm10, ymm10, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-32]
+ vpxor ymm12, ymm7, [rcx+-96]
+ vpxor ymm13, ymm8, [rcx]
+ vpxor ymm14, ymm9, [rcx+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+544]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rax+96]
+ vpxor ymm11, ymm9, [r8+-64]
+ vpxor ymm12, ymm5, [rax+128]
+ vpxor ymm13, ymm6, [r8+-32]
+ vpxor ymm14, ymm7, [r8+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+96], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [rax+128], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+32]
+ vpxor ymm11, ymm7, [rcx+128]
+ vpxor ymm12, ymm8, [rcx+64]
+ vpxor ymm13, ymm9, [rax+-96]
+ vpxor ymm14, ymm5, [rcx+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rax+-96], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8]
+ vpxor ymm11, ymm5, [r8+96]
+ vpxor ymm12, ymm6, [r8+32]
+ vpxor ymm13, ymm7, [r8+128]
+ vpxor ymm14, ymm8, [r8+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rax+-64]
+ vpxor ymm11, ymm8, [rax+32]
+ vpxor ymm12, ymm9, [rax+-32]
+ vpxor ymm13, ymm5, [rax+64]
+ vpxor ymm14, ymm6, [rax]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-64], ymm0
+ vmovdqu YMMWORD PTR [rax+32], ymm1
+ vmovdqu YMMWORD PTR [rax+-32], ymm2
+ vmovdqu YMMWORD PTR [rax+64], ymm3
+ vmovdqu YMMWORD PTR [rax], ymm4
+ ; Round 18
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm10, ymm10, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm13, ymm13, [rax+-96]
+ vpxor ymm10, ymm10, [rax+96]
+ vpxor ymm12, ymm12, [rax+128]
+ vpxor ymm14, ymm14, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm13, ymm13, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-64]
+ vpxor ymm12, ymm7, [rcx+64]
+ vpxor ymm13, ymm8, [r8+128]
+ vpxor ymm14, ymm9, [rax]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+576]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [rax], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx]
+ vpxor ymm11, ymm9, [r8+-96]
+ vpxor ymm12, ymm5, [rcx+32]
+ vpxor ymm13, ymm6, [r8+96]
+ vpxor ymm14, ymm7, [rax+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [rax+-32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+-32]
+ vpxor ymm11, ymm7, [rax+128]
+ vpxor ymm12, ymm8, [rax+-96]
+ vpxor ymm13, ymm9, [r8+64]
+ vpxor ymm14, ymm5, [rax+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [rax+128], ymm1
+ vmovdqu YMMWORD PTR [rax+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rax+-64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+-64]
+ vpxor ymm11, ymm5, [rax+96]
+ vpxor ymm12, ymm6, [rcx+128]
+ vpxor ymm13, ymm7, [r8+32]
+ vpxor ymm14, ymm8, [rax+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [rax+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [rax+64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+-96]
+ vpxor ymm11, ymm8, [r8+-32]
+ vpxor ymm12, ymm9, [rcx+96]
+ vpxor ymm13, ymm5, [r8]
+ vpxor ymm14, ymm6, [rax+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [rax+32], ymm4
+ ; Round 19
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm12, ymm2, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm12, ymm12, [rax+-96]
+ vpxor ymm14, ymm4, [rax+-64]
+ vpxor ymm14, ymm14, [rax+-32]
+ vpxor ymm14, ymm14, [rax]
+ vpxor ymm14, ymm14, [rax+64]
+ vpxor ymm11, ymm1, [rax+96]
+ vpxor ymm11, ymm11, [rax+128]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm13, ymm3, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm13, ymm13, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-96]
+ vpxor ymm12, ymm7, [rax+-96]
+ vpxor ymm13, ymm8, [r8+32]
+ vpxor ymm14, ymm9, [rax+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+608]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rax+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [rax+32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+128]
+ vpxor ymm11, ymm9, [rax+-32]
+ vpxor ymm12, ymm5, [rcx+-32]
+ vpxor ymm13, ymm6, [rax+96]
+ vpxor ymm14, ymm7, [rcx+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [rax+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rax+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+-64]
+ vpxor ymm11, ymm7, [rcx+32]
+ vpxor ymm12, ymm8, [r8+64]
+ vpxor ymm13, ymm9, [rax+64]
+ vpxor ymm14, ymm5, [rcx+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [rax+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rax]
+ vpxor ymm11, ymm5, [rcx]
+ vpxor ymm12, ymm6, [rax+128]
+ vpxor ymm13, ymm7, [rcx+128]
+ vpxor ymm14, ymm8, [r8]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rax+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+64]
+ vpxor ymm11, ymm8, [r8+96]
+ vpxor ymm12, ymm9, [rax+-64]
+ vpxor ymm13, ymm5, [rcx+-64]
+ vpxor ymm14, ymm6, [r8+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rax+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Round 20
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm11, ymm11, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm13, ymm3, [rcx+128]
+ vpxor ymm12, ymm12, [rax+-96]
+ vpxor ymm11, ymm11, [rax+-32]
+ vpxor ymm10, ymm10, [rax]
+ vpxor ymm14, ymm14, [rax+32]
+ vpxor ymm13, ymm13, [rax+64]
+ vpxor ymm13, ymm13, [rax+96]
+ vpxor ymm12, ymm12, [rax+128]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm10, ymm10, [r8+-64]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm10, ymm10, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rax+-32]
+ vpxor ymm12, ymm7, [r8+64]
+ vpxor ymm13, ymm8, [rcx+128]
+ vpxor ymm14, ymm9, [r8+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+640]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-32], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+32]
+ vpxor ymm11, ymm9, [rcx+96]
+ vpxor ymm12, ymm5, [r8+-64]
+ vpxor ymm13, ymm6, [rcx]
+ vpxor ymm14, ymm7, [rax+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [rax+-64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+-96]
+ vpxor ymm11, ymm7, [rcx+-32]
+ vpxor ymm12, ymm8, [rax+64]
+ vpxor ymm13, ymm9, [r8]
+ vpxor ymm14, ymm5, [rcx+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rax+64], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rax+32]
+ vpxor ymm11, ymm5, [r8+128]
+ vpxor ymm12, ymm6, [rcx+32]
+ vpxor ymm13, ymm7, [rax+128]
+ vpxor ymm14, ymm8, [rcx+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+32], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [rax+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rax+-96]
+ vpxor ymm11, ymm8, [rax+96]
+ vpxor ymm12, ymm9, [rcx+-96]
+ vpxor ymm13, ymm5, [rax]
+ vpxor ymm14, ymm6, [r8+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-96], ymm0
+ vmovdqu YMMWORD PTR [rax+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rax], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Round 21
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm12, ymm2, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm13, ymm13, [rcx+128]
+ vpxor ymm14, ymm14, [rax+-64]
+ vpxor ymm11, ymm11, [rax+-32]
+ vpxor ymm10, ymm10, [rax+32]
+ vpxor ymm12, ymm12, [rax+64]
+ vpxor ymm13, ymm13, [rax+128]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm11, ymm11, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+96]
+ vpxor ymm12, ymm7, [rax+64]
+ vpxor ymm13, ymm8, [rax+128]
+ vpxor ymm14, ymm9, [r8+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+672]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [rax+64], ymm2
+ vmovdqu YMMWORD PTR [rax+128], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+128]
+ vpxor ymm11, ymm9, [rax+-64]
+ vpxor ymm12, ymm5, [r8+-96]
+ vpxor ymm13, ymm6, [r8+128]
+ vpxor ymm14, ymm7, [rcx+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [rax+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rax+-32]
+ vpxor ymm11, ymm7, [r8+-64]
+ vpxor ymm12, ymm8, [r8]
+ vpxor ymm13, ymm9, [rcx+-64]
+ vpxor ymm14, ymm5, [rax+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [rax+-96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+-32]
+ vpxor ymm11, ymm5, [r8+32]
+ vpxor ymm12, ymm6, [rcx+-32]
+ vpxor ymm13, ymm7, [rcx+32]
+ vpxor ymm14, ymm8, [rax]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [rax], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+64]
+ vpxor ymm11, ymm8, [rcx]
+ vpxor ymm12, ymm9, [rcx+64]
+ vpxor ymm13, ymm5, [rax+32]
+ vpxor ymm14, ymm6, [rax+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rax+32], ymm3
+ vmovdqu YMMWORD PTR [rax+96], ymm4
+ ; Round 22
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm11, ymm1, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm14, ymm14, [rax+-96]
+ vpxor ymm11, ymm11, [rax+-64]
+ vpxor ymm10, ymm10, [rax+-32]
+ vpxor ymm14, ymm14, [rax]
+ vpxor ymm12, ymm12, [rax+64]
+ vpxor ymm13, ymm13, [rax+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm13, ymm13, [r8+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rax+-64]
+ vpxor ymm12, ymm7, [r8]
+ vpxor ymm13, ymm8, [rcx+32]
+ vpxor ymm14, ymm9, [rax+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+704]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-64], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [rax+96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rax+128]
+ vpxor ymm11, ymm9, [rcx+-96]
+ vpxor ymm12, ymm5, [rax+-32]
+ vpxor ymm13, ymm6, [r8+32]
+ vpxor ymm14, ymm7, [rcx+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [rax+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+96]
+ vpxor ymm11, ymm7, [r8+-96]
+ vpxor ymm12, ymm8, [rcx+-64]
+ vpxor ymm13, ymm9, [rax]
+ vpxor ymm14, ymm5, [r8+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [rax], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+96]
+ vpxor ymm11, ymm5, [rcx+128]
+ vpxor ymm12, ymm6, [r8+-64]
+ vpxor ymm13, ymm7, [rcx+-32]
+ vpxor ymm14, ymm8, [rax+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [rax+32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rax+64]
+ vpxor ymm11, ymm8, [r8+128]
+ vpxor ymm12, ymm9, [rax+-96]
+ vpxor ymm13, ymm5, [r8+-32]
+ vpxor ymm14, ymm6, [rcx]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+64], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [rax+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Round 23
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm14, ymm4, [rcx+64]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm11, ymm11, [rax+-64]
+ vpxor ymm12, ymm12, [rax+-32]
+ vpxor ymm13, ymm13, [rax]
+ vpxor ymm14, ymm14, [rax+32]
+ vpxor ymm14, ymm14, [rax+96]
+ vpxor ymm10, ymm10, [rax+128]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm10, ymm10, [r8+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-96]
+ vpxor ymm12, ymm7, [rcx+-64]
+ vpxor ymm13, ymm8, [rcx+-32]
+ vpxor ymm14, ymm9, [rcx]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rdx+736]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+32]
+ vpxor ymm11, ymm9, [rcx+64]
+ vpxor ymm12, ymm5, [rcx+96]
+ vpxor ymm13, ymm6, [rcx+128]
+ vpxor ymm14, ymm7, [rax+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [rax+-96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rax+-64]
+ vpxor ymm11, ymm7, [rax+-32]
+ vpxor ymm12, ymm8, [rax]
+ vpxor ymm13, ymm9, [rax+32]
+ vpxor ymm14, ymm5, [rax+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+-64], ymm0
+ vmovdqu YMMWORD PTR [rax+-32], ymm1
+ vmovdqu YMMWORD PTR [rax], ymm2
+ vmovdqu YMMWORD PTR [rax+32], ymm3
+ vmovdqu YMMWORD PTR [rax+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rax+96]
+ vpxor ymm11, ymm5, [rax+128]
+ vpxor ymm12, ymm6, [r8+-96]
+ vpxor ymm13, ymm7, [r8+-64]
+ vpxor ymm14, ymm8, [r8+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rax+96], ymm0
+ vmovdqu YMMWORD PTR [rax+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8]
+ vpxor ymm11, ymm8, [r8+32]
+ vpxor ymm12, ymm9, [r8+64]
+ vpxor ymm13, ymm5, [r8+96]
+ vpxor ymm14, ymm6, [r8+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ sub rcx, 128
+ vmovdqu YMMWORD PTR [rcx], ymm15
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+sha3_blocksx4_avx2 ENDP
+_TEXT ENDS
+; Constant table for sha3_128_blocksx4_seed_avx2: four 64-bit lanes, each with
+; only bit 63 set (32 bytes in total). That routine loads the whole table into
+; ymm5 with a single YMMWORD load, stores it at byte offset 640 of the state
+; buffer passed in rcx, and XORs it with the broadcast first seed word to form
+; ymm10 for round 0.
+; NOTE(review): presumably this is the closing padding bit of the SHAKE128
+; rate block, one copy per interleaved state - confirm against the C caller.
+_DATA SEGMENT
+; Only 16-byte alignment is requested; the 32-byte load of this table is done
+; with vmovdqu, which does not require an aligned address.
+ALIGN 16
+L_sha3_128_blockx4_seed_avx2_end_mark QWORD 8000000000000000h, 8000000000000000h
+ QWORD 8000000000000000h, 8000000000000000h
+; Holds the address of the table above.
+ptr_L_sha3_128_blockx4_seed_avx2_end_mark QWORD L_sha3_128_blockx4_seed_avx2_end_mark
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+sha3_128_blocksx4_seed_avx2 PROC
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ mov rax, QWORD PTR [ptr_L_sha3_x4_avx2_r]
+ mov r8, rcx
+ mov r9, rcx
+ vpbroadcastq ymm15, QWORD PTR [rdx]
+ add rcx, 128
+ vpbroadcastq ymm11, QWORD PTR [rdx+8]
+ add r8, 384
+ vpbroadcastq ymm12, QWORD PTR [rdx+16]
+ add r9, 640
+ vpbroadcastq ymm13, QWORD PTR [rdx+24]
+ vmovdqu ymm5, YMMWORD PTR L_sha3_128_blockx4_seed_avx2_end_mark
+ vpxor ymm6, ymm6, ymm6
+ vmovdqu YMMWORD PTR [rcx+-96], ymm11
+ vmovdqu YMMWORD PTR [rcx+-64], ymm12
+ vmovdqu YMMWORD PTR [rcx+-32], ymm13
+ vmovdqu ymm14, YMMWORD PTR [rcx]
+ vmovdqu YMMWORD PTR [rcx+32], ymm6
+ vmovdqu YMMWORD PTR [rcx+64], ymm6
+ vmovdqu YMMWORD PTR [rcx+96], ymm6
+ vmovdqu YMMWORD PTR [rcx+128], ymm6
+ vmovdqu YMMWORD PTR [r8+-96], ymm6
+ vmovdqu YMMWORD PTR [r8+-64], ymm6
+ vmovdqu YMMWORD PTR [r8+-32], ymm6
+ vmovdqu YMMWORD PTR [r8], ymm6
+ vmovdqu YMMWORD PTR [r8+32], ymm6
+ vmovdqu YMMWORD PTR [r8+64], ymm6
+ vmovdqu YMMWORD PTR [r8+96], ymm6
+ vmovdqu YMMWORD PTR [r8+128], ymm6
+ vmovdqu YMMWORD PTR [r9+-96], ymm6
+ vmovdqu YMMWORD PTR [r9+-64], ymm6
+ vmovdqu YMMWORD PTR [r9+-32], ymm6
+ vmovdqu YMMWORD PTR [r9], ymm5
+ vmovdqu YMMWORD PTR [r9+32], ymm6
+ vmovdqu YMMWORD PTR [r9+64], ymm6
+ vmovdqu YMMWORD PTR [r9+96], ymm6
+ vmovdqu YMMWORD PTR [r9+128], ymm6
+ vpxor ymm10, ymm15, ymm5
+ ; Round 0
+ ; Calc b[0..4]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+64]
+ vpxor ymm12, ymm7, [r8]
+ vpxor ymm13, ymm8, [r9+-64]
+ vpxor ymm14, ymm9, [r9+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+-32]
+ vpxor ymm11, ymm9, [r8+-96]
+ vpxor ymm12, ymm5, [r8+-64]
+ vpxor ymm13, ymm6, [r8+128]
+ vpxor ymm14, ymm7, [r9+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+-96]
+ vpxor ymm11, ymm7, [rcx+96]
+ vpxor ymm12, ymm8, [r8+32]
+ vpxor ymm13, ymm9, [r9+-32]
+ vpxor ymm14, ymm5, [r9]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx]
+ vpxor ymm11, ymm5, [rcx+32]
+ vpxor ymm12, ymm6, [r8+-32]
+ vpxor ymm13, ymm7, [r9+-96]
+ vpxor ymm14, ymm8, [r9+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+-64]
+ vpxor ymm11, ymm8, [rcx+128]
+ vpxor ymm12, ymm9, [r8+64]
+ vpxor ymm13, ymm5, [r8+96]
+ vpxor ymm14, ymm6, [r9+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Round 1
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm11, ymm1, [rcx+32]
+ vpxor ymm11, ymm11, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm12, ymm2, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm13, ymm3, [r8+128]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-64]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm14, ymm4, [r9]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm14, ymm14, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-96]
+ vpxor ymm12, ymm7, [r8+32]
+ vpxor ymm13, ymm8, [r9+-96]
+ vpxor ymm14, ymm9, [r9+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+32]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+-64]
+ vpxor ymm11, ymm9, [r9+64]
+ vpxor ymm12, ymm5, [rcx+-96]
+ vpxor ymm13, ymm6, [rcx+32]
+ vpxor ymm14, ymm7, [r8+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+64]
+ vpxor ymm11, ymm7, [r8+-64]
+ vpxor ymm12, ymm8, [r9+-32]
+ vpxor ymm13, ymm9, [r9+96]
+ vpxor ymm14, ymm5, [rcx+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+128]
+ vpxor ymm11, ymm5, [rcx+-32]
+ vpxor ymm12, ymm6, [rcx+96]
+ vpxor ymm13, ymm7, [r8+-32]
+ vpxor ymm14, ymm8, [r8+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8]
+ vpxor ymm11, ymm8, [r8+128]
+ vpxor ymm12, ymm9, [r9]
+ vpxor ymm13, ymm5, [rcx]
+ vpxor ymm14, ymm6, [rcx+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Round 2
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+96]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm10, ymm10, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+64]
+ vpxor ymm12, ymm7, [r9+-32]
+ vpxor ymm13, ymm8, [r8+-32]
+ vpxor ymm14, ymm9, [rcx+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+64]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+-96]
+ vpxor ymm11, ymm9, [r8+64]
+ vpxor ymm12, ymm5, [rcx+64]
+ vpxor ymm13, ymm6, [rcx+-32]
+ vpxor ymm14, ymm7, [r9]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+-96]
+ vpxor ymm11, ymm7, [rcx+-96]
+ vpxor ymm12, ymm8, [r9+96]
+ vpxor ymm13, ymm9, [r8+96]
+ vpxor ymm14, ymm5, [r8]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+32]
+ vpxor ymm11, ymm5, [r9+-64]
+ vpxor ymm12, ymm6, [r8+-64]
+ vpxor ymm13, ymm7, [rcx+96]
+ vpxor ymm14, ymm8, [rcx]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+32]
+ vpxor ymm11, ymm8, [rcx+32]
+ vpxor ymm12, ymm9, [rcx+-64]
+ vpxor ymm13, ymm5, [r9+128]
+ vpxor ymm14, ymm6, [r8+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Round 3
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm12, ymm2, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm10, ymm10, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm14, ymm14, [r9]
+ vpxor ymm10, ymm10, [r9+32]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm12, ymm12, [r9+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+64]
+ vpxor ymm12, ymm7, [r9+96]
+ vpxor ymm13, ymm8, [rcx+96]
+ vpxor ymm14, ymm9, [r8+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+96]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+-32]
+ vpxor ymm11, ymm9, [r9]
+ vpxor ymm12, ymm5, [r8+-96]
+ vpxor ymm13, ymm6, [r9+-64]
+ vpxor ymm14, ymm7, [rcx+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+64]
+ vpxor ymm11, ymm7, [rcx+64]
+ vpxor ymm12, ymm8, [r8+96]
+ vpxor ymm13, ymm9, [rcx]
+ vpxor ymm14, ymm5, [r8+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+128]
+ vpxor ymm11, ymm5, [r9+-96]
+ vpxor ymm12, ymm6, [rcx+-96]
+ vpxor ymm13, ymm7, [r8+-64]
+ vpxor ymm14, ymm8, [r9+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+-32]
+ vpxor ymm11, ymm8, [rcx+-32]
+ vpxor ymm12, ymm9, [r8]
+ vpxor ymm13, ymm5, [r9+32]
+ vpxor ymm14, ymm6, [rcx+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Round 4
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm11, ymm1, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-64]
+ vpxor ymm11, ymm11, [r9]
+ vpxor ymm10, ymm10, [r9+64]
+ vpxor ymm12, ymm12, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9]
+ vpxor ymm12, ymm7, [r8+96]
+ vpxor ymm13, ymm8, [r8+-64]
+ vpxor ymm14, ymm9, [rcx+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+128]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+96]
+ vpxor ymm11, ymm9, [rcx+-64]
+ vpxor ymm12, ymm5, [r9+64]
+ vpxor ymm13, ymm6, [r9+-96]
+ vpxor ymm14, ymm7, [r8]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+64]
+ vpxor ymm11, ymm7, [r8+-96]
+ vpxor ymm12, ymm8, [rcx]
+ vpxor ymm13, ymm9, [r9+128]
+ vpxor ymm14, ymm5, [r9+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+128]
+ vpxor ymm11, ymm5, [r8+-32]
+ vpxor ymm12, ymm6, [rcx+64]
+ vpxor ymm13, ymm7, [rcx+-96]
+ vpxor ymm14, ymm8, [r9+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+96]
+ vpxor ymm11, ymm8, [r9+-64]
+ vpxor ymm12, ymm9, [r8+32]
+ vpxor ymm13, ymm5, [rcx+128]
+ vpxor ymm14, ymm6, [rcx+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Round 5
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm14, ymm4, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm10, ymm10, [r8+128]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm14, ymm14, [r9+-32]
+ vpxor ymm11, ymm11, [r9]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-64]
+ vpxor ymm12, ymm7, [rcx]
+ vpxor ymm13, ymm8, [rcx+-96]
+ vpxor ymm14, ymm9, [rcx+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+160]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+-64]
+ vpxor ymm11, ymm9, [r8]
+ vpxor ymm12, ymm5, [r8+64]
+ vpxor ymm13, ymm6, [r8+-32]
+ vpxor ymm14, ymm7, [r8+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9]
+ vpxor ymm11, ymm7, [r9+64]
+ vpxor ymm12, ymm8, [r9+128]
+ vpxor ymm13, ymm9, [r9+32]
+ vpxor ymm14, ymm5, [r9+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+32]
+ vpxor ymm11, ymm5, [rcx+96]
+ vpxor ymm12, ymm6, [r8+-96]
+ vpxor ymm13, ymm7, [rcx+64]
+ vpxor ymm14, ymm8, [rcx+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+96]
+ vpxor ymm11, ymm8, [r9+-96]
+ vpxor ymm12, ymm9, [r9+-32]
+ vpxor ymm13, ymm5, [r8+128]
+ vpxor ymm14, ymm6, [r9+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Round 6
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm10, ymm10, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm10, ymm10, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm10, ymm10, [r9]
+ vpxor ymm13, ymm13, [r9+32]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm14, ymm14, [r9+96]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8]
+ vpxor ymm12, ymm7, [r9+128]
+ vpxor ymm13, ymm8, [rcx+64]
+ vpxor ymm14, ymm9, [r9+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+192]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+-96]
+ vpxor ymm11, ymm9, [r8+32]
+ vpxor ymm12, ymm5, [r9]
+ vpxor ymm13, ymm6, [rcx+96]
+ vpxor ymm14, ymm7, [r9+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+-64]
+ vpxor ymm11, ymm7, [r8+64]
+ vpxor ymm12, ymm8, [r9+32]
+ vpxor ymm13, ymm9, [rcx+128]
+ vpxor ymm14, ymm5, [r8+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+-32]
+ vpxor ymm11, ymm5, [r8+-64]
+ vpxor ymm12, ymm6, [r9+64]
+ vpxor ymm13, ymm7, [r8+-96]
+ vpxor ymm14, ymm8, [r8+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx]
+ vpxor ymm11, ymm8, [r8+-32]
+ vpxor ymm12, ymm9, [r9+96]
+ vpxor ymm13, ymm5, [rcx+32]
+ vpxor ymm14, ymm6, [r9+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Round 7
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm13, ymm13, [rcx+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm11, ymm1, [r8+-64]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm14, ymm4, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm14, ymm14, [r9+-32]
+ vpxor ymm12, ymm2, [r9]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+32]
+ vpxor ymm12, ymm7, [r9+32]
+ vpxor ymm13, ymm8, [r8+-96]
+ vpxor ymm14, ymm9, [r9+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+224]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+64]
+ vpxor ymm11, ymm9, [r9+-32]
+ vpxor ymm12, ymm5, [rcx+-64]
+ vpxor ymm13, ymm6, [r8+-64]
+ vpxor ymm14, ymm7, [r9+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8]
+ vpxor ymm11, ymm7, [r9]
+ vpxor ymm12, ymm8, [rcx+128]
+ vpxor ymm13, ymm9, [r8+128]
+ vpxor ymm14, ymm5, [rcx]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+-64]
+ vpxor ymm11, ymm5, [rcx+-96]
+ vpxor ymm12, ymm6, [r8+64]
+ vpxor ymm13, ymm7, [r9+64]
+ vpxor ymm14, ymm8, [rcx+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+128]
+ vpxor ymm11, ymm8, [rcx+96]
+ vpxor ymm12, ymm9, [r8+96]
+ vpxor ymm13, ymm5, [rcx+-32]
+ vpxor ymm14, ymm6, [r8+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Round 8
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm14, ymm14, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm13, ymm3, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm13, ymm13, [r8+128]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-64]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm11, ymm11, [r9]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm14, ymm14, [r9+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+-32]
+ vpxor ymm12, ymm7, [rcx+128]
+ vpxor ymm13, ymm8, [r9+64]
+ vpxor ymm14, ymm9, [r8+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+256]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+-96]
+ vpxor ymm11, ymm9, [r9+96]
+ vpxor ymm12, ymm5, [r8]
+ vpxor ymm13, ymm6, [rcx+-96]
+ vpxor ymm14, ymm7, [r8+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+32]
+ vpxor ymm11, ymm7, [rcx+-64]
+ vpxor ymm12, ymm8, [r8+128]
+ vpxor ymm13, ymm9, [rcx+32]
+ vpxor ymm14, ymm5, [r9+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+-96]
+ vpxor ymm11, ymm5, [rcx+64]
+ vpxor ymm12, ymm6, [r9]
+ vpxor ymm13, ymm7, [r8+64]
+ vpxor ymm14, ymm8, [rcx+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+32]
+ vpxor ymm11, ymm8, [r8+-64]
+ vpxor ymm12, ymm9, [rcx]
+ vpxor ymm13, ymm5, [r9+-64]
+ vpxor ymm14, ymm6, [rcx+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Round 9
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm11, ymm11, [rcx+64]
+ vpxor ymm12, ymm2, [rcx+128]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm10, ymm10, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm12, ymm12, [r9]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm11, ymm11, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+96]
+ vpxor ymm12, ymm7, [r8+128]
+ vpxor ymm13, ymm8, [r8+64]
+ vpxor ymm14, ymm9, [rcx+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+288]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+64]
+ vpxor ymm11, ymm9, [r8+96]
+ vpxor ymm12, ymm5, [r8+32]
+ vpxor ymm13, ymm6, [rcx+64]
+ vpxor ymm14, ymm7, [rcx]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+-32]
+ vpxor ymm11, ymm7, [r8]
+ vpxor ymm12, ymm8, [rcx+32]
+ vpxor ymm13, ymm9, [rcx+-32]
+ vpxor ymm14, ymm5, [r9+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+-32]
+ vpxor ymm11, ymm5, [r8+-96]
+ vpxor ymm12, ymm6, [rcx+-64]
+ vpxor ymm13, ymm7, [r9]
+ vpxor ymm14, ymm8, [r9+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+128]
+ vpxor ymm11, ymm8, [rcx+-96]
+ vpxor ymm12, ymm9, [r9+128]
+ vpxor ymm13, ymm5, [r9+-96]
+ vpxor ymm14, ymm6, [r8+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Round 10
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm12, ymm12, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm11, ymm1, [r8+-96]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm10, ymm10, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm10, ymm10, [r9+64]
+ vpxor ymm11, ymm11, [r9+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+96]
+ vpxor ymm12, ymm7, [rcx+32]
+ vpxor ymm13, ymm8, [r9]
+ vpxor ymm14, ymm9, [r8+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+320]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+64]
+ vpxor ymm11, ymm9, [rcx]
+ vpxor ymm12, ymm5, [r9+-32]
+ vpxor ymm13, ymm6, [r8+-96]
+ vpxor ymm14, ymm7, [r9+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+96]
+ vpxor ymm11, ymm7, [r8+32]
+ vpxor ymm12, ymm8, [rcx+-32]
+ vpxor ymm13, ymm9, [r9+-64]
+ vpxor ymm14, ymm5, [rcx+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+96]
+ vpxor ymm11, ymm5, [r9+64]
+ vpxor ymm12, ymm6, [r8]
+ vpxor ymm13, ymm7, [rcx+-64]
+ vpxor ymm14, ymm8, [r9+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+128]
+ vpxor ymm11, ymm8, [rcx+64]
+ vpxor ymm12, ymm9, [r9+32]
+ vpxor ymm13, ymm5, [r8+-32]
+ vpxor ymm14, ymm6, [rcx+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Round 11
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm12, ymm12, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm14, ymm4, [rcx+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm10, ymm10, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx]
+ vpxor ymm12, ymm7, [rcx+-32]
+ vpxor ymm13, ymm8, [rcx+-64]
+ vpxor ymm14, ymm9, [rcx+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+352]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9]
+ vpxor ymm11, ymm9, [r9+128]
+ vpxor ymm12, ymm5, [r9+96]
+ vpxor ymm13, ymm6, [r9+64]
+ vpxor ymm14, ymm7, [r9+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+96]
+ vpxor ymm11, ymm7, [r9+-32]
+ vpxor ymm12, ymm8, [r9+-64]
+ vpxor ymm13, ymm9, [r9+-96]
+ vpxor ymm14, ymm5, [r8+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+-64]
+ vpxor ymm11, ymm5, [r8+64]
+ vpxor ymm12, ymm6, [r8+32]
+ vpxor ymm13, ymm7, [r8]
+ vpxor ymm14, ymm8, [r8+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+32]
+ vpxor ymm11, ymm8, [r8+-96]
+ vpxor ymm12, ymm9, [rcx+128]
+ vpxor ymm13, ymm5, [rcx+96]
+ vpxor ymm14, ymm6, [rcx+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Round 12
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm10, ymm10, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm10, ymm10, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm10, ymm10, [r9]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm12, ymm12, [r9+96]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+128]
+ vpxor ymm12, ymm7, [r9+-64]
+ vpxor ymm13, ymm8, [r8]
+ vpxor ymm14, ymm9, [rcx+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+384]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+-64]
+ vpxor ymm11, ymm9, [r9+32]
+ vpxor ymm12, ymm5, [r8+96]
+ vpxor ymm13, ymm6, [r8+64]
+ vpxor ymm14, ymm7, [rcx+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx]
+ vpxor ymm11, ymm7, [r9+96]
+ vpxor ymm12, ymm8, [r9+-96]
+ vpxor ymm13, ymm9, [r8+-32]
+ vpxor ymm14, ymm5, [rcx+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+-96]
+ vpxor ymm11, ymm5, [r9]
+ vpxor ymm12, ymm6, [r9+-32]
+ vpxor ymm13, ymm7, [r8+32]
+ vpxor ymm14, ymm8, [rcx+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+-32]
+ vpxor ymm11, ymm8, [r9+64]
+ vpxor ymm12, ymm9, [r8+128]
+ vpxor ymm13, ymm5, [r8+-64]
+ vpxor ymm14, ymm6, [r8+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Round 13
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm14, ymm4, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm13, ymm3, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm12, ymm2, [r8+96]
+ vpxor ymm12, ymm12, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm11, ymm1, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm11, ymm11, [r9+96]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+32]
+ vpxor ymm12, ymm7, [r9+-96]
+ vpxor ymm13, ymm8, [r8+32]
+ vpxor ymm14, ymm9, [r8+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+416]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8]
+ vpxor ymm11, ymm9, [rcx+128]
+ vpxor ymm12, ymm5, [rcx]
+ vpxor ymm13, ymm6, [r9]
+ vpxor ymm14, ymm7, [r8+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+128]
+ vpxor ymm11, ymm7, [r8+96]
+ vpxor ymm12, ymm8, [r8+-32]
+ vpxor ymm13, ymm9, [rcx+96]
+ vpxor ymm14, ymm5, [rcx+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+64]
+ vpxor ymm11, ymm5, [rcx+-64]
+ vpxor ymm12, ymm6, [r9+96]
+ vpxor ymm13, ymm7, [r9+-32]
+ vpxor ymm14, ymm8, [r8+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+-64]
+ vpxor ymm11, ymm8, [r8+64]
+ vpxor ymm12, ymm9, [rcx+32]
+ vpxor ymm13, ymm5, [rcx+-96]
+ vpxor ymm14, ymm6, [r9+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Round 14
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm13, ymm3, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm14, ymm14, [r8+-96]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm12, ymm12, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm12, ymm12, [r9+96]
+ vpxor ymm10, ymm10, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+128]
+ vpxor ymm12, ymm7, [r8+-32]
+ vpxor ymm13, ymm8, [r9+-32]
+ vpxor ymm14, ymm9, [r9+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+448]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+32]
+ vpxor ymm11, ymm9, [r8+128]
+ vpxor ymm12, ymm5, [r9+128]
+ vpxor ymm13, ymm6, [rcx+-64]
+ vpxor ymm14, ymm7, [rcx+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+32]
+ vpxor ymm11, ymm7, [rcx]
+ vpxor ymm12, ymm8, [rcx+96]
+ vpxor ymm13, ymm9, [r8+-64]
+ vpxor ymm14, ymm5, [r9+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+-96]
+ vpxor ymm11, ymm5, [r8]
+ vpxor ymm12, ymm6, [r8+96]
+ vpxor ymm13, ymm7, [r9+96]
+ vpxor ymm14, ymm8, [rcx+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+-96]
+ vpxor ymm11, ymm8, [r9]
+ vpxor ymm12, ymm9, [rcx+-32]
+ vpxor ymm13, ymm5, [rcx+64]
+ vpxor ymm14, ymm6, [r8+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Round 15
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm14, ymm14, [rcx+32]
+ vpxor ymm12, ymm2, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm10, ymm10, [r9+32]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+128]
+ vpxor ymm12, ymm7, [rcx+96]
+ vpxor ymm13, ymm8, [r9+96]
+ vpxor ymm14, ymm9, [r8+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+480]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+-32]
+ vpxor ymm11, ymm9, [rcx+32]
+ vpxor ymm12, ymm5, [r9+32]
+ vpxor ymm13, ymm6, [r8]
+ vpxor ymm14, ymm7, [rcx+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+128]
+ vpxor ymm11, ymm7, [r9+128]
+ vpxor ymm12, ymm8, [r8+-64]
+ vpxor ymm13, ymm9, [rcx+-96]
+ vpxor ymm14, ymm5, [r9+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+64]
+ vpxor ymm11, ymm5, [r8+32]
+ vpxor ymm12, ymm6, [rcx]
+ vpxor ymm13, ymm7, [r8+96]
+ vpxor ymm14, ymm8, [rcx+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+-32]
+ vpxor ymm11, ymm8, [rcx+-64]
+ vpxor ymm12, ymm9, [r9+-64]
+ vpxor ymm13, ymm5, [r8+-96]
+ vpxor ymm14, ymm6, [r9]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Round 16
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm11, ymm1, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-32]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm10, ymm10, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+32]
+ vpxor ymm12, ymm7, [r8+-64]
+ vpxor ymm13, ymm8, [r8+96]
+ vpxor ymm14, ymm9, [r9]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+512]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+96]
+ vpxor ymm11, ymm9, [rcx+-32]
+ vpxor ymm12, ymm5, [rcx+128]
+ vpxor ymm13, ymm6, [r8+32]
+ vpxor ymm14, ymm7, [r9+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+128]
+ vpxor ymm11, ymm7, [r9+32]
+ vpxor ymm12, ymm8, [rcx+-96]
+ vpxor ymm13, ymm9, [rcx+64]
+ vpxor ymm14, ymm5, [r8+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+64]
+ vpxor ymm11, ymm5, [r9+-32]
+ vpxor ymm12, ymm6, [r9+128]
+ vpxor ymm13, ymm7, [rcx]
+ vpxor ymm14, ymm8, [r8+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+96]
+ vpxor ymm11, ymm8, [r8]
+ vpxor ymm12, ymm9, [r9+-96]
+ vpxor ymm13, ymm5, [r9+64]
+ vpxor ymm14, ymm6, [rcx+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Round 17
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm11, ymm11, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm14, ymm4, [r8+-96]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm10, ymm10, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm14, ymm14, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm10, ymm10, [r9+96]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-32]
+ vpxor ymm12, ymm7, [rcx+-96]
+ vpxor ymm13, ymm8, [rcx]
+ vpxor ymm14, ymm9, [rcx+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+544]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+96]
+ vpxor ymm11, ymm9, [r9+-64]
+ vpxor ymm12, ymm5, [r8+128]
+ vpxor ymm13, ymm6, [r9+-32]
+ vpxor ymm14, ymm7, [r9+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+32]
+ vpxor ymm11, ymm7, [rcx+128]
+ vpxor ymm12, ymm8, [rcx+64]
+ vpxor ymm13, ymm9, [r8+-96]
+ vpxor ymm14, ymm5, [rcx+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9]
+ vpxor ymm11, ymm5, [r9+96]
+ vpxor ymm12, ymm6, [r9+32]
+ vpxor ymm13, ymm7, [r9+128]
+ vpxor ymm14, ymm8, [r9+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+-64]
+ vpxor ymm11, ymm8, [r8+32]
+ vpxor ymm12, ymm9, [r8+-32]
+ vpxor ymm13, ymm5, [r8+64]
+ vpxor ymm14, ymm6, [r8]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Round 18
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm10, ymm10, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm10, ymm10, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm10, ymm10, [r9]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm11, ymm11, [r9+96]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+-64]
+ vpxor ymm12, ymm7, [rcx+64]
+ vpxor ymm13, ymm8, [r9+128]
+ vpxor ymm14, ymm9, [r8]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+576]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx]
+ vpxor ymm11, ymm9, [r9+-96]
+ vpxor ymm12, ymm5, [rcx+32]
+ vpxor ymm13, ymm6, [r9+96]
+ vpxor ymm14, ymm7, [r8+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+-32]
+ vpxor ymm11, ymm7, [r8+128]
+ vpxor ymm12, ymm8, [r8+-96]
+ vpxor ymm13, ymm9, [r9+64]
+ vpxor ymm14, ymm5, [r8+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+-64]
+ vpxor ymm11, ymm5, [r8+96]
+ vpxor ymm12, ymm6, [rcx+128]
+ vpxor ymm13, ymm7, [r9+32]
+ vpxor ymm14, ymm8, [r8+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+-96]
+ vpxor ymm11, ymm8, [r9+-32]
+ vpxor ymm12, ymm9, [rcx+96]
+ vpxor ymm13, ymm5, [r9]
+ vpxor ymm14, ymm6, [r8+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Round 19
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm12, ymm2, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm14, ymm4, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm11, ymm1, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm13, ymm3, [r9+32]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+-96]
+ vpxor ymm12, ymm7, [r8+-96]
+ vpxor ymm13, ymm8, [r9+32]
+ vpxor ymm14, ymm9, [r8+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+608]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+128]
+ vpxor ymm11, ymm9, [r8+-32]
+ vpxor ymm12, ymm5, [rcx+-32]
+ vpxor ymm13, ymm6, [r8+96]
+ vpxor ymm14, ymm7, [rcx+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+-64]
+ vpxor ymm11, ymm7, [rcx+32]
+ vpxor ymm12, ymm8, [r9+64]
+ vpxor ymm13, ymm9, [r8+64]
+ vpxor ymm14, ymm5, [rcx+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8]
+ vpxor ymm11, ymm5, [rcx]
+ vpxor ymm12, ymm6, [r8+128]
+ vpxor ymm13, ymm7, [rcx+128]
+ vpxor ymm14, ymm8, [r9]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+64]
+ vpxor ymm11, ymm8, [r9+96]
+ vpxor ymm12, ymm9, [r8+-64]
+ vpxor ymm13, ymm5, [rcx+-64]
+ vpxor ymm14, ymm6, [r9+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Round 20
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm11, ymm11, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm13, ymm3, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-64]
+ vpxor ymm14, ymm14, [r9]
+ vpxor ymm13, ymm13, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm10, ymm10, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-32]
+ vpxor ymm12, ymm7, [r9+64]
+ vpxor ymm13, ymm8, [rcx+128]
+ vpxor ymm14, ymm9, [r9+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+640]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+32]
+ vpxor ymm11, ymm9, [rcx+96]
+ vpxor ymm12, ymm5, [r9+-64]
+ vpxor ymm13, ymm6, [rcx]
+ vpxor ymm14, ymm7, [r8+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+-96]
+ vpxor ymm11, ymm7, [rcx+-32]
+ vpxor ymm12, ymm8, [r8+64]
+ vpxor ymm13, ymm9, [r9]
+ vpxor ymm14, ymm5, [rcx+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+32]
+ vpxor ymm11, ymm5, [r9+128]
+ vpxor ymm12, ymm6, [rcx+32]
+ vpxor ymm13, ymm7, [r8+128]
+ vpxor ymm14, ymm8, [rcx+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+-96]
+ vpxor ymm11, ymm8, [r8+96]
+ vpxor ymm12, ymm9, [rcx+-96]
+ vpxor ymm13, ymm5, [r8]
+ vpxor ymm14, ymm6, [r9+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Round 21
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm12, ymm2, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm13, ymm13, [rcx+128]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm13, ymm13, [r8+128]
+ vpxor ymm10, ymm10, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm14, ymm14, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm10, ymm10, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+96]
+ vpxor ymm12, ymm7, [r8+64]
+ vpxor ymm13, ymm8, [r8+128]
+ vpxor ymm14, ymm9, [r9+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+672]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+128]
+ vpxor ymm11, ymm9, [r8+-64]
+ vpxor ymm12, ymm5, [r9+-96]
+ vpxor ymm13, ymm6, [r9+128]
+ vpxor ymm14, ymm7, [rcx+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+-32]
+ vpxor ymm11, ymm7, [r9+-64]
+ vpxor ymm12, ymm8, [r9]
+ vpxor ymm13, ymm9, [rcx+-64]
+ vpxor ymm14, ymm5, [r8+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+-32]
+ vpxor ymm11, ymm5, [r9+32]
+ vpxor ymm12, ymm6, [rcx+-32]
+ vpxor ymm13, ymm7, [rcx+32]
+ vpxor ymm14, ymm8, [r8]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+64]
+ vpxor ymm11, ymm8, [rcx]
+ vpxor ymm12, ymm9, [rcx+64]
+ vpxor ymm13, ymm5, [r8+32]
+ vpxor ymm14, ymm6, [r8+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Round 22
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm11, ymm1, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm14, ymm14, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm13, ymm13, [r8+128]
+ vpxor ymm12, ymm12, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm10, ymm10, [r9+-32]
+ vpxor ymm12, ymm12, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm14, ymm14, [r9+96]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-64]
+ vpxor ymm12, ymm7, [r9]
+ vpxor ymm13, ymm8, [rcx+32]
+ vpxor ymm14, ymm9, [r8+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+704]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+128]
+ vpxor ymm11, ymm9, [rcx+-96]
+ vpxor ymm12, ymm5, [r8+-32]
+ vpxor ymm13, ymm6, [r9+32]
+ vpxor ymm14, ymm7, [rcx+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+96]
+ vpxor ymm11, ymm7, [r9+-96]
+ vpxor ymm12, ymm8, [rcx+-64]
+ vpxor ymm13, ymm9, [r8]
+ vpxor ymm14, ymm5, [r9+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+96]
+ vpxor ymm11, ymm5, [rcx+128]
+ vpxor ymm12, ymm6, [r9+-64]
+ vpxor ymm13, ymm7, [rcx+-32]
+ vpxor ymm14, ymm8, [r8+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+64]
+ vpxor ymm11, ymm8, [r9+128]
+ vpxor ymm12, ymm9, [r8+-96]
+ vpxor ymm13, ymm5, [r9+-32]
+ vpxor ymm14, ymm6, [rcx]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Round 23
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm14, ymm4, [rcx+64]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm10, ymm10, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm12, ymm12, [r9]
+ vpxor ymm13, ymm13, [r9+32]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm10, ymm10, [r9+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-96]
+ vpxor ymm12, ymm7, [rcx+-64]
+ vpxor ymm13, ymm8, [rcx+-32]
+ vpxor ymm14, ymm9, [rcx]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+736]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+32]
+ vpxor ymm11, ymm9, [rcx+64]
+ vpxor ymm12, ymm5, [rcx+96]
+ vpxor ymm13, ymm6, [rcx+128]
+ vpxor ymm14, ymm7, [r8+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+-64]
+ vpxor ymm11, ymm7, [r8+-32]
+ vpxor ymm12, ymm8, [r8]
+ vpxor ymm13, ymm9, [r8+32]
+ vpxor ymm14, ymm5, [r8+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+96]
+ vpxor ymm11, ymm5, [r8+128]
+ vpxor ymm12, ymm6, [r9+-96]
+ vpxor ymm13, ymm7, [r9+-64]
+ vpxor ymm14, ymm8, [r9+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9]
+ vpxor ymm11, ymm8, [r9+32]
+ vpxor ymm12, ymm9, [r9+64]
+ vpxor ymm13, ymm5, [r9+96]
+ vpxor ymm14, ymm6, [r9+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ sub rcx, 128
+ vmovdqu YMMWORD PTR [rcx], ymm15
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+sha3_128_blocksx4_seed_avx2 ENDP
+_TEXT ENDS
+ENDIF
+IFDEF WOLFSSL_HAVE_MLKEM
+_DATA SEGMENT ; read-only constant used by sha3_256_blocksx4_seed_avx2
+ALIGN 16 ; 16-byte alignment; the visible consumer loads the table with vmovdqu
+L_sha3_256_blockx4_seed_avx2_end_mark QWORD 8000000000000000h, 8000000000000000h ; 32-byte table: four 64-bit lanes, each with only bit 63 set
+ QWORD 8000000000000000h, 8000000000000000h ; lanes 2-3; loaded whole into ymm5, stored at byte offset 512 of the buffer passed in rcx and XORed into ymm11. NOTE(review): presumably the final padding bit of the SHA3-256 rate block - confirm
+ptr_L_sha3_256_blockx4_seed_avx2_end_mark QWORD L_sha3_256_blockx4_seed_avx2_end_mark ; holds the table's address; the visible load addresses the label directly, not via this pointer
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+sha3_256_blocksx4_seed_avx2 PROC
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ mov rax, QWORD PTR [ptr_L_sha3_x4_avx2_r]
+ mov r8, rcx
+ mov r9, rcx
+ vpbroadcastq ymm15, QWORD PTR [rdx]
+ add rcx, 128
+ vpbroadcastq ymm11, QWORD PTR [rdx+8]
+ add r8, 384
+ vpbroadcastq ymm12, QWORD PTR [rdx+16]
+ add r9, 640
+ vpbroadcastq ymm13, QWORD PTR [rdx+24]
+ vmovdqu ymm5, YMMWORD PTR L_sha3_256_blockx4_seed_avx2_end_mark
+ vpxor ymm6, ymm6, ymm6
+ vmovdqu YMMWORD PTR [rcx+-96], ymm11
+ vmovdqu YMMWORD PTR [rcx+-64], ymm12
+ vmovdqu YMMWORD PTR [rcx+-32], ymm13
+ vmovdqu ymm14, YMMWORD PTR [rcx]
+ vmovdqu YMMWORD PTR [rcx+32], ymm6
+ vmovdqu YMMWORD PTR [rcx+64], ymm6
+ vmovdqu YMMWORD PTR [rcx+96], ymm6
+ vmovdqu YMMWORD PTR [rcx+128], ymm6
+ vmovdqu YMMWORD PTR [r8+-96], ymm6
+ vmovdqu YMMWORD PTR [r8+-64], ymm6
+ vmovdqu YMMWORD PTR [r8+-32], ymm6
+ vmovdqu YMMWORD PTR [r8], ymm6
+ vmovdqu YMMWORD PTR [r8+32], ymm6
+ vmovdqu YMMWORD PTR [r8+64], ymm6
+ vmovdqu YMMWORD PTR [r8+96], ymm6
+ vmovdqu YMMWORD PTR [r8+128], ymm5
+ vmovdqu YMMWORD PTR [r9+-96], ymm6
+ vmovdqu YMMWORD PTR [r9+-64], ymm6
+ vmovdqu YMMWORD PTR [r9+-32], ymm6
+ vmovdqu YMMWORD PTR [r9], ymm6
+ vmovdqu YMMWORD PTR [r9+32], ymm6
+ vmovdqu YMMWORD PTR [r9+64], ymm6
+ vmovdqu YMMWORD PTR [r9+96], ymm6
+ vmovdqu YMMWORD PTR [r9+128], ymm6
+ vmovdqu ymm10, ymm15
+ vpxor ymm11, ymm11, ymm5
+ ; Round 0
+ ; Calc b[0..4]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+64]
+ vpxor ymm12, ymm7, [r8]
+ vpxor ymm13, ymm8, [r9+-64]
+ vpxor ymm14, ymm9, [r9+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+-32]
+ vpxor ymm11, ymm9, [r8+-96]
+ vpxor ymm12, ymm5, [r8+-64]
+ vpxor ymm13, ymm6, [r8+128]
+ vpxor ymm14, ymm7, [r9+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+-96]
+ vpxor ymm11, ymm7, [rcx+96]
+ vpxor ymm12, ymm8, [r8+32]
+ vpxor ymm13, ymm9, [r9+-32]
+ vpxor ymm14, ymm5, [r9]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx]
+ vpxor ymm11, ymm5, [rcx+32]
+ vpxor ymm12, ymm6, [r8+-32]
+ vpxor ymm13, ymm7, [r9+-96]
+ vpxor ymm14, ymm8, [r9+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+-64]
+ vpxor ymm11, ymm8, [rcx+128]
+ vpxor ymm12, ymm9, [r8+64]
+ vpxor ymm13, ymm5, [r8+96]
+ vpxor ymm14, ymm6, [r9+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Round 1 - ymm15 holds the one lane kept in a register; the other 24 lanes are in memory at rcx/r8/r9 + displacement
+ ; Calc b[0..4] - b[x] in ymm10+x is the XOR of five lanes; ymm0..ymm4 still hold the lanes stored by the preceding row
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm11, ymm1, [rcx+32]
+ vpxor ymm11, ymm11, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm12, ymm2, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm13, ymm3, [r8+128]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-64]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm14, ymm4, [r9]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm14, ymm14, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4] - t[x] in ymm5+x = rotl64(b[(x+1) mod 5], 1) XOR b[(x+4) mod 5]; rotl by 1 built from vpsrlq 63, vpaddq v+v, vpor
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix - each row: XOR lanes with t[], rotate left, then out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5
+ ; Row 0 - lanes XOR t[0..4] (t[x]=ymm5+x); lane 0 (ymm15) unrotated, lanes 1..4 rotl by 44,43,21,14; result lane 0 stays in ymm15
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-96]
+ vpxor ymm12, ymm7, [r8+32]
+ vpxor ymm13, ymm8, [r9+-96]
+ vpxor ymm14, ymm9, [r9+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant - 32-byte load at rax+32, i.e. 32 * round number (rax presumably points at the round-constant table)
+ vpxor ymm15, ymm15, [rax+32]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 1 - lanes XOR t[3],t[4],t[0],t[1],t[2] (t[x]=ymm5+x); rotl by 28,20,3,45,61; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm8, [r9+-64]
+ vpxor ymm11, ymm9, [r9+64]
+ vpxor ymm12, ymm5, [rcx+-96]
+ vpxor ymm13, ymm6, [rcx+32]
+ vpxor ymm14, ymm7, [r8+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 2 - lanes XOR t[1],t[2],t[3],t[4],t[0] (t[x]=ymm5+x); rotl by 1 (left shift done with vpaddq),6,25,8,18; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm6, [rcx+64]
+ vpxor ymm11, ymm7, [r8+-64]
+ vpxor ymm12, ymm8, [r9+-32]
+ vpxor ymm13, ymm9, [r9+96]
+ vpxor ymm14, ymm5, [rcx+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 3 - lanes XOR t[4],t[0],t[1],t[2],t[3] (t[x]=ymm5+x); rotl by 27,36,10,15,56; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm9, [r9+128]
+ vpxor ymm11, ymm5, [rcx+-32]
+ vpxor ymm12, ymm6, [rcx+96]
+ vpxor ymm13, ymm7, [r8+-32]
+ vpxor ymm14, ymm8, [r8+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 4 - lanes XOR t[2],t[3],t[4],t[0],t[1] (t[x]=ymm5+x); rotl by 62,55,39,41,2; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm7, [r8]
+ vpxor ymm11, ymm8, [r8+128]
+ vpxor ymm12, ymm9, [r9]
+ vpxor ymm13, ymm5, [rcx]
+ vpxor ymm14, ymm6, [rcx+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Round 2 - ymm15 holds the one lane kept in a register; the other 24 lanes are in memory at rcx/r8/r9 + displacement
+ ; Calc b[0..4] - b[x] in ymm10+x is the XOR of five lanes; ymm0..ymm4 still hold the lanes stored by the preceding row
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+96]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm10, ymm10, [r9+128]
+ ; Calc t[0..4] - t[x] in ymm5+x = rotl64(b[(x+1) mod 5], 1) XOR b[(x+4) mod 5]; rotl by 1 built from vpsrlq 63, vpaddq v+v, vpor
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix - each row: XOR lanes with t[], rotate left, then out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5
+ ; Row 0 - lanes XOR t[0..4] (t[x]=ymm5+x); lane 0 (ymm15) unrotated, lanes 1..4 rotl by 44,43,21,14; result lane 0 stays in ymm15
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+64]
+ vpxor ymm12, ymm7, [r9+-32]
+ vpxor ymm13, ymm8, [r8+-32]
+ vpxor ymm14, ymm9, [rcx+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant - 32-byte load at rax+64, i.e. 32 * round number (rax presumably points at the round-constant table)
+ vpxor ymm15, ymm15, [rax+64]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 1 - lanes XOR t[3],t[4],t[0],t[1],t[2] (t[x]=ymm5+x); rotl by 28,20,3,45,61; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm8, [r9+-96]
+ vpxor ymm11, ymm9, [r8+64]
+ vpxor ymm12, ymm5, [rcx+64]
+ vpxor ymm13, ymm6, [rcx+-32]
+ vpxor ymm14, ymm7, [r9]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 2 - lanes XOR t[1],t[2],t[3],t[4],t[0] (t[x]=ymm5+x); rotl by 1 (left shift done with vpaddq),6,25,8,18; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm6, [r8+-96]
+ vpxor ymm11, ymm7, [rcx+-96]
+ vpxor ymm12, ymm8, [r9+96]
+ vpxor ymm13, ymm9, [r8+96]
+ vpxor ymm14, ymm5, [r8]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 3 - lanes XOR t[4],t[0],t[1],t[2],t[3] (t[x]=ymm5+x); rotl by 27,36,10,15,56; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm9, [r9+32]
+ vpxor ymm11, ymm5, [r9+-64]
+ vpxor ymm12, ymm6, [r8+-64]
+ vpxor ymm13, ymm7, [rcx+96]
+ vpxor ymm14, ymm8, [rcx]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 4 - lanes XOR t[2],t[3],t[4],t[0],t[1] (t[x]=ymm5+x); rotl by 62,55,39,41,2; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm7, [r8+32]
+ vpxor ymm11, ymm8, [rcx+32]
+ vpxor ymm12, ymm9, [rcx+-64]
+ vpxor ymm13, ymm5, [r9+128]
+ vpxor ymm14, ymm6, [r8+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Round 3 - ymm15 holds the one lane kept in a register; the other 24 lanes are in memory at rcx/r8/r9 + displacement
+ ; Calc b[0..4] - b[x] in ymm10+x is the XOR of five lanes; ymm0..ymm4 still hold the lanes stored by the preceding row
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm12, ymm2, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm10, ymm10, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm14, ymm14, [r9]
+ vpxor ymm10, ymm10, [r9+32]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm12, ymm12, [r9+96]
+ ; Calc t[0..4] - t[x] in ymm5+x = rotl64(b[(x+1) mod 5], 1) XOR b[(x+4) mod 5]; rotl by 1 built from vpsrlq 63, vpaddq v+v, vpor
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix - each row: XOR lanes with t[], rotate left, then out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5
+ ; Row 0 - lanes XOR t[0..4] (t[x]=ymm5+x); lane 0 (ymm15) unrotated, lanes 1..4 rotl by 44,43,21,14; result lane 0 stays in ymm15
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+64]
+ vpxor ymm12, ymm7, [r9+96]
+ vpxor ymm13, ymm8, [rcx+96]
+ vpxor ymm14, ymm9, [r8+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant - 32-byte load at rax+96, i.e. 32 * round number (rax presumably points at the round-constant table)
+ vpxor ymm15, ymm15, [rax+96]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 1 - lanes XOR t[3],t[4],t[0],t[1],t[2] (t[x]=ymm5+x); rotl by 28,20,3,45,61; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm8, [r8+-32]
+ vpxor ymm11, ymm9, [r9]
+ vpxor ymm12, ymm5, [r8+-96]
+ vpxor ymm13, ymm6, [r9+-64]
+ vpxor ymm14, ymm7, [rcx+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 2 - lanes XOR t[1],t[2],t[3],t[4],t[0] (t[x]=ymm5+x); rotl by 1 (left shift done with vpaddq),6,25,8,18; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm6, [r9+64]
+ vpxor ymm11, ymm7, [rcx+64]
+ vpxor ymm12, ymm8, [r8+96]
+ vpxor ymm13, ymm9, [rcx]
+ vpxor ymm14, ymm5, [r8+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 3 - lanes XOR t[4],t[0],t[1],t[2],t[3] (t[x]=ymm5+x); rotl by 27,36,10,15,56; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm9, [rcx+128]
+ vpxor ymm11, ymm5, [r9+-96]
+ vpxor ymm12, ymm6, [rcx+-96]
+ vpxor ymm13, ymm7, [r8+-64]
+ vpxor ymm14, ymm8, [r9+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 4 - lanes XOR t[2],t[3],t[4],t[0],t[1] (t[x]=ymm5+x); rotl by 62,55,39,41,2; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm7, [r9+-32]
+ vpxor ymm11, ymm8, [rcx+-32]
+ vpxor ymm12, ymm9, [r8]
+ vpxor ymm13, ymm5, [r9+32]
+ vpxor ymm14, ymm6, [rcx+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Round 4 - ymm15 holds the one lane kept in a register; the other 24 lanes are in memory at rcx/r8/r9 + displacement
+ ; Calc b[0..4] - b[x] in ymm10+x is the XOR of five lanes; ymm0..ymm4 still hold the lanes stored by the preceding row
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm11, ymm1, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-64]
+ vpxor ymm11, ymm11, [r9]
+ vpxor ymm10, ymm10, [r9+64]
+ vpxor ymm12, ymm12, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4] - t[x] in ymm5+x = rotl64(b[(x+1) mod 5], 1) XOR b[(x+4) mod 5]; rotl by 1 built from vpsrlq 63, vpaddq v+v, vpor
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix - each row: XOR lanes with t[], rotate left, then out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5
+ ; Row 0 - lanes XOR t[0..4] (t[x]=ymm5+x); lane 0 (ymm15) unrotated, lanes 1..4 rotl by 44,43,21,14; result lane 0 stays in ymm15
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9]
+ vpxor ymm12, ymm7, [r8+96]
+ vpxor ymm13, ymm8, [r8+-64]
+ vpxor ymm14, ymm9, [rcx+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant - 32-byte load at rax+128, i.e. 32 * round number (rax presumably points at the round-constant table)
+ vpxor ymm15, ymm15, [rax+128]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 1 - lanes XOR t[3],t[4],t[0],t[1],t[2] (t[x]=ymm5+x); rotl by 28,20,3,45,61; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm8, [rcx+96]
+ vpxor ymm11, ymm9, [rcx+-64]
+ vpxor ymm12, ymm5, [r9+64]
+ vpxor ymm13, ymm6, [r9+-96]
+ vpxor ymm14, ymm7, [r8]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 2 - lanes XOR t[1],t[2],t[3],t[4],t[0] (t[x]=ymm5+x); rotl by 1 (left shift done with vpaddq),6,25,8,18; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm6, [r8+64]
+ vpxor ymm11, ymm7, [r8+-96]
+ vpxor ymm12, ymm8, [rcx]
+ vpxor ymm13, ymm9, [r9+128]
+ vpxor ymm14, ymm5, [r9+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 3 - lanes XOR t[4],t[0],t[1],t[2],t[3] (t[x]=ymm5+x); rotl by 27,36,10,15,56; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm9, [r8+128]
+ vpxor ymm11, ymm5, [r8+-32]
+ vpxor ymm12, ymm6, [rcx+64]
+ vpxor ymm13, ymm7, [rcx+-96]
+ vpxor ymm14, ymm8, [r9+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 4 - lanes XOR t[2],t[3],t[4],t[0],t[1] (t[x]=ymm5+x); rotl by 62,55,39,41,2; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm7, [r9+96]
+ vpxor ymm11, ymm8, [r9+-64]
+ vpxor ymm12, ymm9, [r8+32]
+ vpxor ymm13, ymm5, [rcx+128]
+ vpxor ymm14, ymm6, [rcx+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Round 5 - ymm15 holds the one lane kept in a register; the other 24 lanes are in memory at rcx/r8/r9 + displacement
+ ; Calc b[0..4] - b[x] in ymm10+x is the XOR of five lanes; ymm0..ymm4 still hold the lanes stored by the preceding row
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm14, ymm4, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm10, ymm10, [r8+128]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm14, ymm14, [r9+-32]
+ vpxor ymm11, ymm11, [r9]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4] - t[x] in ymm5+x = rotl64(b[(x+1) mod 5], 1) XOR b[(x+4) mod 5]; rotl by 1 built from vpsrlq 63, vpaddq v+v, vpor
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix - each row: XOR lanes with t[], rotate left, then out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5
+ ; Row 0 - lanes XOR t[0..4] (t[x]=ymm5+x); lane 0 (ymm15) unrotated, lanes 1..4 rotl by 44,43,21,14; result lane 0 stays in ymm15
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-64]
+ vpxor ymm12, ymm7, [rcx]
+ vpxor ymm13, ymm8, [rcx+-96]
+ vpxor ymm14, ymm9, [rcx+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant - 32-byte load at rax+160, i.e. 32 * round number (rax presumably points at the round-constant table)
+ vpxor ymm15, ymm15, [rax+160]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 1 - lanes XOR t[3],t[4],t[0],t[1],t[2] (t[x]=ymm5+x); rotl by 28,20,3,45,61; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm8, [r8+-64]
+ vpxor ymm11, ymm9, [r8]
+ vpxor ymm12, ymm5, [r8+64]
+ vpxor ymm13, ymm6, [r8+-32]
+ vpxor ymm14, ymm7, [r8+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 2 - lanes XOR t[1],t[2],t[3],t[4],t[0] (t[x]=ymm5+x); rotl by 1 (left shift done with vpaddq),6,25,8,18; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm6, [r9]
+ vpxor ymm11, ymm7, [r9+64]
+ vpxor ymm12, ymm8, [r9+128]
+ vpxor ymm13, ymm9, [r9+32]
+ vpxor ymm14, ymm5, [r9+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 3 - lanes XOR t[4],t[0],t[1],t[2],t[3] (t[x]=ymm5+x); rotl by 27,36,10,15,56; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm9, [rcx+32]
+ vpxor ymm11, ymm5, [rcx+96]
+ vpxor ymm12, ymm6, [r8+-96]
+ vpxor ymm13, ymm7, [rcx+64]
+ vpxor ymm14, ymm8, [rcx+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 4 - lanes XOR t[2],t[3],t[4],t[0],t[1] (t[x]=ymm5+x); rotl by 62,55,39,41,2; out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5; written back in place
+ vpxor ymm10, ymm7, [r8+96]
+ vpxor ymm11, ymm8, [r9+-96]
+ vpxor ymm12, ymm9, [r9+-32]
+ vpxor ymm13, ymm5, [r8+128]
+ vpxor ymm14, ymm6, [r9+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Round 6 - ymm15 holds the one lane kept in a register; the other 24 lanes are in memory at rcx/r8/r9 + displacement
+ ; Calc b[0..4] - b[x] in ymm10+x is the XOR of five lanes; ymm0..ymm4 still hold the lanes stored by the preceding row
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm10, ymm10, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm10, ymm10, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm10, ymm10, [r9]
+ vpxor ymm13, ymm13, [r9+32]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm14, ymm14, [r9+96]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4] - t[x] in ymm5+x = rotl64(b[(x+1) mod 5], 1) XOR b[(x+4) mod 5]; rotl by 1 built from vpsrlq 63, vpaddq v+v, vpor
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix - each row: XOR lanes with t[], rotate left, then out[i] = in[i] XOR (NOT in[i+1] AND in[i+2]), indices mod 5
+ ; Row 0 - lanes XOR t[0..4] (t[x]=ymm5+x); lane 0 (ymm15) unrotated, lanes 1..4 rotl by 44,43,21,14; result lane 0 stays in ymm15
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8]
+ vpxor ymm12, ymm7, [r9+128]
+ vpxor ymm13, ymm8, [rcx+64]
+ vpxor ymm14, ymm9, [r9+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant - 32-byte load at rax+192, i.e. 32 * round number (rax presumably points at the round-constant table)
+ vpxor ymm15, ymm15, [rax+192]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+-96]
+ vpxor ymm11, ymm9, [r8+32]
+ vpxor ymm12, ymm5, [r9]
+ vpxor ymm13, ymm6, [rcx+96]
+ vpxor ymm14, ymm7, [r9+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+-64]
+ vpxor ymm11, ymm7, [r8+64]
+ vpxor ymm12, ymm8, [r9+32]
+ vpxor ymm13, ymm9, [rcx+128]
+ vpxor ymm14, ymm5, [r8+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+-32]
+ vpxor ymm11, ymm5, [r8+-64]
+ vpxor ymm12, ymm6, [r9+64]
+ vpxor ymm13, ymm7, [r8+-96]
+ vpxor ymm14, ymm8, [r8+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx]
+ vpxor ymm11, ymm8, [r8+-32]
+ vpxor ymm12, ymm9, [r9+96]
+ vpxor ymm13, ymm5, [rcx+32]
+ vpxor ymm14, ymm6, [r9+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Round 7
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm13, ymm13, [rcx+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm11, ymm1, [r8+-64]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm14, ymm4, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm14, ymm14, [r9+-32]
+ vpxor ymm12, ymm2, [r9]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+32]
+ vpxor ymm12, ymm7, [r9+32]
+ vpxor ymm13, ymm8, [r8+-96]
+ vpxor ymm14, ymm9, [r9+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+224]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+64]
+ vpxor ymm11, ymm9, [r9+-32]
+ vpxor ymm12, ymm5, [rcx+-64]
+ vpxor ymm13, ymm6, [r8+-64]
+ vpxor ymm14, ymm7, [r9+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8]
+ vpxor ymm11, ymm7, [r9]
+ vpxor ymm12, ymm8, [rcx+128]
+ vpxor ymm13, ymm9, [r8+128]
+ vpxor ymm14, ymm5, [rcx]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+-64]
+ vpxor ymm11, ymm5, [rcx+-96]
+ vpxor ymm12, ymm6, [r8+64]
+ vpxor ymm13, ymm7, [r9+64]
+ vpxor ymm14, ymm8, [rcx+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+128]
+ vpxor ymm11, ymm8, [rcx+96]
+ vpxor ymm12, ymm9, [r8+96]
+ vpxor ymm13, ymm5, [rcx+-32]
+ vpxor ymm14, ymm6, [r8+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Round 8
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm14, ymm14, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm13, ymm3, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm13, ymm13, [r8+128]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-64]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm11, ymm11, [r9]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm14, ymm14, [r9+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+-32]
+ vpxor ymm12, ymm7, [rcx+128]
+ vpxor ymm13, ymm8, [r9+64]
+ vpxor ymm14, ymm9, [r8+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+256]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+-96]
+ vpxor ymm11, ymm9, [r9+96]
+ vpxor ymm12, ymm5, [r8]
+ vpxor ymm13, ymm6, [rcx+-96]
+ vpxor ymm14, ymm7, [r8+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+32]
+ vpxor ymm11, ymm7, [rcx+-64]
+ vpxor ymm12, ymm8, [r8+128]
+ vpxor ymm13, ymm9, [rcx+32]
+ vpxor ymm14, ymm5, [r9+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+-96]
+ vpxor ymm11, ymm5, [rcx+64]
+ vpxor ymm12, ymm6, [r9]
+ vpxor ymm13, ymm7, [r8+64]
+ vpxor ymm14, ymm8, [rcx+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+32]
+ vpxor ymm11, ymm8, [r8+-64]
+ vpxor ymm12, ymm9, [rcx]
+ vpxor ymm13, ymm5, [r9+-64]
+ vpxor ymm14, ymm6, [rcx+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Round 9
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm11, ymm11, [rcx+64]
+ vpxor ymm12, ymm2, [rcx+128]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm10, ymm10, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm12, ymm12, [r9]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm11, ymm11, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+96]
+ vpxor ymm12, ymm7, [r8+128]
+ vpxor ymm13, ymm8, [r8+64]
+ vpxor ymm14, ymm9, [rcx+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+288]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+64]
+ vpxor ymm11, ymm9, [r8+96]
+ vpxor ymm12, ymm5, [r8+32]
+ vpxor ymm13, ymm6, [rcx+64]
+ vpxor ymm14, ymm7, [rcx]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+-32]
+ vpxor ymm11, ymm7, [r8]
+ vpxor ymm12, ymm8, [rcx+32]
+ vpxor ymm13, ymm9, [rcx+-32]
+ vpxor ymm14, ymm5, [r9+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+-32]
+ vpxor ymm11, ymm5, [r8+-96]
+ vpxor ymm12, ymm6, [rcx+-64]
+ vpxor ymm13, ymm7, [r9]
+ vpxor ymm14, ymm8, [r9+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+128]
+ vpxor ymm11, ymm8, [rcx+-96]
+ vpxor ymm12, ymm9, [r9+128]
+ vpxor ymm13, ymm5, [r9+-96]
+ vpxor ymm14, ymm6, [r8+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Round 10
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm12, ymm12, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm11, ymm1, [r8+-96]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm10, ymm10, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm10, ymm10, [r9+64]
+ vpxor ymm11, ymm11, [r9+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+96]
+ vpxor ymm12, ymm7, [rcx+32]
+ vpxor ymm13, ymm8, [r9]
+ vpxor ymm14, ymm9, [r8+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+320]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+64]
+ vpxor ymm11, ymm9, [rcx]
+ vpxor ymm12, ymm5, [r9+-32]
+ vpxor ymm13, ymm6, [r8+-96]
+ vpxor ymm14, ymm7, [r9+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+96]
+ vpxor ymm11, ymm7, [r8+32]
+ vpxor ymm12, ymm8, [rcx+-32]
+ vpxor ymm13, ymm9, [r9+-64]
+ vpxor ymm14, ymm5, [rcx+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+96]
+ vpxor ymm11, ymm5, [r9+64]
+ vpxor ymm12, ymm6, [r8]
+ vpxor ymm13, ymm7, [rcx+-64]
+ vpxor ymm14, ymm8, [r9+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+128]
+ vpxor ymm11, ymm8, [rcx+64]
+ vpxor ymm12, ymm9, [r9+32]
+ vpxor ymm13, ymm5, [r8+-32]
+ vpxor ymm14, ymm6, [rcx+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Round 11
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm12, ymm12, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm14, ymm4, [rcx+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm10, ymm10, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx]
+ vpxor ymm12, ymm7, [rcx+-32]
+ vpxor ymm13, ymm8, [rcx+-64]
+ vpxor ymm14, ymm9, [rcx+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+352]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9]
+ vpxor ymm11, ymm9, [r9+128]
+ vpxor ymm12, ymm5, [r9+96]
+ vpxor ymm13, ymm6, [r9+64]
+ vpxor ymm14, ymm7, [r9+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+96]
+ vpxor ymm11, ymm7, [r9+-32]
+ vpxor ymm12, ymm8, [r9+-64]
+ vpxor ymm13, ymm9, [r9+-96]
+ vpxor ymm14, ymm5, [r8+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+-64]
+ vpxor ymm11, ymm5, [r8+64]
+ vpxor ymm12, ymm6, [r8+32]
+ vpxor ymm13, ymm7, [r8]
+ vpxor ymm14, ymm8, [r8+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+32]
+ vpxor ymm11, ymm8, [r8+-96]
+ vpxor ymm12, ymm9, [rcx+128]
+ vpxor ymm13, ymm5, [rcx+96]
+ vpxor ymm14, ymm6, [rcx+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Round 12
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm10, ymm10, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm10, ymm10, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm10, ymm10, [r9]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm12, ymm12, [r9+96]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+128]
+ vpxor ymm12, ymm7, [r9+-64]
+ vpxor ymm13, ymm8, [r8]
+ vpxor ymm14, ymm9, [rcx+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+384]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+-64]
+ vpxor ymm11, ymm9, [r9+32]
+ vpxor ymm12, ymm5, [r8+96]
+ vpxor ymm13, ymm6, [r8+64]
+ vpxor ymm14, ymm7, [rcx+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx]
+ vpxor ymm11, ymm7, [r9+96]
+ vpxor ymm12, ymm8, [r9+-96]
+ vpxor ymm13, ymm9, [r8+-32]
+ vpxor ymm14, ymm5, [rcx+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+-96]
+ vpxor ymm11, ymm5, [r9]
+ vpxor ymm12, ymm6, [r9+-32]
+ vpxor ymm13, ymm7, [r8+32]
+ vpxor ymm14, ymm8, [rcx+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+-32]
+ vpxor ymm11, ymm8, [r9+64]
+ vpxor ymm12, ymm9, [r8+128]
+ vpxor ymm13, ymm5, [r8+-64]
+ vpxor ymm14, ymm6, [r8+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Round 13
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm14, ymm4, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm13, ymm3, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm12, ymm2, [r8+96]
+ vpxor ymm12, ymm12, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm11, ymm1, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm11, ymm11, [r9+96]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+32]
+ vpxor ymm12, ymm7, [r9+-96]
+ vpxor ymm13, ymm8, [r8+32]
+ vpxor ymm14, ymm9, [r8+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+416]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8]
+ vpxor ymm11, ymm9, [rcx+128]
+ vpxor ymm12, ymm5, [rcx]
+ vpxor ymm13, ymm6, [r9]
+ vpxor ymm14, ymm7, [r8+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+128]
+ vpxor ymm11, ymm7, [r8+96]
+ vpxor ymm12, ymm8, [r8+-32]
+ vpxor ymm13, ymm9, [rcx+96]
+ vpxor ymm14, ymm5, [rcx+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+64]
+ vpxor ymm11, ymm5, [rcx+-64]
+ vpxor ymm12, ymm6, [r9+96]
+ vpxor ymm13, ymm7, [r9+-32]
+ vpxor ymm14, ymm8, [r8+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+-64]
+ vpxor ymm11, ymm8, [r8+64]
+ vpxor ymm12, ymm9, [rcx+32]
+ vpxor ymm13, ymm5, [rcx+-96]
+ vpxor ymm14, ymm6, [r9+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Round 14
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm13, ymm3, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm14, ymm14, [r8+-96]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm12, ymm12, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm12, ymm12, [r9+96]
+ vpxor ymm10, ymm10, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+128]
+ vpxor ymm12, ymm7, [r8+-32]
+ vpxor ymm13, ymm8, [r9+-32]
+ vpxor ymm14, ymm9, [r9+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+448]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+32]
+ vpxor ymm11, ymm9, [r8+128]
+ vpxor ymm12, ymm5, [r9+128]
+ vpxor ymm13, ymm6, [rcx+-64]
+ vpxor ymm14, ymm7, [rcx+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+32]
+ vpxor ymm11, ymm7, [rcx]
+ vpxor ymm12, ymm8, [rcx+96]
+ vpxor ymm13, ymm9, [r8+-64]
+ vpxor ymm14, ymm5, [r9+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+-96]
+ vpxor ymm11, ymm5, [r8]
+ vpxor ymm12, ymm6, [r8+96]
+ vpxor ymm13, ymm7, [r9+96]
+ vpxor ymm14, ymm8, [rcx+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+-96]
+ vpxor ymm11, ymm8, [r9]
+ vpxor ymm12, ymm9, [rcx+-32]
+ vpxor ymm13, ymm5, [rcx+64]
+ vpxor ymm14, ymm6, [r8+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Round 15
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm14, ymm14, [rcx+32]
+ vpxor ymm12, ymm2, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm10, ymm10, [r9+32]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+128]
+ vpxor ymm12, ymm7, [rcx+96]
+ vpxor ymm13, ymm8, [r9+96]
+ vpxor ymm14, ymm9, [r8+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+480]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+-32]
+ vpxor ymm11, ymm9, [rcx+32]
+ vpxor ymm12, ymm5, [r9+32]
+ vpxor ymm13, ymm6, [r8]
+ vpxor ymm14, ymm7, [rcx+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+128]
+ vpxor ymm11, ymm7, [r9+128]
+ vpxor ymm12, ymm8, [r8+-64]
+ vpxor ymm13, ymm9, [rcx+-96]
+ vpxor ymm14, ymm5, [r9+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+64]
+ vpxor ymm11, ymm5, [r8+32]
+ vpxor ymm12, ymm6, [rcx]
+ vpxor ymm13, ymm7, [r8+96]
+ vpxor ymm14, ymm8, [rcx+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+-32]
+ vpxor ymm11, ymm8, [rcx+-64]
+ vpxor ymm12, ymm9, [r9+-64]
+ vpxor ymm13, ymm5, [r8+-96]
+ vpxor ymm14, ymm6, [r9]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Round 16
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm11, ymm1, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-32]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm10, ymm10, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+32]
+ vpxor ymm12, ymm7, [r8+-64]
+ vpxor ymm13, ymm8, [r8+96]
+ vpxor ymm14, ymm9, [r9]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+512]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+96]
+ vpxor ymm11, ymm9, [rcx+-32]
+ vpxor ymm12, ymm5, [rcx+128]
+ vpxor ymm13, ymm6, [r8+32]
+ vpxor ymm14, ymm7, [r9+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+128]
+ vpxor ymm11, ymm7, [r9+32]
+ vpxor ymm12, ymm8, [rcx+-96]
+ vpxor ymm13, ymm9, [rcx+64]
+ vpxor ymm14, ymm5, [r8+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+64]
+ vpxor ymm11, ymm5, [r9+-32]
+ vpxor ymm12, ymm6, [r9+128]
+ vpxor ymm13, ymm7, [rcx]
+ vpxor ymm14, ymm8, [r8+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+96]
+ vpxor ymm11, ymm8, [r8]
+ vpxor ymm12, ymm9, [r9+-96]
+ vpxor ymm13, ymm5, [r9+64]
+ vpxor ymm14, ymm6, [rcx+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Round 17
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm11, ymm11, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm14, ymm4, [r8+-96]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm10, ymm10, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm14, ymm14, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm10, ymm10, [r9+96]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-32]
+ vpxor ymm12, ymm7, [rcx+-96]
+ vpxor ymm13, ymm8, [rcx]
+ vpxor ymm14, ymm9, [rcx+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+544]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+96]
+ vpxor ymm11, ymm9, [r9+-64]
+ vpxor ymm12, ymm5, [r8+128]
+ vpxor ymm13, ymm6, [r9+-32]
+ vpxor ymm14, ymm7, [r9+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+32]
+ vpxor ymm11, ymm7, [rcx+128]
+ vpxor ymm12, ymm8, [rcx+64]
+ vpxor ymm13, ymm9, [r8+-96]
+ vpxor ymm14, ymm5, [rcx+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9]
+ vpxor ymm11, ymm5, [r9+96]
+ vpxor ymm12, ymm6, [r9+32]
+ vpxor ymm13, ymm7, [r9+128]
+ vpxor ymm14, ymm8, [r9+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+-64]
+ vpxor ymm11, ymm8, [r8+32]
+ vpxor ymm12, ymm9, [r8+-32]
+ vpxor ymm13, ymm5, [r8+64]
+ vpxor ymm14, ymm6, [r8]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Round 18
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm10, ymm10, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm10, ymm10, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm10, ymm10, [r9]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm11, ymm11, [r9+96]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+-64]
+ vpxor ymm12, ymm7, [rcx+64]
+ vpxor ymm13, ymm8, [r9+128]
+ vpxor ymm14, ymm9, [r8]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+576]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx]
+ vpxor ymm11, ymm9, [r9+-96]
+ vpxor ymm12, ymm5, [rcx+32]
+ vpxor ymm13, ymm6, [r9+96]
+ vpxor ymm14, ymm7, [r8+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+-32]
+ vpxor ymm11, ymm7, [r8+128]
+ vpxor ymm12, ymm8, [r8+-96]
+ vpxor ymm13, ymm9, [r9+64]
+ vpxor ymm14, ymm5, [r8+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+-64]
+ vpxor ymm11, ymm5, [r8+96]
+ vpxor ymm12, ymm6, [rcx+128]
+ vpxor ymm13, ymm7, [r9+32]
+ vpxor ymm14, ymm8, [r8+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+-96]
+ vpxor ymm11, ymm8, [r9+-32]
+ vpxor ymm12, ymm9, [rcx+96]
+ vpxor ymm13, ymm5, [r9]
+ vpxor ymm14, ymm6, [r8+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Round 19
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm12, ymm2, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm14, ymm4, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm11, ymm1, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm13, ymm3, [r9+32]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+-96]
+ vpxor ymm12, ymm7, [r8+-96]
+ vpxor ymm13, ymm8, [r9+32]
+ vpxor ymm14, ymm9, [r8+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+608]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+128]
+ vpxor ymm11, ymm9, [r8+-32]
+ vpxor ymm12, ymm5, [rcx+-32]
+ vpxor ymm13, ymm6, [r8+96]
+ vpxor ymm14, ymm7, [rcx+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+-64]
+ vpxor ymm11, ymm7, [rcx+32]
+ vpxor ymm12, ymm8, [r9+64]
+ vpxor ymm13, ymm9, [r8+64]
+ vpxor ymm14, ymm5, [rcx+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8]
+ vpxor ymm11, ymm5, [rcx]
+ vpxor ymm12, ymm6, [r8+128]
+ vpxor ymm13, ymm7, [rcx+128]
+ vpxor ymm14, ymm8, [r9]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+64]
+ vpxor ymm11, ymm8, [r9+96]
+ vpxor ymm12, ymm9, [r8+-64]
+ vpxor ymm13, ymm5, [rcx+-64]
+ vpxor ymm14, ymm6, [r9+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Round 20
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm11, ymm11, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm13, ymm3, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-64]
+ vpxor ymm14, ymm14, [r9]
+ vpxor ymm13, ymm13, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm10, ymm10, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-32]
+ vpxor ymm12, ymm7, [r9+64]
+ vpxor ymm13, ymm8, [rcx+128]
+ vpxor ymm14, ymm9, [r9+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+640]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+32]
+ vpxor ymm11, ymm9, [rcx+96]
+ vpxor ymm12, ymm5, [r9+-64]
+ vpxor ymm13, ymm6, [rcx]
+ vpxor ymm14, ymm7, [r8+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+-96]
+ vpxor ymm11, ymm7, [rcx+-32]
+ vpxor ymm12, ymm8, [r8+64]
+ vpxor ymm13, ymm9, [r9]
+ vpxor ymm14, ymm5, [rcx+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+32]
+ vpxor ymm11, ymm5, [r9+128]
+ vpxor ymm12, ymm6, [rcx+32]
+ vpxor ymm13, ymm7, [r8+128]
+ vpxor ymm14, ymm8, [rcx+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+-96]
+ vpxor ymm11, ymm8, [r8+96]
+ vpxor ymm12, ymm9, [rcx+-96]
+ vpxor ymm13, ymm5, [r8]
+ vpxor ymm14, ymm6, [r9+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Round 21
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm12, ymm2, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm13, ymm13, [rcx+128]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm13, ymm13, [r8+128]
+ vpxor ymm10, ymm10, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm14, ymm14, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm10, ymm10, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+96]
+ vpxor ymm12, ymm7, [r8+64]
+ vpxor ymm13, ymm8, [r8+128]
+ vpxor ymm14, ymm9, [r9+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+672]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+128]
+ vpxor ymm11, ymm9, [r8+-64]
+ vpxor ymm12, ymm5, [r9+-96]
+ vpxor ymm13, ymm6, [r9+128]
+ vpxor ymm14, ymm7, [rcx+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+-32]
+ vpxor ymm11, ymm7, [r9+-64]
+ vpxor ymm12, ymm8, [r9]
+ vpxor ymm13, ymm9, [rcx+-64]
+ vpxor ymm14, ymm5, [r8+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+-32]
+ vpxor ymm11, ymm5, [r9+32]
+ vpxor ymm12, ymm6, [rcx+-32]
+ vpxor ymm13, ymm7, [rcx+32]
+ vpxor ymm14, ymm8, [r8]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+64]
+ vpxor ymm11, ymm8, [rcx]
+ vpxor ymm12, ymm9, [rcx+64]
+ vpxor ymm13, ymm5, [r8+32]
+ vpxor ymm14, ymm6, [r8+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Round 22
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm11, ymm1, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm14, ymm14, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm13, ymm13, [r8+128]
+ vpxor ymm12, ymm12, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm10, ymm10, [r9+-32]
+ vpxor ymm12, ymm12, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm14, ymm14, [r9+96]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-64]
+ vpxor ymm12, ymm7, [r9]
+ vpxor ymm13, ymm8, [rcx+32]
+ vpxor ymm14, ymm9, [r8+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+704]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+128]
+ vpxor ymm11, ymm9, [rcx+-96]
+ vpxor ymm12, ymm5, [r8+-32]
+ vpxor ymm13, ymm6, [r9+32]
+ vpxor ymm14, ymm7, [rcx+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+96]
+ vpxor ymm11, ymm7, [r9+-96]
+ vpxor ymm12, ymm8, [rcx+-64]
+ vpxor ymm13, ymm9, [r8]
+ vpxor ymm14, ymm5, [r9+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+96]
+ vpxor ymm11, ymm5, [rcx+128]
+ vpxor ymm12, ymm6, [r9+-64]
+ vpxor ymm13, ymm7, [rcx+-32]
+ vpxor ymm14, ymm8, [r8+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+64]
+ vpxor ymm11, ymm8, [r9+128]
+ vpxor ymm12, ymm9, [r8+-96]
+ vpxor ymm13, ymm5, [r9+-32]
+ vpxor ymm14, ymm6, [rcx]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Round 23
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm14, ymm4, [rcx+64]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm10, ymm10, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm12, ymm12, [r9]
+ vpxor ymm13, ymm13, [r9+32]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm10, ymm10, [r9+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-96]
+ vpxor ymm12, ymm7, [rcx+-64]
+ vpxor ymm13, ymm8, [rcx+-32]
+ vpxor ymm14, ymm9, [rcx]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+736]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+32]
+ vpxor ymm11, ymm9, [rcx+64]
+ vpxor ymm12, ymm5, [rcx+96]
+ vpxor ymm13, ymm6, [rcx+128]
+ vpxor ymm14, ymm7, [r8+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+-64]
+ vpxor ymm11, ymm7, [r8+-32]
+ vpxor ymm12, ymm8, [r8]
+ vpxor ymm13, ymm9, [r8+32]
+ vpxor ymm14, ymm5, [r8+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+96]
+ vpxor ymm11, ymm5, [r8+128]
+ vpxor ymm12, ymm6, [r9+-96]
+ vpxor ymm13, ymm7, [r9+-64]
+ vpxor ymm14, ymm8, [r9+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9]
+ vpxor ymm11, ymm8, [r9+32]
+ vpxor ymm12, ymm9, [r9+64]
+ vpxor ymm13, ymm5, [r9+96]
+ vpxor ymm14, ymm6, [r9+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ sub rcx, 128
+ vmovdqu YMMWORD PTR [rcx], ymm15
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+sha3_256_blocksx4_seed_avx2 ENDP
+_TEXT ENDS
+ENDIF
+IFDEF WOLFSSL_HAVE_MLDSA
+_DATA SEGMENT ; constant data read by sha3_256_blocksx4_seed_64_avx2
+ALIGN 16 ; 16-byte alignment only; the proc reads the mark with vmovdqu (unaligned load)
+L_sha3_256_blockx4_seed_64_avx2_end_mark QWORD 8000000000000000h, 8000000000000000h ; 4 x QWORD with only the top bit set = one 32-byte YMMWORD; the proc stores it at [r8+128], i.e. state pointer + 512
+ QWORD 8000000000000000h, 8000000000000000h ; upper half of the same YMMWORD; presumably the closing padding bit of the block for each of the 4 lanes - TODO confirm
+ptr_L_sha3_256_blockx4_seed_64_avx2_end_mark QWORD L_sha3_256_blockx4_seed_64_avx2_end_mark ; holds the address of the mark; the visible part of the proc references the label directly, not this pointer
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+sha3_256_blocksx4_seed_64_avx2 PROC
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ mov rax, QWORD PTR [ptr_L_sha3_x4_avx2_r]
+ mov r8, rcx
+ mov r9, rcx
+ vpbroadcastq ymm15, QWORD PTR [rdx]
+ add rcx, 128
+ vpbroadcastq ymm11, QWORD PTR [rdx+8]
+ add r8, 384
+ vpbroadcastq ymm12, QWORD PTR [rdx+16]
+ add r9, 640
+ vpbroadcastq ymm13, QWORD PTR [rdx+24]
+ vpbroadcastq ymm14, QWORD PTR [rdx+32]
+ vpbroadcastq ymm0, QWORD PTR [rdx+40]
+ vpbroadcastq ymm1, QWORD PTR [rdx+48]
+ vpbroadcastq ymm2, QWORD PTR [rdx+56]
+ vmovdqu ymm3, YMMWORD PTR [rcx+128]
+ vmovdqu YMMWORD PTR [rcx+-96], ymm11
+ vmovdqu YMMWORD PTR [rcx+-64], ymm12
+ vmovdqu YMMWORD PTR [rcx+-32], ymm13
+ vmovdqu YMMWORD PTR [rcx], ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vpxor ymm4, ymm4, ymm4
+ vpxor ymm6, ymm6, ymm6
+ vmovdqu ymm5, YMMWORD PTR L_sha3_256_blockx4_seed_64_avx2_end_mark
+ vmovdqu YMMWORD PTR [r8+-96], ymm6
+ vmovdqu YMMWORD PTR [r8+-64], ymm6
+ vmovdqu YMMWORD PTR [r8+-32], ymm6
+ vmovdqu YMMWORD PTR [r8], ymm6
+ vmovdqu YMMWORD PTR [r8+32], ymm6
+ vmovdqu YMMWORD PTR [r8+64], ymm6
+ vmovdqu YMMWORD PTR [r8+96], ymm6
+ vmovdqu YMMWORD PTR [r8+128], ymm5
+ vmovdqu YMMWORD PTR [r9+-96], ymm6
+ vmovdqu YMMWORD PTR [r9+-64], ymm6
+ vmovdqu YMMWORD PTR [r9+-32], ymm6
+ vmovdqu YMMWORD PTR [r9], ymm6
+ vmovdqu YMMWORD PTR [r9+32], ymm6
+ vmovdqu YMMWORD PTR [r9+64], ymm6
+ vmovdqu YMMWORD PTR [r9+96], ymm6
+ vmovdqu YMMWORD PTR [r9+128], ymm6
+ vmovdqu ymm10, ymm15
+ ; Round 0
+ ; Calc b[0..4]
+ vpxor ymm10, ymm15, ymm0
+ vpxor ymm11, ymm11, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vpxor ymm14, ymm14, ymm4
+ vpxor ymm10, ymm10, [r8+-64]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm10, ymm10, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ vpxor ymm12, ymm12, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-64]
+ vpxor ymm14, ymm14, [r9+-32]
+ vpxor ymm10, ymm10, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+64]
+ vpxor ymm12, ymm7, [r8]
+ vpxor ymm13, ymm8, [r9+-64]
+ vpxor ymm14, ymm9, [r9+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+-32]
+ vpxor ymm11, ymm9, [r8+-96]
+ vpxor ymm12, ymm5, [r8+-64]
+ vpxor ymm13, ymm6, [r8+128]
+ vpxor ymm14, ymm7, [r9+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+-96]
+ vpxor ymm11, ymm7, [rcx+96]
+ vpxor ymm12, ymm8, [r8+32]
+ vpxor ymm13, ymm9, [r9+-32]
+ vpxor ymm14, ymm5, [r9]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx]
+ vpxor ymm11, ymm5, [rcx+32]
+ vpxor ymm12, ymm6, [r8+-32]
+ vpxor ymm13, ymm7, [r9+-96]
+ vpxor ymm14, ymm8, [r9+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+-64]
+ vpxor ymm11, ymm8, [rcx+128]
+ vpxor ymm12, ymm9, [r8+64]
+ vpxor ymm13, ymm5, [r8+96]
+ vpxor ymm14, ymm6, [r9+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Round 1
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm11, ymm1, [rcx+32]
+ vpxor ymm11, ymm11, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm12, ymm2, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm13, ymm3, [r8+128]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-64]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm14, ymm4, [r9]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm14, ymm14, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-96]
+ vpxor ymm12, ymm7, [r8+32]
+ vpxor ymm13, ymm8, [r9+-96]
+ vpxor ymm14, ymm9, [r9+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+32]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+-64]
+ vpxor ymm11, ymm9, [r9+64]
+ vpxor ymm12, ymm5, [rcx+-96]
+ vpxor ymm13, ymm6, [rcx+32]
+ vpxor ymm14, ymm7, [r8+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+64]
+ vpxor ymm11, ymm7, [r8+-64]
+ vpxor ymm12, ymm8, [r9+-32]
+ vpxor ymm13, ymm9, [r9+96]
+ vpxor ymm14, ymm5, [rcx+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+128]
+ vpxor ymm11, ymm5, [rcx+-32]
+ vpxor ymm12, ymm6, [rcx+96]
+ vpxor ymm13, ymm7, [r8+-32]
+ vpxor ymm14, ymm8, [r8+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8]
+ vpxor ymm11, ymm8, [r8+128]
+ vpxor ymm12, ymm9, [r9]
+ vpxor ymm13, ymm5, [rcx]
+ vpxor ymm14, ymm6, [rcx+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Round 2
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+96]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm10, ymm10, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+64]
+ vpxor ymm12, ymm7, [r9+-32]
+ vpxor ymm13, ymm8, [r8+-32]
+ vpxor ymm14, ymm9, [rcx+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+64]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+-96]
+ vpxor ymm11, ymm9, [r8+64]
+ vpxor ymm12, ymm5, [rcx+64]
+ vpxor ymm13, ymm6, [rcx+-32]
+ vpxor ymm14, ymm7, [r9]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+-96]
+ vpxor ymm11, ymm7, [rcx+-96]
+ vpxor ymm12, ymm8, [r9+96]
+ vpxor ymm13, ymm9, [r8+96]
+ vpxor ymm14, ymm5, [r8]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+32]
+ vpxor ymm11, ymm5, [r9+-64]
+ vpxor ymm12, ymm6, [r8+-64]
+ vpxor ymm13, ymm7, [rcx+96]
+ vpxor ymm14, ymm8, [rcx]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+32]
+ vpxor ymm11, ymm8, [rcx+32]
+ vpxor ymm12, ymm9, [rcx+-64]
+ vpxor ymm13, ymm5, [r9+128]
+ vpxor ymm14, ymm6, [r8+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Round 3
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm12, ymm2, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm10, ymm10, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm14, ymm14, [r9]
+ vpxor ymm10, ymm10, [r9+32]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm12, ymm12, [r9+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+64]
+ vpxor ymm12, ymm7, [r9+96]
+ vpxor ymm13, ymm8, [rcx+96]
+ vpxor ymm14, ymm9, [r8+128]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+96]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+-32]
+ vpxor ymm11, ymm9, [r9]
+ vpxor ymm12, ymm5, [r8+-96]
+ vpxor ymm13, ymm6, [r9+-64]
+ vpxor ymm14, ymm7, [rcx+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+64]
+ vpxor ymm11, ymm7, [rcx+64]
+ vpxor ymm12, ymm8, [r8+96]
+ vpxor ymm13, ymm9, [rcx]
+ vpxor ymm14, ymm5, [r8+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+128]
+ vpxor ymm11, ymm5, [r9+-96]
+ vpxor ymm12, ymm6, [rcx+-96]
+ vpxor ymm13, ymm7, [r8+-64]
+ vpxor ymm14, ymm8, [r9+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+-32]
+ vpxor ymm11, ymm8, [rcx+-32]
+ vpxor ymm12, ymm9, [r8]
+ vpxor ymm13, ymm5, [r9+32]
+ vpxor ymm14, ymm6, [rcx+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Round 4
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm11, ymm1, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-64]
+ vpxor ymm11, ymm11, [r9]
+ vpxor ymm10, ymm10, [r9+64]
+ vpxor ymm12, ymm12, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9]
+ vpxor ymm12, ymm7, [r8+96]
+ vpxor ymm13, ymm8, [r8+-64]
+ vpxor ymm14, ymm9, [rcx+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+128]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+96]
+ vpxor ymm11, ymm9, [rcx+-64]
+ vpxor ymm12, ymm5, [r9+64]
+ vpxor ymm13, ymm6, [r9+-96]
+ vpxor ymm14, ymm7, [r8]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+64]
+ vpxor ymm11, ymm7, [r8+-96]
+ vpxor ymm12, ymm8, [rcx]
+ vpxor ymm13, ymm9, [r9+128]
+ vpxor ymm14, ymm5, [r9+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+128]
+ vpxor ymm11, ymm5, [r8+-32]
+ vpxor ymm12, ymm6, [rcx+64]
+ vpxor ymm13, ymm7, [rcx+-96]
+ vpxor ymm14, ymm8, [r9+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+96]
+ vpxor ymm11, ymm8, [r9+-64]
+ vpxor ymm12, ymm9, [r8+32]
+ vpxor ymm13, ymm5, [rcx+128]
+ vpxor ymm14, ymm6, [rcx+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Round 5
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm14, ymm4, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm11, ymm11, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm10, ymm10, [r8+128]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm14, ymm14, [r9+-32]
+ vpxor ymm11, ymm11, [r9]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-64]
+ vpxor ymm12, ymm7, [rcx]
+ vpxor ymm13, ymm8, [rcx+-96]
+ vpxor ymm14, ymm9, [rcx+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+160]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+-64]
+ vpxor ymm11, ymm9, [r8]
+ vpxor ymm12, ymm5, [r8+64]
+ vpxor ymm13, ymm6, [r8+-32]
+ vpxor ymm14, ymm7, [r8+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9]
+ vpxor ymm11, ymm7, [r9+64]
+ vpxor ymm12, ymm8, [r9+128]
+ vpxor ymm13, ymm9, [r9+32]
+ vpxor ymm14, ymm5, [r9+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+32]
+ vpxor ymm11, ymm5, [rcx+96]
+ vpxor ymm12, ymm6, [r8+-96]
+ vpxor ymm13, ymm7, [rcx+64]
+ vpxor ymm14, ymm8, [rcx+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+96]
+ vpxor ymm11, ymm8, [r9+-96]
+ vpxor ymm12, ymm9, [r9+-32]
+ vpxor ymm13, ymm5, [r8+128]
+ vpxor ymm14, ymm6, [r9+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Round 6
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm10, ymm10, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm10, ymm10, [r8+-64]
+ vpxor ymm13, ymm13, [r8+-32]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm10, ymm10, [r9]
+ vpxor ymm13, ymm13, [r9+32]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm14, ymm14, [r9+96]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8]
+ vpxor ymm12, ymm7, [r9+128]
+ vpxor ymm13, ymm8, [rcx+64]
+ vpxor ymm14, ymm9, [r9+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+192]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+-96]
+ vpxor ymm11, ymm9, [r8+32]
+ vpxor ymm12, ymm5, [r9]
+ vpxor ymm13, ymm6, [rcx+96]
+ vpxor ymm14, ymm7, [r9+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+-64]
+ vpxor ymm11, ymm7, [r8+64]
+ vpxor ymm12, ymm8, [r9+32]
+ vpxor ymm13, ymm9, [rcx+128]
+ vpxor ymm14, ymm5, [r8+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+-32]
+ vpxor ymm11, ymm5, [r8+-64]
+ vpxor ymm12, ymm6, [r9+64]
+ vpxor ymm13, ymm7, [r8+-96]
+ vpxor ymm14, ymm8, [r8+128]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx]
+ vpxor ymm11, ymm8, [r8+-32]
+ vpxor ymm12, ymm9, [r9+96]
+ vpxor ymm13, ymm5, [rcx+32]
+ vpxor ymm14, ymm6, [r9+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Round 7
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx+64]
+ vpxor ymm13, ymm13, [rcx+96]
+ vpxor ymm13, ymm13, [rcx+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm11, ymm1, [r8+-64]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm14, ymm4, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm14, ymm14, [r9+-32]
+ vpxor ymm12, ymm2, [r9]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+32]
+ vpxor ymm12, ymm7, [r9+32]
+ vpxor ymm13, ymm8, [r8+-96]
+ vpxor ymm14, ymm9, [r9+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+224]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+64]
+ vpxor ymm11, ymm9, [r9+-32]
+ vpxor ymm12, ymm5, [rcx+-64]
+ vpxor ymm13, ymm6, [r8+-64]
+ vpxor ymm14, ymm7, [r9+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8]
+ vpxor ymm11, ymm7, [r9]
+ vpxor ymm12, ymm8, [rcx+128]
+ vpxor ymm13, ymm9, [r8+128]
+ vpxor ymm14, ymm5, [rcx]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+-64]
+ vpxor ymm11, ymm5, [rcx+-96]
+ vpxor ymm12, ymm6, [r8+64]
+ vpxor ymm13, ymm7, [r9+64]
+ vpxor ymm14, ymm8, [rcx+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+128]
+ vpxor ymm11, ymm8, [rcx+96]
+ vpxor ymm12, ymm9, [r8+96]
+ vpxor ymm13, ymm5, [rcx+-32]
+ vpxor ymm14, ymm6, [r8+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Round 8
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm14, ymm14, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm13, ymm3, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm13, ymm13, [r8+128]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-64]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm11, ymm11, [r9]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm14, ymm14, [r9+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+-32]
+ vpxor ymm12, ymm7, [rcx+128]
+ vpxor ymm13, ymm8, [r9+64]
+ vpxor ymm14, ymm9, [r8+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+256]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+-96]
+ vpxor ymm11, ymm9, [r9+96]
+ vpxor ymm12, ymm5, [r8]
+ vpxor ymm13, ymm6, [rcx+-96]
+ vpxor ymm14, ymm7, [r8+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+32]
+ vpxor ymm11, ymm7, [rcx+-64]
+ vpxor ymm12, ymm8, [r8+128]
+ vpxor ymm13, ymm9, [rcx+32]
+ vpxor ymm14, ymm5, [r9+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+-96]
+ vpxor ymm11, ymm5, [rcx+64]
+ vpxor ymm12, ymm6, [r9]
+ vpxor ymm13, ymm7, [r8+64]
+ vpxor ymm14, ymm8, [rcx+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+32]
+ vpxor ymm11, ymm8, [r8+-64]
+ vpxor ymm12, ymm9, [rcx]
+ vpxor ymm13, ymm5, [r9+-64]
+ vpxor ymm14, ymm6, [rcx+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Round 9
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm11, ymm11, [rcx+64]
+ vpxor ymm12, ymm2, [rcx+128]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm10, ymm10, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm12, ymm12, [r9]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm11, ymm11, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+96]
+ vpxor ymm12, ymm7, [r8+128]
+ vpxor ymm13, ymm8, [r8+64]
+ vpxor ymm14, ymm9, [rcx+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+288]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+64]
+ vpxor ymm11, ymm9, [r8+96]
+ vpxor ymm12, ymm5, [r8+32]
+ vpxor ymm13, ymm6, [rcx+64]
+ vpxor ymm14, ymm7, [rcx]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+-32]
+ vpxor ymm11, ymm7, [r8]
+ vpxor ymm12, ymm8, [rcx+32]
+ vpxor ymm13, ymm9, [rcx+-32]
+ vpxor ymm14, ymm5, [r9+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+-32]
+ vpxor ymm11, ymm5, [r8+-96]
+ vpxor ymm12, ymm6, [rcx+-64]
+ vpxor ymm13, ymm7, [r9]
+ vpxor ymm14, ymm8, [r9+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+128]
+ vpxor ymm11, ymm8, [rcx+-96]
+ vpxor ymm12, ymm9, [r9+128]
+ vpxor ymm13, ymm5, [r9+-96]
+ vpxor ymm14, ymm6, [r8+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Round 10
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm14, ymm4, [rcx]
+ vpxor ymm12, ymm12, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm11, ymm1, [r8+-96]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm10, ymm10, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm10, ymm10, [r9+64]
+ vpxor ymm11, ymm11, [r9+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+96]
+ vpxor ymm12, ymm7, [rcx+32]
+ vpxor ymm13, ymm8, [r9]
+ vpxor ymm14, ymm9, [r8+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+320]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+64]
+ vpxor ymm11, ymm9, [rcx]
+ vpxor ymm12, ymm5, [r9+-32]
+ vpxor ymm13, ymm6, [r8+-96]
+ vpxor ymm14, ymm7, [r9+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+96]
+ vpxor ymm11, ymm7, [r8+32]
+ vpxor ymm12, ymm8, [rcx+-32]
+ vpxor ymm13, ymm9, [r9+-64]
+ vpxor ymm14, ymm5, [rcx+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+96]
+ vpxor ymm11, ymm5, [r9+64]
+ vpxor ymm12, ymm6, [r8]
+ vpxor ymm13, ymm7, [rcx+-64]
+ vpxor ymm14, ymm8, [r9+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+128]
+ vpxor ymm11, ymm8, [rcx+64]
+ vpxor ymm12, ymm9, [r9+32]
+ vpxor ymm13, ymm5, [r8+-32]
+ vpxor ymm14, ymm6, [rcx+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Round 11
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm12, ymm12, [rcx+32]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm14, ymm4, [rcx+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm12, ymm12, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm11, ymm11, [r9+64]
+ vpxor ymm10, ymm10, [r9+96]
+ vpxor ymm14, ymm14, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx]
+ vpxor ymm12, ymm7, [rcx+-32]
+ vpxor ymm13, ymm8, [rcx+-64]
+ vpxor ymm14, ymm9, [rcx+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+352]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9]
+ vpxor ymm11, ymm9, [r9+128]
+ vpxor ymm12, ymm5, [r9+96]
+ vpxor ymm13, ymm6, [r9+64]
+ vpxor ymm14, ymm7, [r9+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [r9+32], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+96]
+ vpxor ymm11, ymm7, [r9+-32]
+ vpxor ymm12, ymm8, [r9+-64]
+ vpxor ymm13, ymm9, [r9+-96]
+ vpxor ymm14, ymm5, [r8+128]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [r9+-96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+-64]
+ vpxor ymm11, ymm5, [r8+64]
+ vpxor ymm12, ymm6, [r8+32]
+ vpxor ymm13, ymm7, [r8]
+ vpxor ymm14, ymm8, [r8+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [r8+32], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+32]
+ vpxor ymm11, ymm8, [r8+-96]
+ vpxor ymm12, ymm9, [rcx+128]
+ vpxor ymm13, ymm5, [rcx+96]
+ vpxor ymm14, ymm6, [rcx+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [r8+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Round 12
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm10, ymm10, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm12, ymm12, [r8+32]
+ vpxor ymm11, ymm11, [r8+64]
+ vpxor ymm10, ymm10, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm13, ymm13, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm10, ymm10, [r9]
+ vpxor ymm14, ymm14, [r9+32]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm12, ymm12, [r9+96]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+128]
+ vpxor ymm12, ymm7, [r9+-64]
+ vpxor ymm13, ymm8, [r8]
+ vpxor ymm14, ymm9, [rcx+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+384]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+-64]
+ vpxor ymm11, ymm9, [r9+32]
+ vpxor ymm12, ymm5, [r8+96]
+ vpxor ymm13, ymm6, [r8+64]
+ vpxor ymm14, ymm7, [rcx+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx]
+ vpxor ymm11, ymm7, [r9+96]
+ vpxor ymm12, ymm8, [r9+-96]
+ vpxor ymm13, ymm9, [r8+-32]
+ vpxor ymm14, ymm5, [rcx+32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+-96]
+ vpxor ymm11, ymm5, [r9]
+ vpxor ymm12, ymm6, [r9+-32]
+ vpxor ymm13, ymm7, [r8+32]
+ vpxor ymm14, ymm8, [rcx+96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [r9+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+-32]
+ vpxor ymm11, ymm8, [r9+64]
+ vpxor ymm12, ymm9, [r8+128]
+ vpxor ymm13, ymm5, [r8+-64]
+ vpxor ymm14, ymm6, [r8+-96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r9+64], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Round 13
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-96]
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm14, ymm4, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm14, ymm14, [rcx+128]
+ vpxor ymm13, ymm3, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm12, ymm2, [r8+96]
+ vpxor ymm12, ymm12, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm12, ymm12, [r9+-32]
+ vpxor ymm11, ymm1, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm11, ymm11, [r9+96]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+32]
+ vpxor ymm12, ymm7, [r9+-96]
+ vpxor ymm13, ymm8, [r8+32]
+ vpxor ymm14, ymm9, [r8+-96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+416]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8]
+ vpxor ymm11, ymm9, [rcx+128]
+ vpxor ymm12, ymm5, [rcx]
+ vpxor ymm13, ymm6, [r9]
+ vpxor ymm14, ymm7, [r8+128]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+128]
+ vpxor ymm11, ymm7, [r8+96]
+ vpxor ymm12, ymm8, [r8+-32]
+ vpxor ymm13, ymm9, [rcx+96]
+ vpxor ymm14, ymm5, [rcx+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [rcx+64]
+ vpxor ymm11, ymm5, [rcx+-64]
+ vpxor ymm12, ymm6, [r9+96]
+ vpxor ymm13, ymm7, [r9+-32]
+ vpxor ymm14, ymm8, [r8+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+96], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+-64]
+ vpxor ymm11, ymm8, [r8+64]
+ vpxor ymm12, ymm9, [rcx+32]
+ vpxor ymm13, ymm5, [rcx+-96]
+ vpxor ymm14, ymm6, [r9+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Round 14 (one state lane arrives in ymm15 and five more in ymm0..ymm4 from the previous round)
+ ; Calc b[0..4]: b[i] (ymm10..ymm14) = XOR of five state lanes; lanes not in registers are read via rcx/r8/r9
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-64]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm10, ymm10, [rcx+64]
+ vpxor ymm13, ymm3, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm14, ymm14, [r8+-96]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm11, ymm11, [r8+96]
+ vpxor ymm14, ymm14, [r8+128]
+ vpxor ymm12, ymm12, [r9+-96]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm12, ymm12, [r9+96]
+ vpxor ymm10, ymm10, [r9+128]
+ ; Calc t[0..4]: t[i] (ymm5..ymm9) = rotl64(b[i+1], 1) ^ b[i-1], indices mod 5
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix: each row XORs t[] into five lanes, rotates them left, then out[i] = s[i] ^ (~s[i+1] & s[i+2])
+ ; Row 0: first lane (ymm15) is not rotated; the others rotate left by 44, 43, 21, 14
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+128]
+ vpxor ymm12, ymm7, [r8+-32]
+ vpxor ymm13, ymm8, [r9+-32]
+ vpxor ymm14, ymm9, [r9+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant: ymm15 ^= [rax+448] (offset 14*32; rax is set outside this view, presumably the round-constant table)
+ vpxor ymm15, ymm15, [rax+448]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 1: rotate left by 28, 20, 3, 45, 61
+ vpxor ymm10, ymm8, [r8+32]
+ vpxor ymm11, ymm9, [r8+128]
+ vpxor ymm12, ymm5, [r9+128]
+ vpxor ymm13, ymm6, [rcx+-64]
+ vpxor ymm14, ymm7, [rcx+32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ ; Row 2: rotate left by 1 (vpaddq doubles the lane), 6, 25, 8, 18
+ vpxor ymm10, ymm6, [r9+32]
+ vpxor ymm11, ymm7, [rcx]
+ vpxor ymm12, ymm8, [rcx+96]
+ vpxor ymm13, ymm9, [r8+-64]
+ vpxor ymm14, ymm5, [r9+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r8+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 3: rotate left by 27, 36, 10, 15, 56
+ vpxor ymm10, ymm9, [r8+-96]
+ vpxor ymm11, ymm5, [r8]
+ vpxor ymm12, ymm6, [r8+96]
+ vpxor ymm13, ymm7, [r9+96]
+ vpxor ymm14, ymm8, [rcx+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r8+96], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 4: rotate left by 62, 55, 39, 41, 2; results are stored and also stay in ymm0..ymm4
+ vpxor ymm10, ymm7, [r9+-96]
+ vpxor ymm11, ymm8, [r9]
+ vpxor ymm12, ymm9, [rcx+-32]
+ vpxor ymm13, ymm5, [rcx+64]
+ vpxor ymm14, ymm6, [r8+64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [r9], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Round 15 (one state lane arrives in ymm15 and five more in ymm0..ymm4 from the previous round)
+ ; Calc b[0..4]: b[i] (ymm10..ymm14) = XOR of five state lanes; lanes not in registers are read via rcx/r8/r9
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm14, ymm14, [rcx+32]
+ vpxor ymm12, ymm2, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm10, ymm10, [r8+-96]
+ vpxor ymm13, ymm13, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm11, ymm11, [r8]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm12, ymm12, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm10, ymm10, [r9+32]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4]: t[i] (ymm5..ymm9) = rotl64(b[i+1], 1) ^ b[i-1], indices mod 5
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix: each row XORs t[] into five lanes, rotates them left, then out[i] = s[i] ^ (~s[i+1] & s[i+2])
+ ; Row 0: first lane (ymm15) is not rotated; the others rotate left by 44, 43, 21, 14
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+128]
+ vpxor ymm12, ymm7, [rcx+96]
+ vpxor ymm13, ymm8, [r9+96]
+ vpxor ymm14, ymm9, [r8+64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant: ymm15 ^= [rax+480] (offset 15*32; rax is set outside this view, presumably the round-constant table)
+ vpxor ymm15, ymm15, [rax+480]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 1: rotate left by 28, 20, 3, 45, 61
+ vpxor ymm10, ymm8, [r9+-32]
+ vpxor ymm11, ymm9, [rcx+32]
+ vpxor ymm12, ymm5, [r9+32]
+ vpxor ymm13, ymm6, [r8]
+ vpxor ymm14, ymm7, [rcx+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4
+ ; Row 2: rotate left by 1 (vpaddq doubles the lane), 6, 25, 8, 18
+ vpxor ymm10, ymm6, [rcx+128]
+ vpxor ymm11, ymm7, [r9+128]
+ vpxor ymm12, ymm8, [r8+-64]
+ vpxor ymm13, ymm9, [rcx+-96]
+ vpxor ymm14, ymm5, [r9+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-96], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 3: rotate left by 27, 36, 10, 15, 56
+ vpxor ymm10, ymm9, [r9+64]
+ vpxor ymm11, ymm5, [r8+32]
+ vpxor ymm12, ymm6, [rcx]
+ vpxor ymm13, ymm7, [r8+96]
+ vpxor ymm14, ymm8, [rcx+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 4: rotate left by 62, 55, 39, 41, 2; results are stored and also stay in ymm0..ymm4
+ vpxor ymm10, ymm7, [r8+-32]
+ vpxor ymm11, ymm8, [rcx+-64]
+ vpxor ymm12, ymm9, [r9+-64]
+ vpxor ymm13, ymm5, [r8+-96]
+ vpxor ymm14, ymm6, [r9]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [rcx+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Round 16 (one state lane arrives in ymm15 and five more in ymm0..ymm4 from the previous round)
+ ; Calc b[0..4]: b[i] (ymm10..ymm14) = XOR of five state lanes; lanes not in registers are read via rcx/r8/r9
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm13, ymm3, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-32]
+ vpxor ymm12, ymm2, [rcx]
+ vpxor ymm11, ymm1, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm11, ymm11, [r8+32]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-32]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm10, ymm10, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]: t[i] (ymm5..ymm9) = rotl64(b[i+1], 1) ^ b[i-1], indices mod 5
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix: each row XORs t[] into five lanes, rotates them left, then out[i] = s[i] ^ (~s[i+1] & s[i+2])
+ ; Row 0: first lane (ymm15) is not rotated; the others rotate left by 44, 43, 21, 14
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+32]
+ vpxor ymm12, ymm7, [r8+-64]
+ vpxor ymm13, ymm8, [r8+96]
+ vpxor ymm14, ymm9, [r9]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant: ymm15 ^= [rax+512] (offset 16*32; rax is set outside this view, presumably the round-constant table)
+ vpxor ymm15, ymm15, [rax+512]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 1: rotate left by 28, 20, 3, 45, 61
+ vpxor ymm10, ymm8, [r9+96]
+ vpxor ymm11, ymm9, [rcx+-32]
+ vpxor ymm12, ymm5, [rcx+128]
+ vpxor ymm13, ymm6, [r8+32]
+ vpxor ymm14, ymm7, [r9+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r9+-64], ymm4
+ ; Row 2: rotate left by 1 (vpaddq doubles the lane), 6, 25, 8, 18
+ vpxor ymm10, ymm6, [r8+128]
+ vpxor ymm11, ymm7, [r9+32]
+ vpxor ymm12, ymm8, [rcx+-96]
+ vpxor ymm13, ymm9, [rcx+64]
+ vpxor ymm14, ymm5, [r8+-32]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 3: rotate left by 27, 36, 10, 15, 56
+ vpxor ymm10, ymm9, [r8+64]
+ vpxor ymm11, ymm5, [r9+-32]
+ vpxor ymm12, ymm6, [r9+128]
+ vpxor ymm13, ymm7, [rcx]
+ vpxor ymm14, ymm8, [r8+-96]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+128], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 4: rotate left by 62, 55, 39, 41, 2; results are stored and also stay in ymm0..ymm4
+ vpxor ymm10, ymm7, [rcx+96]
+ vpxor ymm11, ymm8, [r8]
+ vpxor ymm12, ymm9, [r9+-96]
+ vpxor ymm13, ymm5, [r9+64]
+ vpxor ymm14, ymm6, [rcx+-64]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [r8], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Round 17 (one state lane arrives in ymm15 and five more in ymm0..ymm4 from the previous round)
+ ; Calc b[0..4]: b[i] (ymm10..ymm14) = XOR of five state lanes; lanes not in registers are read via rcx/r8/r9
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm11, ymm11, [rcx+32]
+ vpxor ymm13, ymm13, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm14, ymm4, [r8+-96]
+ vpxor ymm12, ymm12, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm13, ymm13, [r8+32]
+ vpxor ymm10, ymm10, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm10, ymm10, [r8+128]
+ vpxor ymm14, ymm14, [r9+-64]
+ vpxor ymm11, ymm11, [r9+-32]
+ vpxor ymm14, ymm14, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm10, ymm10, [r9+96]
+ vpxor ymm12, ymm12, [r9+128]
+ ; Calc t[0..4]: t[i] (ymm5..ymm9) = rotl64(b[i+1], 1) ^ b[i-1], indices mod 5
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix: each row XORs t[] into five lanes, rotates them left, then out[i] = s[i] ^ (~s[i+1] & s[i+2])
+ ; Row 0: first lane (ymm15) is not rotated; the others rotate left by 44, 43, 21, 14
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-32]
+ vpxor ymm12, ymm7, [rcx+-96]
+ vpxor ymm13, ymm8, [rcx]
+ vpxor ymm14, ymm9, [rcx+-64]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant: ymm15 ^= [rax+544] (offset 17*32; rax is set outside this view, presumably the round-constant table)
+ vpxor ymm15, ymm15, [rax+544]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 1: rotate left by 28, 20, 3, 45, 61
+ vpxor ymm10, ymm8, [r8+96]
+ vpxor ymm11, ymm9, [r9+-64]
+ vpxor ymm12, ymm5, [r8+128]
+ vpxor ymm13, ymm6, [r9+-32]
+ vpxor ymm14, ymm7, [r9+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [r9+-96], ymm4
+ ; Row 2: rotate left by 1 (vpaddq doubles the lane), 6, 25, 8, 18
+ vpxor ymm10, ymm6, [rcx+32]
+ vpxor ymm11, ymm7, [rcx+128]
+ vpxor ymm12, ymm8, [rcx+64]
+ vpxor ymm13, ymm9, [r8+-96]
+ vpxor ymm14, ymm5, [rcx+96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [r8+-96], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 3: rotate left by 27, 36, 10, 15, 56
+ vpxor ymm10, ymm9, [r9]
+ vpxor ymm11, ymm5, [r9+96]
+ vpxor ymm12, ymm6, [r9+32]
+ vpxor ymm13, ymm7, [r9+128]
+ vpxor ymm14, ymm8, [r9+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r9+32], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 4: rotate left by 62, 55, 39, 41, 2; results are stored and also stay in ymm0..ymm4
+ vpxor ymm10, ymm7, [r8+-64]
+ vpxor ymm11, ymm8, [r8+32]
+ vpxor ymm12, ymm9, [r8+-32]
+ vpxor ymm13, ymm5, [r8+64]
+ vpxor ymm14, ymm6, [r8]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Round 18 (one state lane arrives in ymm15 and five more in ymm0..ymm4 from the previous round)
+ ; Calc b[0..4]: b[i] (ymm10..ymm14) = XOR of five state lanes; lanes not in registers are read via rcx/r8/r9
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm12, ymm2, [rcx+-96]
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm10, ymm10, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm13, ymm13, [r8+-96]
+ vpxor ymm10, ymm10, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm14, ymm14, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm13, ymm13, [r9+-32]
+ vpxor ymm10, ymm10, [r9]
+ vpxor ymm12, ymm12, [r9+32]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm11, ymm11, [r9+96]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4]: t[i] (ymm5..ymm9) = rotl64(b[i+1], 1) ^ b[i-1], indices mod 5
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix: each row XORs t[] into five lanes, rotates them left, then out[i] = s[i] ^ (~s[i+1] & s[i+2])
+ ; Row 0: first lane (ymm15) is not rotated; the others rotate left by 44, 43, 21, 14
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+-64]
+ vpxor ymm12, ymm7, [rcx+64]
+ vpxor ymm13, ymm8, [r9+128]
+ vpxor ymm14, ymm9, [r8]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant: ymm15 ^= [rax+576] (offset 18*32; rax is set outside this view, presumably the round-constant table)
+ vpxor ymm15, ymm15, [rax+576]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 1: rotate left by 28, 20, 3, 45, 61
+ vpxor ymm10, ymm8, [rcx]
+ vpxor ymm11, ymm9, [r9+-96]
+ vpxor ymm12, ymm5, [rcx+32]
+ vpxor ymm13, ymm6, [r9+96]
+ vpxor ymm14, ymm7, [r8+-32]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [r8+-32], ymm4
+ ; Row 2: rotate left by 1 (vpaddq doubles the lane), 6, 25, 8, 18
+ vpxor ymm10, ymm6, [rcx+-32]
+ vpxor ymm11, ymm7, [r8+128]
+ vpxor ymm12, ymm8, [r8+-96]
+ vpxor ymm13, ymm9, [r9+64]
+ vpxor ymm14, ymm5, [r8+-64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-32], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+64], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 3: rotate left by 27, 36, 10, 15, 56
+ vpxor ymm10, ymm9, [rcx+-64]
+ vpxor ymm11, ymm5, [r8+96]
+ vpxor ymm12, ymm6, [rcx+128]
+ vpxor ymm13, ymm7, [r9+32]
+ vpxor ymm14, ymm8, [r8+64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 4: rotate left by 62, 55, 39, 41, 2; results are stored and also stay in ymm0..ymm4
+ vpxor ymm10, ymm7, [rcx+-96]
+ vpxor ymm11, ymm8, [r9+-32]
+ vpxor ymm12, ymm9, [rcx+96]
+ vpxor ymm13, ymm5, [r9]
+ vpxor ymm14, ymm6, [r8+32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm0
+ vmovdqu YMMWORD PTR [r9+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Round 19
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm10, ymm10, [rcx+-64]
+ vpxor ymm10, ymm10, [rcx+-32]
+ vpxor ymm10, ymm10, [rcx]
+ vpxor ymm12, ymm2, [rcx+32]
+ vpxor ymm12, ymm12, [rcx+64]
+ vpxor ymm12, ymm12, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm14, ymm4, [r8+-64]
+ vpxor ymm14, ymm14, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm14, ymm14, [r8+64]
+ vpxor ymm11, ymm1, [r8+96]
+ vpxor ymm11, ymm11, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm13, ymm3, [r9+32]
+ vpxor ymm13, ymm13, [r9+64]
+ vpxor ymm13, ymm13, [r9+96]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r9+-96]
+ vpxor ymm12, ymm7, [r8+-96]
+ vpxor ymm13, ymm8, [r9+32]
+ vpxor ymm14, ymm9, [r8+32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+608]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+128]
+ vpxor ymm11, ymm9, [r8+-32]
+ vpxor ymm12, ymm5, [rcx+-32]
+ vpxor ymm13, ymm6, [r8+96]
+ vpxor ymm14, ymm7, [rcx+96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+128], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+-64]
+ vpxor ymm11, ymm7, [rcx+32]
+ vpxor ymm12, ymm8, [r9+64]
+ vpxor ymm13, ymm9, [r8+64]
+ vpxor ymm14, ymm5, [rcx+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-64], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8]
+ vpxor ymm11, ymm5, [rcx]
+ vpxor ymm12, ymm6, [r8+128]
+ vpxor ymm13, ymm7, [rcx+128]
+ vpxor ymm14, ymm8, [r9]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [r8+128], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r9], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [rcx+64]
+ vpxor ymm11, ymm8, [r9+96]
+ vpxor ymm12, ymm9, [r8+-64]
+ vpxor ymm13, ymm5, [rcx+-64]
+ vpxor ymm14, ymm6, [r9+-32]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [r9+96], ymm1
+ vmovdqu YMMWORD PTR [r8+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Round 20
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm11, ymm1, [rcx]
+ vpxor ymm11, ymm11, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+96]
+ vpxor ymm13, ymm3, [rcx+128]
+ vpxor ymm12, ymm12, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm10, ymm10, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm13, ymm13, [r8+64]
+ vpxor ymm13, ymm13, [r8+96]
+ vpxor ymm12, ymm12, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm10, ymm10, [r9+-64]
+ vpxor ymm14, ymm14, [r9]
+ vpxor ymm13, ymm13, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm10, ymm10, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-32]
+ vpxor ymm12, ymm7, [r9+64]
+ vpxor ymm13, ymm8, [rcx+128]
+ vpxor ymm14, ymm9, [r9+-32]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+640]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r9+32]
+ vpxor ymm11, ymm9, [rcx+96]
+ vpxor ymm12, ymm5, [r9+-64]
+ vpxor ymm13, ymm6, [rcx]
+ vpxor ymm14, ymm7, [r8+-64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx], ymm3
+ vmovdqu YMMWORD PTR [r8+-64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r9+-96]
+ vpxor ymm11, ymm7, [rcx+-32]
+ vpxor ymm12, ymm8, [r8+64]
+ vpxor ymm13, ymm9, [r9]
+ vpxor ymm14, ymm5, [rcx+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-96], ymm0
+ vmovdqu YMMWORD PTR [rcx+-32], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r9], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+32]
+ vpxor ymm11, ymm5, [r9+128]
+ vpxor ymm12, ymm6, [rcx+32]
+ vpxor ymm13, ymm7, [r8+128]
+ vpxor ymm14, ymm8, [rcx+-64]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+32], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-64], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+-96]
+ vpxor ymm11, ymm8, [r8+96]
+ vpxor ymm12, ymm9, [rcx+-96]
+ vpxor ymm13, ymm5, [r8]
+ vpxor ymm14, ymm6, [r9+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-96], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-96], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Round 21
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-64]
+ vpxor ymm11, ymm1, [rcx+-32]
+ vpxor ymm13, ymm3, [rcx]
+ vpxor ymm12, ymm2, [rcx+32]
+ vpxor ymm14, ymm14, [rcx+64]
+ vpxor ymm11, ymm11, [rcx+96]
+ vpxor ymm13, ymm13, [rcx+128]
+ vpxor ymm14, ymm14, [r8+-64]
+ vpxor ymm11, ymm11, [r8+-32]
+ vpxor ymm10, ymm10, [r8+32]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm13, ymm13, [r8+128]
+ vpxor ymm10, ymm10, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm14, ymm14, [r9+-32]
+ vpxor ymm13, ymm13, [r9]
+ vpxor ymm10, ymm10, [r9+32]
+ vpxor ymm12, ymm12, [r9+64]
+ vpxor ymm11, ymm11, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+96]
+ vpxor ymm12, ymm7, [r8+64]
+ vpxor ymm13, ymm8, [r8+128]
+ vpxor ymm14, ymm9, [r9+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+672]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm3
+ vmovdqu YMMWORD PTR [r9+96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+128]
+ vpxor ymm11, ymm9, [r8+-64]
+ vpxor ymm12, ymm5, [r9+-96]
+ vpxor ymm13, ymm6, [r9+128]
+ vpxor ymm14, ymm7, [rcx+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+128], ymm3
+ vmovdqu YMMWORD PTR [rcx+-96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+-32]
+ vpxor ymm11, ymm7, [r9+-64]
+ vpxor ymm12, ymm8, [r9]
+ vpxor ymm13, ymm9, [rcx+-64]
+ vpxor ymm14, ymm5, [r8+-96]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-32], ymm0
+ vmovdqu YMMWORD PTR [r9+-64], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx+-64], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+-32]
+ vpxor ymm11, ymm5, [r9+32]
+ vpxor ymm12, ymm6, [rcx+-32]
+ vpxor ymm13, ymm7, [rcx+32]
+ vpxor ymm14, ymm8, [r8]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+-32], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+-32], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r8], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9+64]
+ vpxor ymm11, ymm8, [rcx]
+ vpxor ymm12, ymm9, [rcx+64]
+ vpxor ymm13, ymm5, [r8+32]
+ vpxor ymm14, ymm6, [r8+96]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+64], ymm0
+ vmovdqu YMMWORD PTR [rcx], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Round 22
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm14, ymm4, [rcx+-96]
+ vpxor ymm13, ymm3, [rcx+-64]
+ vpxor ymm12, ymm2, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm11, ymm1, [rcx+96]
+ vpxor ymm10, ymm10, [rcx+128]
+ vpxor ymm14, ymm14, [r8+-96]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm10, ymm10, [r8+-32]
+ vpxor ymm14, ymm14, [r8]
+ vpxor ymm12, ymm12, [r8+64]
+ vpxor ymm13, ymm13, [r8+128]
+ vpxor ymm12, ymm12, [r9+-96]
+ vpxor ymm11, ymm11, [r9+-64]
+ vpxor ymm10, ymm10, [r9+-32]
+ vpxor ymm12, ymm12, [r9]
+ vpxor ymm11, ymm11, [r9+32]
+ vpxor ymm14, ymm14, [r9+96]
+ vpxor ymm13, ymm13, [r9+128]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [r8+-64]
+ vpxor ymm12, ymm7, [r9]
+ vpxor ymm13, ymm8, [rcx+32]
+ vpxor ymm14, ymm9, [r8+96]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+704]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm1
+ vmovdqu YMMWORD PTR [r9], ymm2
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [r8+96], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [r8+128]
+ vpxor ymm11, ymm9, [rcx+-96]
+ vpxor ymm12, ymm5, [r8+-32]
+ vpxor ymm13, ymm6, [r9+32]
+ vpxor ymm14, ymm7, [rcx+64]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [r8+-32], ymm2
+ vmovdqu YMMWORD PTR [r9+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [rcx+96]
+ vpxor ymm11, ymm7, [r9+-96]
+ vpxor ymm12, ymm8, [rcx+-64]
+ vpxor ymm13, ymm9, [r8]
+ vpxor ymm14, ymm5, [r9+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [r9+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [r8], ymm3
+ vmovdqu YMMWORD PTR [r9+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r9+96]
+ vpxor ymm11, ymm5, [rcx+128]
+ vpxor ymm12, ymm6, [r9+-64]
+ vpxor ymm13, ymm7, [rcx+-32]
+ vpxor ymm14, ymm8, [r8+32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [r9+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [r8+32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r8+64]
+ vpxor ymm11, ymm8, [r9+128]
+ vpxor ymm12, ymm9, [r8+-96]
+ vpxor ymm13, ymm5, [r9+-32]
+ vpxor ymm14, ymm6, [rcx]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r9+128], ymm1
+ vmovdqu YMMWORD PTR [r8+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Round 23
+ ; Calc b[0..4]
+ vpxor ymm10, ymm0, ymm15
+ vpxor ymm11, ymm1, [rcx+-96]
+ vpxor ymm12, ymm2, [rcx+-64]
+ vpxor ymm13, ymm3, [rcx+-32]
+ vpxor ymm13, ymm13, [rcx+32]
+ vpxor ymm14, ymm4, [rcx+64]
+ vpxor ymm10, ymm10, [rcx+96]
+ vpxor ymm11, ymm11, [rcx+128]
+ vpxor ymm11, ymm11, [r8+-64]
+ vpxor ymm12, ymm12, [r8+-32]
+ vpxor ymm13, ymm13, [r8]
+ vpxor ymm14, ymm14, [r8+32]
+ vpxor ymm14, ymm14, [r8+96]
+ vpxor ymm10, ymm10, [r8+128]
+ vpxor ymm11, ymm11, [r9+-96]
+ vpxor ymm12, ymm12, [r9+-64]
+ vpxor ymm12, ymm12, [r9]
+ vpxor ymm13, ymm13, [r9+32]
+ vpxor ymm14, ymm14, [r9+64]
+ vpxor ymm10, ymm10, [r9+96]
+ ; Calc t[0..4]
+ vpsrlq ymm0, ymm11, 63
+ vpsrlq ymm1, ymm12, 63
+ vpsrlq ymm2, ymm13, 63
+ vpsrlq ymm3, ymm14, 63
+ vpsrlq ymm4, ymm10, 63
+ vpaddq ymm5, ymm11, ymm11
+ vpaddq ymm6, ymm12, ymm12
+ vpaddq ymm7, ymm13, ymm13
+ vpaddq ymm8, ymm14, ymm14
+ vpaddq ymm9, ymm10, ymm10
+ vpor ymm5, ymm5, ymm0
+ vpor ymm6, ymm6, ymm1
+ vpor ymm7, ymm7, ymm2
+ vpor ymm8, ymm8, ymm3
+ vpor ymm9, ymm9, ymm4
+ vpxor ymm5, ymm5, ymm14
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vpxor ymm8, ymm8, ymm12
+ vpxor ymm9, ymm9, ymm13
+ ; Row Mix
+ ; Row 0
+ vpxor ymm10, ymm5, ymm15
+ vpxor ymm11, ymm6, [rcx+-96]
+ vpxor ymm12, ymm7, [rcx+-64]
+ vpxor ymm13, ymm8, [rcx+-32]
+ vpxor ymm14, ymm9, [rcx]
+ vpsrlq ymm0, ymm11, 20
+ vpsrlq ymm1, ymm12, 21
+ vpsrlq ymm2, ymm13, 43
+ vpsrlq ymm3, ymm14, 50
+ vpsllq ymm11, ymm11, 44
+ vpsllq ymm12, ymm12, 43
+ vpsllq ymm13, ymm13, 21
+ vpsllq ymm14, ymm14, 14
+ vpor ymm11, ymm11, ymm0
+ vpor ymm12, ymm12, ymm1
+ vpor ymm13, ymm13, ymm2
+ vpor ymm14, ymm14, ymm3
+ vpandn ymm15, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm15, ymm15, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ ; XOR in constant
+ vpxor ymm15, ymm15, [rax+736]
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+-96], ymm1
+ vmovdqu YMMWORD PTR [rcx+-64], ymm2
+ vmovdqu YMMWORD PTR [rcx+-32], ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm4
+ ; Row 1
+ vpxor ymm10, ymm8, [rcx+32]
+ vpxor ymm11, ymm9, [rcx+64]
+ vpxor ymm12, ymm5, [rcx+96]
+ vpxor ymm13, ymm6, [rcx+128]
+ vpxor ymm14, ymm7, [r8+-96]
+ vpsrlq ymm0, ymm10, 36
+ vpsrlq ymm1, ymm11, 44
+ vpsrlq ymm2, ymm12, 61
+ vpsrlq ymm3, ymm13, 19
+ vpsrlq ymm4, ymm14, 3
+ vpsllq ymm10, ymm10, 28
+ vpsllq ymm11, ymm11, 20
+ vpsllq ymm12, ymm12, 3
+ vpsllq ymm13, ymm13, 45
+ vpsllq ymm14, ymm14, 61
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+128], ymm3
+ vmovdqu YMMWORD PTR [r8+-96], ymm4
+ ; Row 2
+ vpxor ymm10, ymm6, [r8+-64]
+ vpxor ymm11, ymm7, [r8+-32]
+ vpxor ymm12, ymm8, [r8]
+ vpxor ymm13, ymm9, [r8+32]
+ vpxor ymm14, ymm5, [r8+64]
+ vpsrlq ymm0, ymm10, 63
+ vpsrlq ymm1, ymm11, 58
+ vpsrlq ymm2, ymm12, 39
+ vpsrlq ymm3, ymm13, 56
+ vpsrlq ymm4, ymm14, 46
+ vpaddq ymm10, ymm10, ymm10
+ vpsllq ymm11, ymm11, 6
+ vpsllq ymm12, ymm12, 25
+ vpsllq ymm13, ymm13, 8
+ vpsllq ymm14, ymm14, 18
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+-64], ymm0
+ vmovdqu YMMWORD PTR [r8+-32], ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ vmovdqu YMMWORD PTR [r8+32], ymm3
+ vmovdqu YMMWORD PTR [r8+64], ymm4
+ ; Row 3
+ vpxor ymm10, ymm9, [r8+96]
+ vpxor ymm11, ymm5, [r8+128]
+ vpxor ymm12, ymm6, [r9+-96]
+ vpxor ymm13, ymm7, [r9+-64]
+ vpxor ymm14, ymm8, [r9+-32]
+ vpsrlq ymm0, ymm10, 37
+ vpsrlq ymm1, ymm11, 28
+ vpsrlq ymm2, ymm12, 54
+ vpsrlq ymm3, ymm13, 49
+ vpsrlq ymm4, ymm14, 8
+ vpsllq ymm10, ymm10, 27
+ vpsllq ymm11, ymm11, 36
+ vpsllq ymm12, ymm12, 10
+ vpsllq ymm13, ymm13, 15
+ vpsllq ymm14, ymm14, 56
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r8+96], ymm0
+ vmovdqu YMMWORD PTR [r8+128], ymm1
+ vmovdqu YMMWORD PTR [r9+-96], ymm2
+ vmovdqu YMMWORD PTR [r9+-64], ymm3
+ vmovdqu YMMWORD PTR [r9+-32], ymm4
+ ; Row 4
+ vpxor ymm10, ymm7, [r9]
+ vpxor ymm11, ymm8, [r9+32]
+ vpxor ymm12, ymm9, [r9+64]
+ vpxor ymm13, ymm5, [r9+96]
+ vpxor ymm14, ymm6, [r9+128]
+ vpsrlq ymm0, ymm10, 2
+ vpsrlq ymm1, ymm11, 9
+ vpsrlq ymm2, ymm12, 25
+ vpsrlq ymm3, ymm13, 23
+ vpsrlq ymm4, ymm14, 62
+ vpsllq ymm10, ymm10, 62
+ vpsllq ymm11, ymm11, 55
+ vpsllq ymm12, ymm12, 39
+ vpsllq ymm13, ymm13, 41
+ vpsllq ymm14, ymm14, 2
+ vpor ymm10, ymm10, ymm0
+ vpor ymm11, ymm11, ymm1
+ vpor ymm12, ymm12, ymm2
+ vpor ymm13, ymm13, ymm3
+ vpor ymm14, ymm14, ymm4
+ vpandn ymm0, ymm11, ymm12
+ vpandn ymm1, ymm12, ymm13
+ vpandn ymm2, ymm13, ymm14
+ vpandn ymm3, ymm14, ymm10
+ vpandn ymm4, ymm10, ymm11
+ vpxor ymm0, ymm0, ymm10
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vpxor ymm4, ymm4, ymm14
+ vmovdqu YMMWORD PTR [r9], ymm0
+ vmovdqu YMMWORD PTR [r9+32], ymm1
+ vmovdqu YMMWORD PTR [r9+64], ymm2
+ vmovdqu YMMWORD PTR [r9+96], ymm3
+ vmovdqu YMMWORD PTR [r9+128], ymm4
+ sub rcx, 128
+ vmovdqu YMMWORD PTR [rcx], ymm15
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+sha3_256_blocksx4_seed_64_avx2 ENDP
+_TEXT ENDS
+ENDIF
+ENDIF
+END
diff --git a/wolfcrypt/src/sha512_asm.asm b/wolfcrypt/src/sha512_asm.asm
new file mode 100644
index 00000000000..07cebd52a70
--- /dev/null
+++ b/wolfcrypt/src/sha512_asm.asm
@@ -0,0 +1,10774 @@
+; /* sha512_asm.asm */
+; /*
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
+
+IF @Version LT 1200  ; @Version is the assembler's built-in version number; below 1200 the newer encodings are disabled
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT  ; keep any value the build already supplied
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT  ; keep any value the build already supplied
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1  ; AVX1 code is assembled by default
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT  ; AVX2 code is enabled only when it was not ruled out above (or by the build)
+HAVE_INTEL_AVX2 = 1
+ENDIF
+
+IFNDEF _WIN64  ; default _WIN64 to 1 when the build did not define it
+_WIN64 = 1
+ENDIF
+
+IFDEF HAVE_INTEL_AVX1
+_DATA SEGMENT
+ALIGN 16
+L_avx1_sha512_k QWORD 428a2f98d728ae22h, 7137449123ef65cdh  ; 80 SHA-512 round constants K[0..79], two per line
+ QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch
+ QWORD 3956c25bf348b538h, 59f111f1b605d019h
+ QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h
+ QWORD 0d807aa98a3030242h, 12835b0145706fbeh
+ QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h
+ QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h
+ QWORD 9bdc06a725c71235h, 0c19bf174cf692694h
+ QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h
+ QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h
+ QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h
+ QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h
+ QWORD 983e5152ee66dfabh, 0a831c66d2db43210h
+ QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h
+ QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h
+ QWORD 06ca6351e003826fh, 142929670a0e6e70h
+ QWORD 27b70a8546d22ffch, 2e1b21385c26c926h
+ QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh
+ QWORD 650a73548baf63deh, 766a0abb3c77b2a8h
+ QWORD 81c2c92e47edaee6h, 92722c851482353bh
+ QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h
+ QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h
+ QWORD 0d192e819d6ef5218h, 0d69906245565a910h
+ QWORD 0f40e35855771202ah, 106aa07032bbd1b8h
+ QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h
+ QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h
+ QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh
+ QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h
+ QWORD 748f82ee5defb2fch, 78a5636f43172f60h
+ QWORD 84c87814a1f0ab72h, 8cc702081a6439ech
+ QWORD 90befffa23631e28h, 0a4506cebde82bde9h
+ QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh
+ QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h
+ QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h
+ QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h
+ QWORD 113f9804bef90daeh, 1b710b35131c471bh
+ QWORD 28db77f523047d84h, 32caab7b40c72493h
+ QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch
+ QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah
+ QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h
+ptr_L_avx1_sha512_k QWORD L_avx1_sha512_k  ; address of the table; the transforms load it into a register and step through it
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx1_sha512_flip_mask QWORD 0001020304050607h, 08090a0b0c0d0e0fh  ; vpshufb control: reverses the byte order inside each 64-bit lane
+ptr_L_avx1_sha512_flip_mask QWORD L_avx1_sha512_flip_mask  ; address of the mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha512_AVX1 PROC  ; one SHA-512 compression: rcx -> context with eight 64-bit state words at +0 and the 128-byte block at +64; returns 0 in rax
+ push rbx  ; this and the following pushes save every non-volatile GPR the routine overwrites
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ mov rdi, rcx  ; rdi = first argument (context pointer); rcx is reused as scratch below
+ sub rsp, 280  ; frame: [rsp,+128) W+K scratch, [rsp+128] pass counter, [rsp+136,+280) saved xmm6..xmm14
+ vmovdqu OWORD PTR [rsp+136], xmm6  ; spill non-volatile xmm registers used below
+ vmovdqu OWORD PTR [rsp+152], xmm7
+ vmovdqu OWORD PTR [rsp+168], xmm8
+ vmovdqu OWORD PTR [rsp+184], xmm9
+ vmovdqu OWORD PTR [rsp+200], xmm10
+ vmovdqu OWORD PTR [rsp+216], xmm11
+ vmovdqu OWORD PTR [rsp+232], xmm13  ; xmm13 and xmm12 slots are in swapped order; the restore at the end uses the same slots
+ vmovdqu OWORD PTR [rsp+248], xmm12
+ vmovdqu OWORD PTR [rsp+264], xmm14
+ lea rax, QWORD PTR [rdi+64]  ; rax -> message block inside the context
+ vmovdqa xmm14, OWORD PTR L_avx1_sha512_flip_mask  ; xmm14 = byte-reversal shuffle mask
+ mov r8, QWORD PTR [rdi]  ; working variables a..h = state[0..7] in r8..r15
+ mov r9, QWORD PTR [rdi+8]
+ mov r10, QWORD PTR [rdi+16]
+ mov r11, QWORD PTR [rdi+24]
+ mov r12, QWORD PTR [rdi+32]
+ mov r13, QWORD PTR [rdi+40]
+ mov r14, QWORD PTR [rdi+48]
+ mov r15, QWORD PTR [rdi+56]
+ vmovdqu xmm0, OWORD PTR [rax]  ; load W[0..15] two words per register into xmm0..xmm7
+ vmovdqu xmm1, OWORD PTR [rax+16]
+ vpshufb xmm0, xmm0, xmm14  ; reverse the bytes of each 64-bit word
+ vpshufb xmm1, xmm1, xmm14
+ vmovdqu xmm2, OWORD PTR [rax+32]
+ vmovdqu xmm3, OWORD PTR [rax+48]
+ vpshufb xmm2, xmm2, xmm14
+ vpshufb xmm3, xmm3, xmm14
+ vmovdqu xmm4, OWORD PTR [rax+64]
+ vmovdqu xmm5, OWORD PTR [rax+80]
+ vpshufb xmm4, xmm4, xmm14
+ vpshufb xmm5, xmm5, xmm14
+ vmovdqu xmm6, OWORD PTR [rax+96]
+ vmovdqu xmm7, OWORD PTR [rax+112]
+ vpshufb xmm6, xmm6, xmm14
+ vpshufb xmm7, xmm7, xmm14
+ mov DWORD PTR [rsp+128], 4  ; 4 passes of 16 rounds with message expansion (rounds 0..63)
+ mov rsi, QWORD PTR [ptr_L_avx1_sha512_k]  ; rsi -> round-constant table
+ mov rbx, r9  ; rbx = b ^ c, carried between rounds for the Maj computation
+ mov rax, r12  ; rax = e, input to the first round's Sigma1 rotations
+ xor rbx, r10
+ ; Start of 16 rounds
+L_transform_sha512_avx1_start:
+ vpaddq xmm8, xmm0, [rsi]  ; store W[i]+K[i] for the next 16 rounds into the scratch area
+ vpaddq xmm9, xmm1, [rsi+16]
+ vmovdqu OWORD PTR [rsp], xmm8
+ vmovdqu OWORD PTR [rsp+16], xmm9
+ vpaddq xmm8, xmm2, [rsi+32]
+ vpaddq xmm9, xmm3, [rsi+48]
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vpaddq xmm8, xmm4, [rsi+64]
+ vpaddq xmm9, xmm5, [rsi+80]
+ vmovdqu OWORD PTR [rsp+64], xmm8
+ vmovdqu OWORD PTR [rsp+80], xmm9
+ vpaddq xmm8, xmm6, [rsi+96]
+ vpaddq xmm9, xmm7, [rsi+112]
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ add rsi, 128  ; advance to the next 16 constants
+ ; msg_sched: 0-1
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm1, xmm0, 8  ; xmm12 = words 1,2 of the window (sigma0 input)
+ vpalignr xmm13, xmm5, xmm4, 8  ; xmm13 = words 9,10 of the window
+ ; rnd_0: 1 - 1
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp]  ; h += W+K
+ xor rcx, r14
+ vpsrlq xmm8, xmm12, 1  ; sigma0 pieces: rotr 1, rotr 8, shr 7
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14  ; rcx = Ch(e,f,g) computed as ((f ^ g) & e) ^ g
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14  ; rax = Sigma1(e): net rotations 14, 18, 41
+ xor rdx, r9
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r8
+ xor rbx, r9  ; rbx = Maj(a,b,c) computed as ((a ^ b) & (b ^ c)) ^ b
+ ror rcx, 6
+ add r11, r15  ; d += T1
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm0, xmm13, xmm0
+ ; rnd_0: 10 - 11
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28  ; rcx = Sigma0(a): net rotations 28, 34, 39
+ mov rax, r11
+ add r15, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm0, xmm8, xmm0
+ ; rnd_1: 1 - 1
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rsp+8]
+ xor rcx, r13
+ vpsrlq xmm8, xmm7, 19  ; sigma1 pieces on words 14,15: rotr 19, rotr 61, shr 6
+ vpsllq xmm9, xmm7, 45
+ ; rnd_1: 2 - 3
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ vpsrlq xmm10, xmm7, 61
+ vpsllq xmm11, xmm7, 3
+ ; rnd_1: 4 - 6
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm7, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ vpaddq xmm0, xmm8, xmm0  ; xmm0 now holds the two newly expanded message words
+ ; msg_sched done: 0-1
+ ; msg_sched: 2-3
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm2, xmm1, 8
+ vpalignr xmm13, xmm6, xmm5, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+16]
+ xor rcx, r12
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm1, xmm13, xmm1
+ ; rnd_0: 10 - 11
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm1, xmm8, xmm1
+ ; rnd_1: 1 - 1
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+24]
+ xor rcx, r11
+ vpsrlq xmm8, xmm0, 19
+ vpsllq xmm9, xmm0, 45
+ ; rnd_1: 2 - 3
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ vpsrlq xmm10, xmm0, 61
+ vpsllq xmm11, xmm0, 3
+ ; rnd_1: 4 - 6
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm0, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ vpaddq xmm1, xmm8, xmm1
+ ; msg_sched done: 2-3
+ ; msg_sched: 4-5
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm3, xmm2, 8
+ vpalignr xmm13, xmm7, xmm6, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+32]
+ xor rcx, r10
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm2, xmm13, xmm2
+ ; rnd_0: 10 - 11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm2, xmm8, xmm2
+ ; rnd_1: 1 - 1
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rsp+40]
+ xor rcx, r9
+ vpsrlq xmm8, xmm1, 19
+ vpsllq xmm9, xmm1, 45
+ ; rnd_1: 2 - 3
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ vpsrlq xmm10, xmm1, 61
+ vpsllq xmm11, xmm1, 3
+ ; rnd_1: 4 - 6
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm1, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ vpaddq xmm2, xmm8, xmm2
+ ; msg_sched done: 4-5
+ ; msg_sched: 6-7
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm4, xmm3, 8
+ vpalignr xmm13, xmm0, xmm7, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+48]
+ xor rcx, r8
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm3, xmm13, xmm3
+ ; rnd_0: 10 - 11
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm3, xmm8, xmm3
+ ; rnd_1: 1 - 1
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+56]
+ xor rcx, r15
+ vpsrlq xmm8, xmm2, 19
+ vpsllq xmm9, xmm2, 45
+ ; rnd_1: 2 - 3
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ vpsrlq xmm10, xmm2, 61
+ vpsllq xmm11, xmm2, 3
+ ; rnd_1: 4 - 6
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm2, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ vpaddq xmm3, xmm8, xmm3
+ ; msg_sched done: 6-7
+ ; msg_sched: 8-9
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm5, xmm4, 8
+ vpalignr xmm13, xmm1, xmm0, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp+64]
+ xor rcx, r14
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm4, xmm13, xmm4
+ ; rnd_0: 10 - 11
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm4, xmm8, xmm4
+ ; rnd_1: 1 - 1
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rsp+72]
+ xor rcx, r13
+ vpsrlq xmm8, xmm3, 19
+ vpsllq xmm9, xmm3, 45
+ ; rnd_1: 2 - 3
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ vpsrlq xmm10, xmm3, 61
+ vpsllq xmm11, xmm3, 3
+ ; rnd_1: 4 - 6
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm3, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ vpaddq xmm4, xmm8, xmm4
+ ; msg_sched done: 8-9
+ ; msg_sched: 10-11
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm6, xmm5, 8
+ vpalignr xmm13, xmm2, xmm1, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+80]
+ xor rcx, r12
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm5, xmm13, xmm5
+ ; rnd_0: 10 - 11
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm5, xmm8, xmm5
+ ; rnd_1: 1 - 1
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+88]
+ xor rcx, r11
+ vpsrlq xmm8, xmm4, 19
+ vpsllq xmm9, xmm4, 45
+ ; rnd_1: 2 - 3
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ vpsrlq xmm10, xmm4, 61
+ vpsllq xmm11, xmm4, 3
+ ; rnd_1: 4 - 6
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm4, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ vpaddq xmm5, xmm8, xmm5
+ ; msg_sched done: 10-11
+ ; msg_sched: 12-13
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm7, xmm6, 8
+ vpalignr xmm13, xmm3, xmm2, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+96]
+ xor rcx, r10
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm6, xmm13, xmm6
+ ; rnd_0: 10 - 11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm6, xmm8, xmm6
+ ; rnd_1: 1 - 1
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rsp+104]
+ xor rcx, r9
+ vpsrlq xmm8, xmm5, 19
+ vpsllq xmm9, xmm5, 45
+ ; rnd_1: 2 - 3
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ vpsrlq xmm10, xmm5, 61
+ vpsllq xmm11, xmm5, 3
+ ; rnd_1: 4 - 6
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm5, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ vpaddq xmm6, xmm8, xmm6
+ ; msg_sched done: 12-13
+ ; msg_sched: 14-15
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm0, xmm7, 8
+ vpalignr xmm13, xmm4, xmm3, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+112]
+ xor rcx, r8
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm7, xmm13, xmm7
+ ; rnd_0: 10 - 11
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm7, xmm8, xmm7
+ ; rnd_1: 1 - 1
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+120]
+ xor rcx, r15
+ vpsrlq xmm8, xmm6, 19
+ vpsllq xmm9, xmm6, 45
+ ; rnd_1: 2 - 3
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ vpsrlq xmm10, xmm6, 61
+ vpsllq xmm11, xmm6, 3
+ ; rnd_1: 4 - 6
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm6, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ vpaddq xmm7, xmm8, xmm7
+ ; msg_sched done: 14-15
+ sub DWORD PTR [rsp+128], 1  ; one pass of 16 rounds done
+ jne L_transform_sha512_avx1_start  ; repeat until four passes have run
+ vpaddq xmm8, xmm0, [rsi]  ; W+K for the last 16 rounds; no further message expansion follows
+ vpaddq xmm9, xmm1, [rsi+16]
+ vmovdqu OWORD PTR [rsp], xmm8
+ vmovdqu OWORD PTR [rsp+16], xmm9
+ vpaddq xmm8, xmm2, [rsi+32]
+ vpaddq xmm9, xmm3, [rsi+48]
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vpaddq xmm8, xmm4, [rsi+64]
+ vpaddq xmm9, xmm5, [rsi+80]
+ vmovdqu OWORD PTR [rsp+64], xmm8
+ vmovdqu OWORD PTR [rsp+80], xmm9
+ vpaddq xmm8, xmm6, [rsi+96]
+ vpaddq xmm9, xmm7, [rsi+112]
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ ; rnd_all_2: 0-1
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rsp+8]
+ xor rcx, r13
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ; rnd_all_2: 2-3
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+16]
+ xor rcx, r12
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+24]
+ xor rcx, r11
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ; rnd_all_2: 4-5
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+32]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rsp+40]
+ xor rcx, r9
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ; rnd_all_2: 6-7
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+48]
+ xor rcx, r8
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+56]
+ xor rcx, r15
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ ; rnd_all_2: 8-9
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp+64]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rsp+72]
+ xor rcx, r13
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ; rnd_all_2: 10-11
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+80]
+ xor rcx, r12
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+88]
+ xor rcx, r11
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ; rnd_all_2: 12-13
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+96]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rsp+104]
+ xor rcx, r9
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ; rnd_all_2: 14-15
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+112]
+ xor rcx, r8
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+120]
+ xor rcx, r15
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ add QWORD PTR [rdi], r8  ; fold the working variables back into state[0..7]
+ add QWORD PTR [rdi+8], r9
+ add QWORD PTR [rdi+16], r10
+ add QWORD PTR [rdi+24], r11
+ add QWORD PTR [rdi+32], r12
+ add QWORD PTR [rdi+40], r13
+ add QWORD PTR [rdi+48], r14
+ add QWORD PTR [rdi+56], r15
+ xor rax, rax  ; return value 0
+ vmovdqu xmm6, OWORD PTR [rsp+136]  ; restore xmm6..xmm14 from the slots written in the prologue
+ vmovdqu xmm7, OWORD PTR [rsp+152]
+ vmovdqu xmm8, OWORD PTR [rsp+168]
+ vmovdqu xmm9, OWORD PTR [rsp+184]
+ vmovdqu xmm10, OWORD PTR [rsp+200]
+ vmovdqu xmm11, OWORD PTR [rsp+216]
+ vmovdqu xmm13, OWORD PTR [rsp+232]
+ vmovdqu xmm12, OWORD PTR [rsp+248]
+ vmovdqu xmm14, OWORD PTR [rsp+264]
+ add rsp, 280  ; release the frame
+ pop rsi  ; pops mirror the prologue pushes in reverse order
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha512_AVX1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha512_AVX1_Len PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ mov rdi, rcx
+ mov rbp, rdx
+ sub rsp, 288
+ vmovdqu OWORD PTR [rsp+144], xmm6
+ vmovdqu OWORD PTR [rsp+160], xmm7
+ vmovdqu OWORD PTR [rsp+176], xmm8
+ vmovdqu OWORD PTR [rsp+192], xmm9
+ vmovdqu OWORD PTR [rsp+208], xmm10
+ vmovdqu OWORD PTR [rsp+224], xmm11
+ vmovdqu OWORD PTR [rsp+240], xmm13
+ vmovdqu OWORD PTR [rsp+256], xmm12
+ vmovdqu OWORD PTR [rsp+272], xmm14
+ mov rsi, QWORD PTR [rdi+224]
+ mov rdx, QWORD PTR [ptr_L_avx1_sha512_k]
+ vmovdqa xmm14, OWORD PTR L_avx1_sha512_flip_mask
+ mov r8, QWORD PTR [rdi]
+ mov r9, QWORD PTR [rdi+8]
+ mov r10, QWORD PTR [rdi+16]
+ mov r11, QWORD PTR [rdi+24]
+ mov r12, QWORD PTR [rdi+32]
+ mov r13, QWORD PTR [rdi+40]
+ mov r14, QWORD PTR [rdi+48]
+ mov r15, QWORD PTR [rdi+56]
+ ; Start of loop processing a block
+L_sha512_len_avx1_begin:
+ vmovdqu xmm0, OWORD PTR [rsi]
+ vmovdqu xmm1, OWORD PTR [rsi+16]
+ vpshufb xmm0, xmm0, xmm14
+ vpshufb xmm1, xmm1, xmm14
+ vmovdqu xmm2, OWORD PTR [rsi+32]
+ vmovdqu xmm3, OWORD PTR [rsi+48]
+ vpshufb xmm2, xmm2, xmm14
+ vpshufb xmm3, xmm3, xmm14
+ vmovdqu xmm4, OWORD PTR [rsi+64]
+ vmovdqu xmm5, OWORD PTR [rsi+80]
+ vpshufb xmm4, xmm4, xmm14
+ vpshufb xmm5, xmm5, xmm14
+ vmovdqu xmm6, OWORD PTR [rsi+96]
+ vmovdqu xmm7, OWORD PTR [rsi+112]
+ vpshufb xmm6, xmm6, xmm14
+ vpshufb xmm7, xmm7, xmm14
+ mov DWORD PTR [rsp+128], 4
+ mov rbx, r9
+ mov rax, r12
+ xor rbx, r10
+ vpaddq xmm8, xmm0, [rdx]
+ vpaddq xmm9, xmm1, [rdx+16]
+ vmovdqu OWORD PTR [rsp], xmm8
+ vmovdqu OWORD PTR [rsp+16], xmm9
+ vpaddq xmm8, xmm2, [rdx+32]
+ vpaddq xmm9, xmm3, [rdx+48]
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vpaddq xmm8, xmm4, [rdx+64]
+ vpaddq xmm9, xmm5, [rdx+80]
+ vmovdqu OWORD PTR [rsp+64], xmm8
+ vmovdqu OWORD PTR [rsp+80], xmm9
+ vpaddq xmm8, xmm6, [rdx+96]
+ vpaddq xmm9, xmm7, [rdx+112]
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ ; Start of 16 rounds
+L_sha512_len_avx1_start:
+ add rdx, 128
+ mov QWORD PTR [rsp+136], rdx
+ ; msg_sched: 0-1
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm1, xmm0, 8
+ vpalignr xmm13, xmm5, xmm4, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp]
+ xor rcx, r14
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm0, xmm13, xmm0
+ ; rnd_0: 10 - 11
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm0, xmm8, xmm0
+ ; rnd_1: 1 - 1
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rsp+8]
+ xor rcx, r13
+ vpsrlq xmm8, xmm7, 19
+ vpsllq xmm9, xmm7, 45
+ ; rnd_1: 2 - 3
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ vpsrlq xmm10, xmm7, 61
+ vpsllq xmm11, xmm7, 3
+ ; rnd_1: 4 - 6
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm7, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ vpaddq xmm0, xmm8, xmm0
+ ; msg_sched done: 0-1
+ ; msg_sched: 2-3
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm2, xmm1, 8
+ vpalignr xmm13, xmm6, xmm5, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+16]
+ xor rcx, r12
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm1, xmm13, xmm1
+ ; rnd_0: 10 - 11
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm1, xmm8, xmm1
+ ; rnd_1: 1 - 1
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+24]
+ xor rcx, r11
+ vpsrlq xmm8, xmm0, 19
+ vpsllq xmm9, xmm0, 45
+ ; rnd_1: 2 - 3
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ vpsrlq xmm10, xmm0, 61
+ vpsllq xmm11, xmm0, 3
+ ; rnd_1: 4 - 6
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm0, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ vpaddq xmm1, xmm8, xmm1
+ ; msg_sched done: 2-3
+ ; msg_sched: 4-5
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm3, xmm2, 8
+ vpalignr xmm13, xmm7, xmm6, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+32]
+ xor rcx, r10
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm2, xmm13, xmm2
+ ; rnd_0: 10 - 11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm2, xmm8, xmm2
+ ; rnd_1: 1 - 1
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rsp+40]
+ xor rcx, r9
+ vpsrlq xmm8, xmm1, 19
+ vpsllq xmm9, xmm1, 45
+ ; rnd_1: 2 - 3
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ vpsrlq xmm10, xmm1, 61
+ vpsllq xmm11, xmm1, 3
+ ; rnd_1: 4 - 6
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm1, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ vpaddq xmm2, xmm8, xmm2
+ ; msg_sched done: 4-5
+ ; msg_sched: 6-7
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm4, xmm3, 8
+ vpalignr xmm13, xmm0, xmm7, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+48]
+ xor rcx, r8
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm3, xmm13, xmm3
+ ; rnd_0: 10 - 11
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm3, xmm8, xmm3
+ ; rnd_1: 1 - 1
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+56]
+ xor rcx, r15
+ vpsrlq xmm8, xmm2, 19
+ vpsllq xmm9, xmm2, 45
+ ; rnd_1: 2 - 3
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ vpsrlq xmm10, xmm2, 61
+ vpsllq xmm11, xmm2, 3
+ ; rnd_1: 4 - 6
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm2, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ vpaddq xmm3, xmm8, xmm3
+ ; msg_sched done: 6-7
+ ; msg_sched: 8-9
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm5, xmm4, 8
+ vpalignr xmm13, xmm1, xmm0, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp+64]
+ xor rcx, r14
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm4, xmm13, xmm4
+ ; rnd_0: 10 - 11
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm4, xmm8, xmm4
+ ; rnd_1: 1 - 1
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rsp+72]
+ xor rcx, r13
+ vpsrlq xmm8, xmm3, 19
+ vpsllq xmm9, xmm3, 45
+ ; rnd_1: 2 - 3
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ vpsrlq xmm10, xmm3, 61
+ vpsllq xmm11, xmm3, 3
+ ; rnd_1: 4 - 6
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm3, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ vpaddq xmm4, xmm8, xmm4
+ ; msg_sched done: 8-9
+ ; msg_sched: 10-11
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm6, xmm5, 8
+ vpalignr xmm13, xmm2, xmm1, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+80]
+ xor rcx, r12
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm5, xmm13, xmm5
+ ; rnd_0: 10 - 11
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm5, xmm8, xmm5
+ ; rnd_1: 1 - 1
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+88]
+ xor rcx, r11
+ vpsrlq xmm8, xmm4, 19
+ vpsllq xmm9, xmm4, 45
+ ; rnd_1: 2 - 3
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ vpsrlq xmm10, xmm4, 61
+ vpsllq xmm11, xmm4, 3
+ ; rnd_1: 4 - 6
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm4, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ vpaddq xmm5, xmm8, xmm5
+ ; msg_sched done: 10-11
+ ; msg_sched: 12-13
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm7, xmm6, 8
+ vpalignr xmm13, xmm3, xmm2, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+96]
+ xor rcx, r10
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm6, xmm13, xmm6
+ ; rnd_0: 10 - 11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm6, xmm8, xmm6
+ ; rnd_1: 1 - 1
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rsp+104]
+ xor rcx, r9
+ vpsrlq xmm8, xmm5, 19
+ vpsllq xmm9, xmm5, 45
+ ; rnd_1: 2 - 3
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ vpsrlq xmm10, xmm5, 61
+ vpsllq xmm11, xmm5, 3
+ ; rnd_1: 4 - 6
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm5, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ vpaddq xmm6, xmm8, xmm6
+ ; msg_sched done: 12-13
+ ; msg_sched: 14-15
+ ; rnd_0: 0 - 0
+ ror rax, 23
+ vpalignr xmm12, xmm0, xmm7, 8
+ vpalignr xmm13, xmm4, xmm3, 8
+ ; rnd_0: 1 - 1
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+112]
+ xor rcx, r8
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 3
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 4 - 5
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 6 - 7
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 8 - 9
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm7, xmm13, xmm7
+ ; rnd_0: 10 - 11
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ; rnd_1: 0 - 0
+ ror rax, 23
+ vpaddq xmm7, xmm8, xmm7
+ ; rnd_1: 1 - 1
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+120]
+ xor rcx, r15
+ vpsrlq xmm8, xmm6, 19
+ vpsllq xmm9, xmm6, 45
+ ; rnd_1: 2 - 3
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ vpsrlq xmm10, xmm6, 61
+ vpsllq xmm11, xmm6, 3
+ ; rnd_1: 4 - 6
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 7 - 8
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm6, 6
+ ; rnd_1: 9 - 10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 11 - 11
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ vpaddq xmm7, xmm8, xmm7
+ ; msg_sched done: 14-15
+ mov rdx, QWORD PTR [rsp+136]
+ vpaddq xmm8, xmm0, [rdx]
+ vpaddq xmm9, xmm1, [rdx+16]
+ vmovdqu OWORD PTR [rsp], xmm8
+ vmovdqu OWORD PTR [rsp+16], xmm9
+ vpaddq xmm8, xmm2, [rdx+32]
+ vpaddq xmm9, xmm3, [rdx+48]
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vpaddq xmm8, xmm4, [rdx+64]
+ vpaddq xmm9, xmm5, [rdx+80]
+ vmovdqu OWORD PTR [rsp+64], xmm8
+ vmovdqu OWORD PTR [rsp+80], xmm9
+ vpaddq xmm8, xmm6, [rdx+96]
+ vpaddq xmm9, xmm7, [rdx+112]
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ sub DWORD PTR [rsp+128], 1
+ jne L_sha512_len_avx1_start
+ ; rnd_all_2: 0-1
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rsp+8]
+ xor rcx, r13
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ; rnd_all_2: 2-3
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+16]
+ xor rcx, r12
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+24]
+ xor rcx, r11
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ; rnd_all_2: 4-5
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+32]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rsp+40]
+ xor rcx, r9
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ; rnd_all_2: 6-7
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+48]
+ xor rcx, r8
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+56]
+ xor rcx, r15
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ ; rnd_all_2: 8-9
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp+64]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rsp+72]
+ xor rcx, r13
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ; rnd_all_2: 10-11
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+80]
+ xor rcx, r12
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+88]
+ xor rcx, r11
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ; rnd_all_2: 12-13
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+96]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rsp+104]
+ xor rcx, r9
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ; rnd_all_2: 14-15
+ ; rnd_0: 0 - 11
+ ror rax, 23
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+112]
+ xor rcx, r8
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ; rnd_1: 0 - 11
+ ror rax, 23
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+120]
+ xor rcx, r15
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ add r8, QWORD PTR [rdi]
+ add r9, QWORD PTR [rdi+8]
+ add r10, QWORD PTR [rdi+16]
+ add r11, QWORD PTR [rdi+24]
+ add r12, QWORD PTR [rdi+32]
+ add r13, QWORD PTR [rdi+40]
+ add r14, QWORD PTR [rdi+48]
+ add r15, QWORD PTR [rdi+56]
+ mov rdx, QWORD PTR [ptr_L_avx1_sha512_k]
+ add rsi, 128
+ sub ebp, 128
+ mov QWORD PTR [rdi], r8
+ mov QWORD PTR [rdi+8], r9
+ mov QWORD PTR [rdi+16], r10
+ mov QWORD PTR [rdi+24], r11
+ mov QWORD PTR [rdi+32], r12
+ mov QWORD PTR [rdi+40], r13
+ mov QWORD PTR [rdi+48], r14
+ mov QWORD PTR [rdi+56], r15
+ jnz L_sha512_len_avx1_begin
+ xor rax, rax
+ vmovdqu xmm6, OWORD PTR [rsp+144]
+ vmovdqu xmm7, OWORD PTR [rsp+160]
+ vmovdqu xmm8, OWORD PTR [rsp+176]
+ vmovdqu xmm9, OWORD PTR [rsp+192]
+ vmovdqu xmm10, OWORD PTR [rsp+208]
+ vmovdqu xmm11, OWORD PTR [rsp+224]
+ vmovdqu xmm13, OWORD PTR [rsp+240]
+ vmovdqu xmm12, OWORD PTR [rsp+256]
+ vmovdqu xmm14, OWORD PTR [rsp+272]
+ add rsp, 288
+ pop rbp
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha512_AVX1_Len ENDP
+_TEXT ENDS
+_DATA SEGMENT ; read-only lookup data for Transform_Sha512_AVX1_RORX
+ALIGN 16 ; start the table on a 16-byte boundary; it is read 16 bytes at a time as a vpaddq memory operand
+L_avx1_rorx_sha512_k QWORD 428a2f98d728ae22h, 7137449123ef65cdh ; SHA-512 round constants K[0], K[1]; 80 QWORDs in total, two per line
+ QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch ; K[2], K[3] (leading 0 is MASM syntax for hex literals starting with a letter)
+ QWORD 3956c25bf348b538h, 59f111f1b605d019h ; K[4], K[5]
+ QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h ; K[6], K[7]
+ QWORD 0d807aa98a3030242h, 12835b0145706fbeh ; K[8], K[9]
+ QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h ; K[10], K[11]
+ QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h ; K[12], K[13]
+ QWORD 9bdc06a725c71235h, 0c19bf174cf692694h ; K[14], K[15]; end of first 128-byte group (round loop does "add rsi, 128" per pass)
+ QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h ; K[16], K[17]
+ QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h ; K[18], K[19]
+ QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h ; K[20], K[21]
+ QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h ; K[22], K[23]
+ QWORD 983e5152ee66dfabh, 0a831c66d2db43210h ; K[24], K[25]
+ QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h ; K[26], K[27]
+ QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h ; K[28], K[29]
+ QWORD 06ca6351e003826fh, 142929670a0e6e70h ; K[30], K[31]; end of second 128-byte group
+ QWORD 27b70a8546d22ffch, 2e1b21385c26c926h ; K[32], K[33]
+ QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh ; K[34], K[35]
+ QWORD 650a73548baf63deh, 766a0abb3c77b2a8h ; K[36], K[37]
+ QWORD 81c2c92e47edaee6h, 92722c851482353bh ; K[38], K[39]
+ QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h ; K[40], K[41]
+ QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h ; K[42], K[43]
+ QWORD 0d192e819d6ef5218h, 0d69906245565a910h ; K[44], K[45]
+ QWORD 0f40e35855771202ah, 106aa07032bbd1b8h ; K[46], K[47]; end of third 128-byte group
+ QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h ; K[48], K[49]
+ QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h ; K[50], K[51]
+ QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh ; K[52], K[53]
+ QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h ; K[54], K[55]
+ QWORD 748f82ee5defb2fch, 78a5636f43172f60h ; K[56], K[57]
+ QWORD 84c87814a1f0ab72h, 8cc702081a6439ech ; K[58], K[59]
+ QWORD 90befffa23631e28h, 0a4506cebde82bde9h ; K[60], K[61]
+ QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh ; K[62], K[63]; end of fourth 128-byte group
+ QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h ; K[64], K[65]
+ QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h ; K[66], K[67]
+ QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h ; K[68], K[69]
+ QWORD 113f9804bef90daeh, 1b710b35131c471bh ; K[70], K[71]
+ QWORD 28db77f523047d84h, 32caab7b40c72493h ; K[72], K[73]
+ QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch ; K[74], K[75]
+ QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah ; K[76], K[77]
+ QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h ; K[78], K[79]; last entry of the table
+ptr_L_avx1_rorx_sha512_k QWORD L_avx1_rorx_sha512_k ; holds the table's address; Transform_Sha512_AVX1_RORX loads it into rsi as its running constant pointer
+_DATA ENDS ; close the constant-table data segment
+_DATA SEGMENT ; byte-shuffle mask used by Transform_Sha512_AVX1_RORX
+ALIGN 16 ; required: the mask is fetched with vmovdqa, which faults on a non-16-byte-aligned address
+L_avx1_rorx_sha512_flip_mask QWORD 0001020304050607h, 08090a0b0c0d0e0fh ; vpshufb control: in memory the bytes read 07..00, 0f..08, so each 64-bit lane is byte-reversed (applied to every loaded message word)
+ptr_L_avx1_rorx_sha512_flip_mask QWORD L_avx1_rorx_sha512_flip_mask ; holds the mask's address (the visible RORX prologue references the mask label directly instead)
+_DATA ENDS ; close the mask data segment
+_TEXT SEGMENT READONLY PARA
+Transform_Sha512_AVX1_RORX PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ mov rdi, rcx
+ sub rsp, 280
+ vmovdqu OWORD PTR [rsp+136], xmm6
+ vmovdqu OWORD PTR [rsp+152], xmm7
+ vmovdqu OWORD PTR [rsp+168], xmm8
+ vmovdqu OWORD PTR [rsp+184], xmm9
+ vmovdqu OWORD PTR [rsp+200], xmm10
+ vmovdqu OWORD PTR [rsp+216], xmm11
+ vmovdqu OWORD PTR [rsp+232], xmm13
+ vmovdqu OWORD PTR [rsp+248], xmm12
+ vmovdqu OWORD PTR [rsp+264], xmm14
+ lea rax, QWORD PTR [rdi+64]
+ vmovdqa xmm14, OWORD PTR L_avx1_rorx_sha512_flip_mask
+ mov r8, QWORD PTR [rdi]
+ mov r9, QWORD PTR [rdi+8]
+ mov r10, QWORD PTR [rdi+16]
+ mov r11, QWORD PTR [rdi+24]
+ mov r12, QWORD PTR [rdi+32]
+ mov r13, QWORD PTR [rdi+40]
+ mov r14, QWORD PTR [rdi+48]
+ mov r15, QWORD PTR [rdi+56]
+ vmovdqu xmm0, OWORD PTR [rax]
+ vmovdqu xmm1, OWORD PTR [rax+16]
+ vpshufb xmm0, xmm0, xmm14
+ vpshufb xmm1, xmm1, xmm14
+ vmovdqu xmm2, OWORD PTR [rax+32]
+ vmovdqu xmm3, OWORD PTR [rax+48]
+ vpshufb xmm2, xmm2, xmm14
+ vpshufb xmm3, xmm3, xmm14
+ vmovdqu xmm4, OWORD PTR [rax+64]
+ vmovdqu xmm5, OWORD PTR [rax+80]
+ vpshufb xmm4, xmm4, xmm14
+ vpshufb xmm5, xmm5, xmm14
+ vmovdqu xmm6, OWORD PTR [rax+96]
+ vmovdqu xmm7, OWORD PTR [rax+112]
+ vpshufb xmm6, xmm6, xmm14
+ vpshufb xmm7, xmm7, xmm14
+ mov DWORD PTR [rsp+128], 4
+ mov rsi, QWORD PTR [ptr_L_avx1_rorx_sha512_k]
+ mov rbx, r9
+ xor rdx, rdx
+ xor rbx, r10
+ vpaddq xmm8, xmm0, [rsi]
+ vpaddq xmm9, xmm1, [rsi+16]
+ vmovdqu OWORD PTR [rsp], xmm8
+ vmovdqu OWORD PTR [rsp+16], xmm9
+ vpaddq xmm8, xmm2, [rsi+32]
+ vpaddq xmm9, xmm3, [rsi+48]
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vpaddq xmm8, xmm4, [rsi+64]
+ vpaddq xmm9, xmm5, [rsi+80]
+ vmovdqu OWORD PTR [rsp+64], xmm8
+ vmovdqu OWORD PTR [rsp+80], xmm9
+ vpaddq xmm8, xmm6, [rsi+96]
+ vpaddq xmm9, xmm7, [rsi+112]
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ ; Start of 16 rounds
+L_transform_sha512_avx1_rorx_start:
+ add rsi, 128
+ ; msg_sched: 0-1
+ ; rnd_0: 0 - 0
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ vpalignr xmm12, xmm1, xmm0, 8
+ vpalignr xmm13, xmm5, xmm4, 8
+ ; rnd_0: 1 - 1
+ add r15, QWORD PTR [rsp]
+ mov rdx, r13
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm0, xmm13, xmm0
+ ; rnd_0: 6 - 7
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ vpaddq xmm0, xmm8, xmm0
+ ; rnd_1: 0 - 0
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ vpsrlq xmm8, xmm7, 19
+ vpsllq xmm9, xmm7, 45
+ ; rnd_1: 1 - 1
+ add r14, QWORD PTR [rsp+8]
+ mov rbx, r12
+ xor rcx, rax
+ vpsrlq xmm10, xmm7, 61
+ vpsllq xmm11, xmm7, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm7, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ add r10, r14
+ xor rbx, r15
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ vpaddq xmm0, xmm8, xmm0
+ ; msg_sched done: 0-1
+ ; msg_sched: 2-3
+ ; rnd_0: 0 - 0
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ vpalignr xmm12, xmm2, xmm1, 8
+ vpalignr xmm13, xmm6, xmm5, 8
+ ; rnd_0: 1 - 1
+ add r13, QWORD PTR [rsp+16]
+ mov rdx, r11
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm1, xmm13, xmm1
+ ; rnd_0: 6 - 7
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ vpaddq xmm1, xmm8, xmm1
+ ; rnd_1: 0 - 0
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ vpsrlq xmm8, xmm0, 19
+ vpsllq xmm9, xmm0, 45
+ ; rnd_1: 1 - 1
+ add r12, QWORD PTR [rsp+24]
+ mov rbx, r10
+ xor rcx, rax
+ vpsrlq xmm10, xmm0, 61
+ vpsllq xmm11, xmm0, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm0, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ add r8, r12
+ xor rbx, r13
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ vpaddq xmm1, xmm8, xmm1
+ ; msg_sched done: 2-3
+ ; msg_sched: 4-5
+ ; rnd_0: 0 - 0
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ vpalignr xmm12, xmm3, xmm2, 8
+ vpalignr xmm13, xmm7, xmm6, 8
+ ; rnd_0: 1 - 1
+ add r11, QWORD PTR [rsp+32]
+ mov rdx, r9
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm2, xmm13, xmm2
+ ; rnd_0: 6 - 7
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ vpaddq xmm2, xmm8, xmm2
+ ; rnd_1: 0 - 0
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ vpsrlq xmm8, xmm1, 19
+ vpsllq xmm9, xmm1, 45
+ ; rnd_1: 1 - 1
+ add r10, QWORD PTR [rsp+40]
+ mov rbx, r8
+ xor rcx, rax
+ vpsrlq xmm10, xmm1, 61
+ vpsllq xmm11, xmm1, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm1, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ add r14, r10
+ xor rbx, r11
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ vpaddq xmm2, xmm8, xmm2
+ ; msg_sched done: 4-5
+ ; msg_sched: 6-7
+ ; rnd_0: 0 - 0
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ vpalignr xmm12, xmm4, xmm3, 8
+ vpalignr xmm13, xmm0, xmm7, 8
+ ; rnd_0: 1 - 1
+ add r9, QWORD PTR [rsp+48]
+ mov rdx, r15
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm3, xmm13, xmm3
+ ; rnd_0: 6 - 7
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ vpaddq xmm3, xmm8, xmm3
+ ; rnd_1: 0 - 0
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ vpsrlq xmm8, xmm2, 19
+ vpsllq xmm9, xmm2, 45
+ ; rnd_1: 1 - 1
+ add r8, QWORD PTR [rsp+56]
+ mov rbx, r14
+ xor rcx, rax
+ vpsrlq xmm10, xmm2, 61
+ vpsllq xmm11, xmm2, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm2, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ add r12, r8
+ xor rbx, r9
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ vpaddq xmm3, xmm8, xmm3
+ ; msg_sched done: 6-7
+ ; msg_sched: 8-9
+ ; rnd_0: 0 - 0
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ vpalignr xmm12, xmm5, xmm4, 8
+ vpalignr xmm13, xmm1, xmm0, 8
+ ; rnd_0: 1 - 1
+ add r15, QWORD PTR [rsp+64]
+ mov rdx, r13
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm4, xmm13, xmm4
+ ; rnd_0: 6 - 7
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ vpaddq xmm4, xmm8, xmm4
+ ; rnd_1: 0 - 0
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ vpsrlq xmm8, xmm3, 19
+ vpsllq xmm9, xmm3, 45
+ ; rnd_1: 1 - 1
+ add r14, QWORD PTR [rsp+72]
+ mov rbx, r12
+ xor rcx, rax
+ vpsrlq xmm10, xmm3, 61
+ vpsllq xmm11, xmm3, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm3, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ add r10, r14
+ xor rbx, r15
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ vpaddq xmm4, xmm8, xmm4
+ ; msg_sched done: 8-9
+ ; msg_sched: 10-11
+ ; rnd_0: 0 - 0
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ vpalignr xmm12, xmm6, xmm5, 8
+ vpalignr xmm13, xmm2, xmm1, 8
+ ; rnd_0: 1 - 1
+ add r13, QWORD PTR [rsp+80]
+ mov rdx, r11
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm5, xmm13, xmm5
+ ; rnd_0: 6 - 7
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ vpaddq xmm5, xmm8, xmm5
+ ; rnd_1: 0 - 0
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ vpsrlq xmm8, xmm4, 19
+ vpsllq xmm9, xmm4, 45
+ ; rnd_1: 1 - 1
+ add r12, QWORD PTR [rsp+88]
+ mov rbx, r10
+ xor rcx, rax
+ vpsrlq xmm10, xmm4, 61
+ vpsllq xmm11, xmm4, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm4, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ add r8, r12
+ xor rbx, r13
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ vpaddq xmm5, xmm8, xmm5
+ ; msg_sched done: 10-11
+ ; msg_sched: 12-13
+ ; rnd_0: 0 - 0
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ vpalignr xmm12, xmm7, xmm6, 8
+ vpalignr xmm13, xmm3, xmm2, 8
+ ; rnd_0: 1 - 1
+ add r11, QWORD PTR [rsp+96]
+ mov rdx, r9
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm6, xmm13, xmm6
+ ; rnd_0: 6 - 7
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ vpaddq xmm6, xmm8, xmm6
+ ; rnd_1: 0 - 0
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ vpsrlq xmm8, xmm5, 19
+ vpsllq xmm9, xmm5, 45
+ ; rnd_1: 1 - 1
+ add r10, QWORD PTR [rsp+104]
+ mov rbx, r8
+ xor rcx, rax
+ vpsrlq xmm10, xmm5, 61
+ vpsllq xmm11, xmm5, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm5, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ add r14, r10
+ xor rbx, r11
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ vpaddq xmm6, xmm8, xmm6
+ ; msg_sched done: 12-13
+ ; msg_sched: 14-15
+ ; rnd_0: 0 - 0
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ vpalignr xmm12, xmm0, xmm7, 8
+ vpalignr xmm13, xmm4, xmm3, 8
+ ; rnd_0: 1 - 1
+ add r9, QWORD PTR [rsp+112]
+ mov rdx, r15
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm7, xmm13, xmm7
+ ; rnd_0: 6 - 7
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ vpaddq xmm7, xmm8, xmm7
+ ; rnd_1: 0 - 0
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ vpsrlq xmm8, xmm6, 19
+ vpsllq xmm9, xmm6, 45
+ ; rnd_1: 1 - 1
+ add r8, QWORD PTR [rsp+120]
+ mov rbx, r14
+ xor rcx, rax
+ vpsrlq xmm10, xmm6, 61
+ vpsllq xmm11, xmm6, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm6, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ add r12, r8
+ xor rbx, r9
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ vpaddq xmm7, xmm8, xmm7
+ ; msg_sched done: 14-15
+ vpaddq xmm8, xmm0, [rsi]
+ vpaddq xmm9, xmm1, [rsi+16]
+ vmovdqu OWORD PTR [rsp], xmm8
+ vmovdqu OWORD PTR [rsp+16], xmm9
+ vpaddq xmm8, xmm2, [rsi+32]
+ vpaddq xmm9, xmm3, [rsi+48]
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vpaddq xmm8, xmm4, [rsi+64]
+ vpaddq xmm9, xmm5, [rsi+80]
+ vmovdqu OWORD PTR [rsp+64], xmm8
+ vmovdqu OWORD PTR [rsp+80], xmm9
+ vpaddq xmm8, xmm6, [rsi+96]
+ vpaddq xmm9, xmm7, [rsi+112]
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ sub DWORD PTR [rsp+128], 1
+ jne L_transform_sha512_avx1_rorx_start
+ ; rnd_all_2: 0-1
+ ; rnd_0: 0 - 7
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ add r15, QWORD PTR [rsp]
+ mov rdx, r13
+ xor rcx, rax
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ ; rnd_1: 0 - 7
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ add r14, QWORD PTR [rsp+8]
+ mov rbx, r12
+ xor rcx, rax
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ add r10, r14
+ xor rbx, r15
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ ; rnd_all_2: 2-3
+ ; rnd_0: 0 - 7
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ add r13, QWORD PTR [rsp+16]
+ mov rdx, r11
+ xor rcx, rax
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ ; rnd_1: 0 - 7
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ add r12, QWORD PTR [rsp+24]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ add r8, r12
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ ; rnd_all_2: 4-5
+ ; rnd_0: 0 - 7
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ add r11, QWORD PTR [rsp+32]
+ mov rdx, r9
+ xor rcx, rax
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ ; rnd_1: 0 - 7
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ add r10, QWORD PTR [rsp+40]
+ mov rbx, r8
+ xor rcx, rax
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ add r14, r10
+ xor rbx, r11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ ; rnd_all_2: 6-7
+ ; rnd_0: 0 - 7
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ add r9, QWORD PTR [rsp+48]
+ mov rdx, r15
+ xor rcx, rax
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ ; rnd_1: 0 - 7
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ add r8, QWORD PTR [rsp+56]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ add r12, r8
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ ; rnd_all_2: 8-9
+ ; rnd_0: 0 - 7
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ add r15, QWORD PTR [rsp+64]
+ mov rdx, r13
+ xor rcx, rax
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ ; rnd_1: 0 - 7
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ add r14, QWORD PTR [rsp+72]
+ mov rbx, r12
+ xor rcx, rax
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ add r10, r14
+ xor rbx, r15
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ ; rnd_all_2: 10-11
+ ; rnd_0: 0 - 7
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ add r13, QWORD PTR [rsp+80]
+ mov rdx, r11
+ xor rcx, rax
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ ; rnd_1: 0 - 7
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ add r12, QWORD PTR [rsp+88]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ add r8, r12
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ ; rnd_all_2: 12-13
+ ; rnd_0: 0 - 7
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ add r11, QWORD PTR [rsp+96]
+ mov rdx, r9
+ xor rcx, rax
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ ; rnd_1: 0 - 7
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ add r10, QWORD PTR [rsp+104]
+ mov rbx, r8
+ xor rcx, rax
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ add r14, r10
+ xor rbx, r11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ ; rnd_all_2: 14-15
+ ; rnd_0: 0 - 7
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ add r9, QWORD PTR [rsp+112]
+ mov rdx, r15
+ xor rcx, rax
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ ; rnd_1: 0 - 7
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ add r8, QWORD PTR [rsp+120]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ add r12, r8
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ add r8, rdx
+ add QWORD PTR [rdi], r8
+ add QWORD PTR [rdi+8], r9
+ add QWORD PTR [rdi+16], r10
+ add QWORD PTR [rdi+24], r11
+ add QWORD PTR [rdi+32], r12
+ add QWORD PTR [rdi+40], r13
+ add QWORD PTR [rdi+48], r14
+ add QWORD PTR [rdi+56], r15
+ xor rax, rax
+ vmovdqu xmm6, OWORD PTR [rsp+136]
+ vmovdqu xmm7, OWORD PTR [rsp+152]
+ vmovdqu xmm8, OWORD PTR [rsp+168]
+ vmovdqu xmm9, OWORD PTR [rsp+184]
+ vmovdqu xmm10, OWORD PTR [rsp+200]
+ vmovdqu xmm11, OWORD PTR [rsp+216]
+ vmovdqu xmm13, OWORD PTR [rsp+232]
+ vmovdqu xmm12, OWORD PTR [rsp+248]
+ vmovdqu xmm14, OWORD PTR [rsp+264]
+ add rsp, 280
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha512_AVX1_RORX ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha512_AVX1_RORX_Len PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ mov rdi, rcx
+ mov rbp, rdx
+ sub rsp, 288
+ vmovdqu OWORD PTR [rsp+144], xmm6
+ vmovdqu OWORD PTR [rsp+160], xmm7
+ vmovdqu OWORD PTR [rsp+176], xmm8
+ vmovdqu OWORD PTR [rsp+192], xmm9
+ vmovdqu OWORD PTR [rsp+208], xmm10
+ vmovdqu OWORD PTR [rsp+224], xmm11
+ vmovdqu OWORD PTR [rsp+240], xmm13
+ vmovdqu OWORD PTR [rsp+256], xmm12
+ vmovdqu OWORD PTR [rsp+272], xmm14
+ mov rsi, QWORD PTR [rdi+224]
+ mov rcx, QWORD PTR [ptr_L_avx1_rorx_sha512_k]
+ vmovdqa xmm14, OWORD PTR L_avx1_rorx_sha512_flip_mask
+ mov r8, QWORD PTR [rdi]
+ mov r9, QWORD PTR [rdi+8]
+ mov r10, QWORD PTR [rdi+16]
+ mov r11, QWORD PTR [rdi+24]
+ mov r12, QWORD PTR [rdi+32]
+ mov r13, QWORD PTR [rdi+40]
+ mov r14, QWORD PTR [rdi+48]
+ mov r15, QWORD PTR [rdi+56]
+ ; Start of loop processing a block
+L_sha512_len_avx1_rorx_begin:
+ vmovdqu xmm0, OWORD PTR [rsi]
+ vmovdqu xmm1, OWORD PTR [rsi+16]
+ vpshufb xmm0, xmm0, xmm14
+ vpshufb xmm1, xmm1, xmm14
+ vmovdqu xmm2, OWORD PTR [rsi+32]
+ vmovdqu xmm3, OWORD PTR [rsi+48]
+ vpshufb xmm2, xmm2, xmm14
+ vpshufb xmm3, xmm3, xmm14
+ vmovdqu xmm4, OWORD PTR [rsi+64]
+ vmovdqu xmm5, OWORD PTR [rsi+80]
+ vpshufb xmm4, xmm4, xmm14
+ vpshufb xmm5, xmm5, xmm14
+ vmovdqu xmm6, OWORD PTR [rsi+96]
+ vmovdqu xmm7, OWORD PTR [rsi+112]
+ vpshufb xmm6, xmm6, xmm14
+ vpshufb xmm7, xmm7, xmm14
+ mov DWORD PTR [rsp+128], 4
+ mov rbx, r9
+ xor rdx, rdx
+ xor rbx, r10
+ vpaddq xmm8, xmm0, [rcx]
+ vpaddq xmm9, xmm1, [rcx+16]
+ vmovdqu OWORD PTR [rsp], xmm8
+ vmovdqu OWORD PTR [rsp+16], xmm9
+ vpaddq xmm8, xmm2, [rcx+32]
+ vpaddq xmm9, xmm3, [rcx+48]
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vpaddq xmm8, xmm4, [rcx+64]
+ vpaddq xmm9, xmm5, [rcx+80]
+ vmovdqu OWORD PTR [rsp+64], xmm8
+ vmovdqu OWORD PTR [rsp+80], xmm9
+ vpaddq xmm8, xmm6, [rcx+96]
+ vpaddq xmm9, xmm7, [rcx+112]
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ ; Start of 16 rounds
+L_sha512_len_avx1_rorx_start:
+ add rcx, 128
+ mov QWORD PTR [rsp+136], rcx
+ ; msg_sched: 0-1
+ ; rnd_0: 0 - 0
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ vpalignr xmm12, xmm1, xmm0, 8
+ vpalignr xmm13, xmm5, xmm4, 8
+ ; rnd_0: 1 - 1
+ add r15, QWORD PTR [rsp]
+ mov rdx, r13
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm0, xmm13, xmm0
+ ; rnd_0: 6 - 7
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ vpaddq xmm0, xmm8, xmm0
+ ; rnd_1: 0 - 0
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ vpsrlq xmm8, xmm7, 19
+ vpsllq xmm9, xmm7, 45
+ ; rnd_1: 1 - 1
+ add r14, QWORD PTR [rsp+8]
+ mov rbx, r12
+ xor rcx, rax
+ vpsrlq xmm10, xmm7, 61
+ vpsllq xmm11, xmm7, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm7, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ add r10, r14
+ xor rbx, r15
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ vpaddq xmm0, xmm8, xmm0
+ ; msg_sched done: 0-1
+ ; msg_sched: 2-3
+ ; rnd_0: 0 - 0
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ vpalignr xmm12, xmm2, xmm1, 8
+ vpalignr xmm13, xmm6, xmm5, 8
+ ; rnd_0: 1 - 1
+ add r13, QWORD PTR [rsp+16]
+ mov rdx, r11
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm1, xmm13, xmm1
+ ; rnd_0: 6 - 7
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ vpaddq xmm1, xmm8, xmm1
+ ; rnd_1: 0 - 0
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ vpsrlq xmm8, xmm0, 19
+ vpsllq xmm9, xmm0, 45
+ ; rnd_1: 1 - 1
+ add r12, QWORD PTR [rsp+24]
+ mov rbx, r10
+ xor rcx, rax
+ vpsrlq xmm10, xmm0, 61
+ vpsllq xmm11, xmm0, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm0, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ add r8, r12
+ xor rbx, r13
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ vpaddq xmm1, xmm8, xmm1
+ ; msg_sched done: 2-3
+ ; msg_sched: 4-5
+ ; rnd_0: 0 - 0
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ vpalignr xmm12, xmm3, xmm2, 8
+ vpalignr xmm13, xmm7, xmm6, 8
+ ; rnd_0: 1 - 1
+ add r11, QWORD PTR [rsp+32]
+ mov rdx, r9
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm2, xmm13, xmm2
+ ; rnd_0: 6 - 7
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ vpaddq xmm2, xmm8, xmm2
+ ; rnd_1: 0 - 0
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ vpsrlq xmm8, xmm1, 19
+ vpsllq xmm9, xmm1, 45
+ ; rnd_1: 1 - 1
+ add r10, QWORD PTR [rsp+40]
+ mov rbx, r8
+ xor rcx, rax
+ vpsrlq xmm10, xmm1, 61
+ vpsllq xmm11, xmm1, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm1, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ add r14, r10
+ xor rbx, r11
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ vpaddq xmm2, xmm8, xmm2
+ ; msg_sched done: 4-5
+ ; msg_sched: 6-7
+ ; rnd_0: 0 - 0
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ vpalignr xmm12, xmm4, xmm3, 8
+ vpalignr xmm13, xmm0, xmm7, 8
+ ; rnd_0: 1 - 1
+ add r9, QWORD PTR [rsp+48]
+ mov rdx, r15
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm3, xmm13, xmm3
+ ; rnd_0: 6 - 7
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ vpaddq xmm3, xmm8, xmm3
+ ; rnd_1: 0 - 0
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ vpsrlq xmm8, xmm2, 19
+ vpsllq xmm9, xmm2, 45
+ ; rnd_1: 1 - 1
+ add r8, QWORD PTR [rsp+56]
+ mov rbx, r14
+ xor rcx, rax
+ vpsrlq xmm10, xmm2, 61
+ vpsllq xmm11, xmm2, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm2, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ add r12, r8
+ xor rbx, r9
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ vpaddq xmm3, xmm8, xmm3
+ ; msg_sched done: 6-7
+ ; msg_sched: 8-9
+ ; rnd_0: 0 - 0
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ vpalignr xmm12, xmm5, xmm4, 8
+ vpalignr xmm13, xmm1, xmm0, 8
+ ; rnd_0: 1 - 1
+ add r15, QWORD PTR [rsp+64]
+ mov rdx, r13
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm4, xmm13, xmm4
+ ; rnd_0: 6 - 7
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ vpaddq xmm4, xmm8, xmm4
+ ; rnd_1: 0 - 0
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ vpsrlq xmm8, xmm3, 19
+ vpsllq xmm9, xmm3, 45
+ ; rnd_1: 1 - 1
+ add r14, QWORD PTR [rsp+72]
+ mov rbx, r12
+ xor rcx, rax
+ vpsrlq xmm10, xmm3, 61
+ vpsllq xmm11, xmm3, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm3, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ add r10, r14
+ xor rbx, r15
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ vpaddq xmm4, xmm8, xmm4
+ ; msg_sched done: 8-9
+ ; msg_sched: 10-11
+ ; rnd_0: 0 - 0
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ vpalignr xmm12, xmm6, xmm5, 8
+ vpalignr xmm13, xmm2, xmm1, 8
+ ; rnd_0: 1 - 1
+ add r13, QWORD PTR [rsp+80]
+ mov rdx, r11
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm5, xmm13, xmm5
+ ; rnd_0: 6 - 7
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ vpaddq xmm5, xmm8, xmm5
+ ; rnd_1: 0 - 0
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ vpsrlq xmm8, xmm4, 19
+ vpsllq xmm9, xmm4, 45
+ ; rnd_1: 1 - 1
+ add r12, QWORD PTR [rsp+88]
+ mov rbx, r10
+ xor rcx, rax
+ vpsrlq xmm10, xmm4, 61
+ vpsllq xmm11, xmm4, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm4, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ add r8, r12
+ xor rbx, r13
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ vpaddq xmm5, xmm8, xmm5
+ ; msg_sched done: 10-11
+ ; msg_sched: 12-13
+ ; rnd_0: 0 - 0
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ vpalignr xmm12, xmm7, xmm6, 8
+ vpalignr xmm13, xmm3, xmm2, 8
+ ; rnd_0: 1 - 1
+ add r11, QWORD PTR [rsp+96]
+ mov rdx, r9
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm6, xmm13, xmm6
+ ; rnd_0: 6 - 7
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ vpaddq xmm6, xmm8, xmm6
+ ; rnd_1: 0 - 0
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ vpsrlq xmm8, xmm5, 19
+ vpsllq xmm9, xmm5, 45
+ ; rnd_1: 1 - 1
+ add r10, QWORD PTR [rsp+104]
+ mov rbx, r8
+ xor rcx, rax
+ vpsrlq xmm10, xmm5, 61
+ vpsllq xmm11, xmm5, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm5, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ add r14, r10
+ xor rbx, r11
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ vpaddq xmm6, xmm8, xmm6
+ ; msg_sched done: 12-13
+ ; msg_sched: 14-15
+ ; rnd_0: 0 - 0
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ vpalignr xmm12, xmm0, xmm7, 8
+ vpalignr xmm13, xmm4, xmm3, 8
+ ; rnd_0: 1 - 1
+ add r9, QWORD PTR [rsp+112]
+ mov rdx, r15
+ xor rcx, rax
+ vpsrlq xmm8, xmm12, 1
+ vpsllq xmm9, xmm12, 63
+ ; rnd_0: 2 - 2
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ vpsrlq xmm10, xmm12, 8
+ vpsllq xmm11, xmm12, 56
+ ; rnd_0: 3 - 3
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_0: 4 - 4
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ vpsrlq xmm11, xmm12, 7
+ vpxor xmm8, xmm8, xmm10
+ ; rnd_0: 5 - 5
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ vpxor xmm8, xmm8, xmm11
+ vpaddq xmm7, xmm13, xmm7
+ ; rnd_0: 6 - 7
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ vpaddq xmm7, xmm8, xmm7
+ ; rnd_1: 0 - 0
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ vpsrlq xmm8, xmm6, 19
+ vpsllq xmm9, xmm6, 45
+ ; rnd_1: 1 - 1
+ add r8, QWORD PTR [rsp+120]
+ mov rbx, r14
+ xor rcx, rax
+ vpsrlq xmm10, xmm6, 61
+ vpsllq xmm11, xmm6, 3
+ ; rnd_1: 2 - 2
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ vpor xmm8, xmm8, xmm9
+ vpor xmm10, xmm10, xmm11
+ ; rnd_1: 3 - 4
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ vpxor xmm8, xmm8, xmm10
+ vpsrlq xmm11, xmm6, 6
+ ; rnd_1: 5 - 6
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ add r12, r8
+ xor rbx, r9
+ vpxor xmm8, xmm8, xmm11
+ ; rnd_1: 7 - 7
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ vpaddq xmm7, xmm8, xmm7
+ ; msg_sched done: 14-15
+ mov rcx, QWORD PTR [rsp+136]
+ vpaddq xmm8, xmm0, [rcx]
+ vpaddq xmm9, xmm1, [rcx+16]
+ vmovdqu OWORD PTR [rsp], xmm8
+ vmovdqu OWORD PTR [rsp+16], xmm9
+ vpaddq xmm8, xmm2, [rcx+32]
+ vpaddq xmm9, xmm3, [rcx+48]
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vpaddq xmm8, xmm4, [rcx+64]
+ vpaddq xmm9, xmm5, [rcx+80]
+ vmovdqu OWORD PTR [rsp+64], xmm8
+ vmovdqu OWORD PTR [rsp+80], xmm9
+ vpaddq xmm8, xmm6, [rcx+96]
+ vpaddq xmm9, xmm7, [rcx+112]
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ sub DWORD PTR [rsp+128], 1
+ jne L_sha512_len_avx1_rorx_start
+ vpaddq xmm8, xmm0, [rcx]
+ vpaddq xmm9, xmm1, [rcx+16]
+ vmovdqu OWORD PTR [rsp], xmm8
+ vmovdqu OWORD PTR [rsp+16], xmm9
+ vpaddq xmm8, xmm2, [rcx+32]
+ vpaddq xmm9, xmm3, [rcx+48]
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vpaddq xmm8, xmm4, [rcx+64]
+ vpaddq xmm9, xmm5, [rcx+80]
+ vmovdqu OWORD PTR [rsp+64], xmm8
+ vmovdqu OWORD PTR [rsp+80], xmm9
+ vpaddq xmm8, xmm6, [rcx+96]
+ vpaddq xmm9, xmm7, [rcx+112]
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ ; rnd_all_2: 0-1
+ ; rnd_0: 0 - 7
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ add r15, QWORD PTR [rsp]
+ mov rdx, r13
+ xor rcx, rax
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ ; rnd_1: 0 - 7
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ add r14, QWORD PTR [rsp+8]
+ mov rbx, r12
+ xor rcx, rax
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ add r10, r14
+ xor rbx, r15
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ ; rnd_all_2: 2-3
+ ; rnd_0: 0 - 7
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ add r13, QWORD PTR [rsp+16]
+ mov rdx, r11
+ xor rcx, rax
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ ; rnd_1: 0 - 7
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ add r12, QWORD PTR [rsp+24]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ add r8, r12
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ ; rnd_all_2: 4-5
+ ; rnd_0: 0 - 7
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ add r11, QWORD PTR [rsp+32]
+ mov rdx, r9
+ xor rcx, rax
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ ; rnd_1: 0 - 7
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ add r10, QWORD PTR [rsp+40]
+ mov rbx, r8
+ xor rcx, rax
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ add r14, r10
+ xor rbx, r11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ ; rnd_all_2: 6-7
+ ; rnd_0: 0 - 7
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ add r9, QWORD PTR [rsp+48]
+ mov rdx, r15
+ xor rcx, rax
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ ; rnd_1: 0 - 7
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ add r8, QWORD PTR [rsp+56]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ add r12, r8
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ ; rnd_all_2: 8-9
+ ; rnd_0: 0 - 7
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ add r15, QWORD PTR [rsp+64]
+ mov rdx, r13
+ xor rcx, rax
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ ; rnd_1: 0 - 7
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ add r14, QWORD PTR [rsp+72]
+ mov rbx, r12
+ xor rcx, rax
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ add r10, r14
+ xor rbx, r15
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ ; rnd_all_2: 10-11
+ ; rnd_0: 0 - 7
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ add r13, QWORD PTR [rsp+80]
+ mov rdx, r11
+ xor rcx, rax
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ ; rnd_1: 0 - 7
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ add r12, QWORD PTR [rsp+88]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ add r8, r12
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ ; rnd_all_2: 12-13
+ ; rnd_0: 0 - 7
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ add r11, QWORD PTR [rsp+96]
+ mov rdx, r9
+ xor rcx, rax
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ ; rnd_1: 0 - 7
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ add r10, QWORD PTR [rsp+104]
+ mov rbx, r8
+ xor rcx, rax
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ add r14, r10
+ xor rbx, r11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ ; rnd_all_2: 14-15
+ ; rnd_0: 0 - 7
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ add r9, QWORD PTR [rsp+112]
+ mov rdx, r15
+ xor rcx, rax
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ ; rnd_1: 0 - 7
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ add r8, QWORD PTR [rsp+120]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ add r12, r8
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ add r8, rdx
+ add r8, QWORD PTR [rdi]
+ add r9, QWORD PTR [rdi+8]
+ add r10, QWORD PTR [rdi+16]
+ add r11, QWORD PTR [rdi+24]
+ add r12, QWORD PTR [rdi+32]
+ add r13, QWORD PTR [rdi+40]
+ add r14, QWORD PTR [rdi+48]
+ add r15, QWORD PTR [rdi+56]
+ mov rcx, QWORD PTR [ptr_L_avx1_rorx_sha512_k]
+ add rsi, 128
+ sub ebp, 128
+ mov QWORD PTR [rdi], r8
+ mov QWORD PTR [rdi+8], r9
+ mov QWORD PTR [rdi+16], r10
+ mov QWORD PTR [rdi+24], r11
+ mov QWORD PTR [rdi+32], r12
+ mov QWORD PTR [rdi+40], r13
+ mov QWORD PTR [rdi+48], r14
+ mov QWORD PTR [rdi+56], r15
+ jnz L_sha512_len_avx1_rorx_begin
+ xor rax, rax
+ vmovdqu xmm6, OWORD PTR [rsp+144]
+ vmovdqu xmm7, OWORD PTR [rsp+160]
+ vmovdqu xmm8, OWORD PTR [rsp+176]
+ vmovdqu xmm9, OWORD PTR [rsp+192]
+ vmovdqu xmm10, OWORD PTR [rsp+208]
+ vmovdqu xmm11, OWORD PTR [rsp+224]
+ vmovdqu xmm13, OWORD PTR [rsp+240]
+ vmovdqu xmm12, OWORD PTR [rsp+256]
+ vmovdqu xmm14, OWORD PTR [rsp+272]
+ add rsp, 288
+ pop rbp
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha512_AVX1_RORX_Len ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX2
+_DATA SEGMENT
+ALIGN 16
+L_avx2_sha512_k QWORD 428a2f98d728ae22h, 7137449123ef65cdh ; SHA-512 round constants K[0..79]: 80 QWORDs (640 bytes), two per line
+ QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch
+ QWORD 3956c25bf348b538h, 59f111f1b605d019h
+ QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h
+ QWORD 0d807aa98a3030242h, 12835b0145706fbeh
+ QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h
+ QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h
+ QWORD 9bdc06a725c71235h, 0c19bf174cf692694h
+ QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h
+ QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h
+ QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h
+ QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h
+ QWORD 983e5152ee66dfabh, 0a831c66d2db43210h
+ QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h
+ QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h
+ QWORD 06ca6351e003826fh, 142929670a0e6e70h
+ QWORD 27b70a8546d22ffch, 2e1b21385c26c926h
+ QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh
+ QWORD 650a73548baf63deh, 766a0abb3c77b2a8h
+ QWORD 81c2c92e47edaee6h, 92722c851482353bh
+ QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h
+ QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h
+ QWORD 0d192e819d6ef5218h, 0d69906245565a910h
+ QWORD 0f40e35855771202ah, 106aa07032bbd1b8h
+ QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h
+ QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h
+ QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh
+ QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h
+ QWORD 748f82ee5defb2fch, 78a5636f43172f60h
+ QWORD 84c87814a1f0ab72h, 8cc702081a6439ech
+ QWORD 90befffa23631e28h, 0a4506cebde82bde9h
+ QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh
+ QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h
+ QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h
+ QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h
+ QWORD 113f9804bef90daeh, 1b710b35131c471bh
+ QWORD 28db77f523047d84h, 32caab7b40c72493h
+ QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch
+ QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah
+ QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h
+ptr_L_avx2_sha512_k QWORD L_avx2_sha512_k ; holds the table's address; Transform_Sha512_AVX2 loads it into rsi and reads 32 bytes per vpaddq
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_sha512_k_2 QWORD 428a2f98d728ae22h, 7137449123ef65cdh ; same 80 constants as L_avx2_sha512_k, each 16-byte pair stored twice in a row
+ QWORD 428a2f98d728ae22h, 7137449123ef65cdh ; duplicate of the previous line: a 32-byte load sees the pair in both 128-bit halves
+ QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch
+ QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch
+ QWORD 3956c25bf348b538h, 59f111f1b605d019h
+ QWORD 3956c25bf348b538h, 59f111f1b605d019h
+ QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h
+ QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h
+ QWORD 0d807aa98a3030242h, 12835b0145706fbeh
+ QWORD 0d807aa98a3030242h, 12835b0145706fbeh
+ QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h
+ QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h
+ QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h
+ QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h
+ QWORD 9bdc06a725c71235h, 0c19bf174cf692694h
+ QWORD 9bdc06a725c71235h, 0c19bf174cf692694h
+ QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h
+ QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h
+ QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h
+ QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h
+ QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h
+ QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h
+ QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h
+ QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h
+ QWORD 983e5152ee66dfabh, 0a831c66d2db43210h
+ QWORD 983e5152ee66dfabh, 0a831c66d2db43210h
+ QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h
+ QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h
+ QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h
+ QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h
+ QWORD 06ca6351e003826fh, 142929670a0e6e70h
+ QWORD 06ca6351e003826fh, 142929670a0e6e70h
+ QWORD 27b70a8546d22ffch, 2e1b21385c26c926h
+ QWORD 27b70a8546d22ffch, 2e1b21385c26c926h
+ QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh
+ QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh
+ QWORD 650a73548baf63deh, 766a0abb3c77b2a8h
+ QWORD 650a73548baf63deh, 766a0abb3c77b2a8h
+ QWORD 81c2c92e47edaee6h, 92722c851482353bh
+ QWORD 81c2c92e47edaee6h, 92722c851482353bh
+ QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h
+ QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h
+ QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h
+ QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h
+ QWORD 0d192e819d6ef5218h, 0d69906245565a910h
+ QWORD 0d192e819d6ef5218h, 0d69906245565a910h
+ QWORD 0f40e35855771202ah, 106aa07032bbd1b8h
+ QWORD 0f40e35855771202ah, 106aa07032bbd1b8h
+ QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h
+ QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h
+ QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h
+ QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h
+ QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh
+ QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh
+ QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h
+ QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h
+ QWORD 748f82ee5defb2fch, 78a5636f43172f60h
+ QWORD 748f82ee5defb2fch, 78a5636f43172f60h
+ QWORD 84c87814a1f0ab72h, 8cc702081a6439ech
+ QWORD 84c87814a1f0ab72h, 8cc702081a6439ech
+ QWORD 90befffa23631e28h, 0a4506cebde82bde9h
+ QWORD 90befffa23631e28h, 0a4506cebde82bde9h
+ QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh
+ QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh
+ QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h
+ QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h
+ QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h
+ QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h
+ QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h
+ QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h
+ QWORD 113f9804bef90daeh, 1b710b35131c471bh
+ QWORD 113f9804bef90daeh, 1b710b35131c471bh
+ QWORD 28db77f523047d84h, 32caab7b40c72493h
+ QWORD 28db77f523047d84h, 32caab7b40c72493h
+ QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch
+ QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch
+ QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah
+ QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah
+ QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h
+ QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h
+ptr_L_avx2_sha512_k_2 QWORD L_avx2_sha512_k_2 ; holds the table's address; Transform_Sha512_AVX2_Len loads it into rsi for its two-block loop
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 8
+L_avx2_sha512_k_2_end QWORD 1024+L_avx2_sha512_k_2 ; address 1024 bytes into L_avx2_sha512_k_2, i.e. just past the duplicated entries for the first 64 of its 80 constants
+ptr_L_avx2_sha512_k_2_end QWORD L_avx2_sha512_k_2_end ; address of the QWORD above; presumably a loop-end marker for the _Len routine - its reader is not visible here, confirm
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_sha512_flip_mask QWORD 0001020304050607h, 08090a0b0c0d0e0fh ; vpshufb control: reverses the byte order inside each 64-bit element
+ QWORD 0001020304050607h, 08090a0b0c0d0e0fh ; same pattern again for the upper 128 bits of a ymm register
+ptr_L_avx2_sha512_flip_mask QWORD L_avx2_sha512_flip_mask ; holds the mask's address
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha512_AVX2 PROC ; Runs 80 rounds (4 x 16 with message schedule + 16 without) over the 128-byte block at [rcx+64] and adds the result into the eight 64-bit state words at [rcx]; returns 0 in rax
+ push rbx ; save the general-purpose registers this routine overwrites (popped in reverse order before ret)
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ mov rdi, rcx ; rdi = first argument (rcx under the Microsoft x64 convention)
+ sub rsp, 296 ; scratch: [rsp..rsp+127] message words + constants, [rsp+128] pass counter, [rsp+136..rsp+295] saved xmm6-xmm15
+ vmovdqu OWORD PTR [rsp+136], xmm6 ; spill xmm6-xmm15; reloaded from the same slots before return
+ vmovdqu OWORD PTR [rsp+152], xmm7
+ vmovdqu OWORD PTR [rsp+168], xmm8
+ vmovdqu OWORD PTR [rsp+184], xmm9
+ vmovdqu OWORD PTR [rsp+200], xmm10
+ vmovdqu OWORD PTR [rsp+216], xmm11
+ vmovdqu OWORD PTR [rsp+232], xmm14
+ vmovdqu OWORD PTR [rsp+248], xmm13
+ vmovdqu OWORD PTR [rsp+264], xmm12
+ vmovdqu OWORD PTR [rsp+280], xmm15
+ lea rax, QWORD PTR [rdi+64] ; rax = address of the 128-byte input block
+ vmovdqu ymm15, YMMWORD PTR L_avx2_sha512_flip_mask ; ymm15 = byte-reversal mask for vpshufb
+ mov r8, QWORD PTR [rdi] ; r8..r15 = the eight 64-bit state words (working variables)
+ mov r9, QWORD PTR [rdi+8]
+ mov r10, QWORD PTR [rdi+16]
+ mov r11, QWORD PTR [rdi+24]
+ mov r12, QWORD PTR [rdi+32]
+ mov r13, QWORD PTR [rdi+40]
+ mov r14, QWORD PTR [rdi+48]
+ mov r15, QWORD PTR [rdi+56]
+ vmovdqu ymm0, YMMWORD PTR [rax] ; ymm0..ymm3 = the 16 message words, byte-swapped below
+ vmovdqu ymm1, YMMWORD PTR [rax+32]
+ vpshufb ymm0, ymm0, ymm15
+ vpshufb ymm1, ymm1, ymm15
+ vmovdqu ymm2, YMMWORD PTR [rax+64]
+ vmovdqu ymm3, YMMWORD PTR [rax+96]
+ vpshufb ymm2, ymm2, ymm15
+ vpshufb ymm3, ymm3, ymm15
+ mov DWORD PTR [rsp+128], 4 ; pass counter: 4 passes of 16 rounds that also compute the next 16 message words
+ mov rsi, QWORD PTR [ptr_L_avx2_sha512_k] ; rsi = address of the round-constant table
+ mov rbx, r9
+ mov rax, r12
+ xor rbx, r10
+ vpaddq ymm8, ymm0, [rsi] ; message words + constants for rounds 0-15, stored at [rsp..rsp+127]
+ vpaddq ymm9, ymm1, [rsi+32]
+ vmovdqu YMMWORD PTR [rsp], ymm8
+ vmovdqu YMMWORD PTR [rsp+32], ymm9
+ vpaddq ymm8, ymm2, [rsi+64]
+ vpaddq ymm9, ymm3, [rsi+96]
+ vmovdqu YMMWORD PTR [rsp+64], ymm8
+ vmovdqu YMMWORD PTR [rsp+96], ymm9
+ ; Start of 16 rounds
+L_sha256_avx2_start: ; NOTE(review): label says sha256 although this is the SHA-512 routine
+ add rsi, 128 ; advance to the next 16 constants
+ ror rax, 23 ; rax = r12: ror 23, 4, 14 interleaved with xor r12 yields ror14 ^ ror18 ^ ror41 of r12
+ vpblendd ymm12, ymm0, ymm1, 3
+ vpblendd ymm13, ymm2, ymm3, 3
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ vpermq ymm12, ymm12, 57
+ ror rax, 4
+ xor rcx, r14
+ vpermq ymm13, ymm13, 57
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ vpsrlq ymm8, ymm12, 1 ; srl1|sll63 and srl8|sll56 form ror1 and ror8; xored together with srl7 below
+ add r15, rax
+ mov rcx, r8
+ vpsllq ymm9, ymm12, 63
+ and rbx, rdx
+ ror rcx, 5 ; rcx = r8: ror 5, 6, 28 interleaved with xor r8 yields ror28 ^ ror34 ^ ror39 of r8
+ vpsrlq ymm10, ymm12, 8
+ xor rcx, r8
+ xor rbx, r9
+ vpsllq ymm11, ymm12, 56
+ ror rcx, 6
+ add r11, r15
+ vpor ymm8, ymm8, ymm9
+ xor rcx, r8
+ add r15, rbx
+ vpor ymm10, ymm10, ymm11
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ror rax, 23
+ vpsrlq ymm11, ymm12, 7
+ mov rbx, r15
+ mov rcx, r12
+ vpxor ymm8, ymm8, ymm10
+ add r14, QWORD PTR [rsp+8]
+ xor rcx, r13
+ vpxor ymm8, ymm8, ymm11
+ xor rax, r11
+ and rcx, r11
+ vpaddq ymm0, ymm13, ymm0
+ ror rax, 4
+ xor rcx, r13
+ vpaddq ymm0, ymm8, ymm0
+ xor rax, r11
+ add r14, rcx
+ vperm2I128 ymm14, ymm3, ymm3, 129
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ vpsrlq ymm8, ymm14, 19 ; srl19|sll45 and srl61|sll3 form ror19 and ror61; xored together with srl6 below
+ xor rcx, r15
+ xor rdx, r8
+ vpsllq ymm9, ymm14, 45
+ ror rcx, 6
+ add r10, r14
+ vpsrlq ymm10, ymm14, 61
+ xor rcx, r15
+ add r14, rdx
+ vpsllq ymm11, ymm14, 3
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ror rax, 23
+ vpor ymm8, ymm8, ymm9
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+16]
+ xor rcx, r12
+ vpor ymm10, ymm10, ymm11
+ xor rax, r10
+ and rcx, r10
+ vpxor ymm8, ymm8, ymm10
+ ror rax, 4
+ xor rcx, r12
+ vpsrlq ymm11, ymm14, 6
+ xor rax, r10
+ add r13, rcx
+ vpxor ymm8, ymm8, ymm11
+ ror rax, 14
+ xor rdx, r15
+ vpaddq ymm0, ymm8, ymm0
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ vperm2I128 ymm14, ymm0, ymm0, 8
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ vpsrlq ymm8, ymm14, 19
+ xor rcx, r14
+ add r13, rbx
+ vpsllq ymm9, ymm14, 45
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ror rax, 23
+ vpsrlq ymm10, ymm14, 61
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+24]
+ xor rcx, r11
+ vpsllq ymm11, ymm14, 3
+ xor rax, r9
+ and rcx, r9
+ vpor ymm8, ymm8, ymm9
+ ror rax, 4
+ xor rcx, r11
+ vpor ymm10, ymm10, ymm11
+ xor rax, r9
+ add r12, rcx
+ vpxor ymm8, ymm8, ymm10
+ ror rax, 14
+ xor rbx, r14
+ vpsrlq ymm11, ymm14, 6
+ add r12, rax
+ mov rcx, r13
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ vpaddq ymm0, ymm8, ymm0
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ror rax, 23
+ vpblendd ymm12, ymm1, ymm2, 3
+ vpblendd ymm13, ymm3, ymm0, 3
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+32]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ vpermq ymm12, ymm12, 57
+ ror rax, 4
+ xor rcx, r10
+ vpermq ymm13, ymm13, 57
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ vpsrlq ymm8, ymm12, 1
+ add r11, rax
+ mov rcx, r12
+ vpsllq ymm9, ymm12, 63
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq ymm10, ymm12, 8
+ xor rcx, r12
+ xor rbx, r13
+ vpsllq ymm11, ymm12, 56
+ ror rcx, 6
+ add r15, r11
+ vpor ymm8, ymm8, ymm9
+ xor rcx, r12
+ add r11, rbx
+ vpor ymm10, ymm10, ymm11
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ror rax, 23
+ vpsrlq ymm11, ymm12, 7
+ mov rbx, r11
+ mov rcx, r8
+ vpxor ymm8, ymm8, ymm10
+ add r10, QWORD PTR [rsp+40]
+ xor rcx, r9
+ vpxor ymm8, ymm8, ymm11
+ xor rax, r15
+ and rcx, r15
+ vpaddq ymm1, ymm13, ymm1
+ ror rax, 4
+ xor rcx, r9
+ vpaddq ymm1, ymm8, ymm1
+ xor rax, r15
+ add r10, rcx
+ vperm2I128 ymm14, ymm0, ymm0, 129
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ vpsrlq ymm8, ymm14, 19
+ xor rcx, r11
+ xor rdx, r12
+ vpsllq ymm9, ymm14, 45
+ ror rcx, 6
+ add r14, r10
+ vpsrlq ymm10, ymm14, 61
+ xor rcx, r11
+ add r10, rdx
+ vpsllq ymm11, ymm14, 3
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ror rax, 23
+ vpor ymm8, ymm8, ymm9
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+48]
+ xor rcx, r8
+ vpor ymm10, ymm10, ymm11
+ xor rax, r14
+ and rcx, r14
+ vpxor ymm8, ymm8, ymm10
+ ror rax, 4
+ xor rcx, r8
+ vpsrlq ymm11, ymm14, 6
+ xor rax, r14
+ add r9, rcx
+ vpxor ymm8, ymm8, ymm11
+ ror rax, 14
+ xor rdx, r11
+ vpaddq ymm1, ymm8, ymm1
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ vperm2I128 ymm14, ymm1, ymm1, 8
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ vpsrlq ymm8, ymm14, 19
+ xor rcx, r10
+ add r9, rbx
+ vpsllq ymm9, ymm14, 45
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ror rax, 23
+ vpsrlq ymm10, ymm14, 61
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+56]
+ xor rcx, r15
+ vpsllq ymm11, ymm14, 3
+ xor rax, r13
+ and rcx, r13
+ vpor ymm8, ymm8, ymm9
+ ror rax, 4
+ xor rcx, r15
+ vpor ymm10, ymm10, ymm11
+ xor rax, r13
+ add r8, rcx
+ vpxor ymm8, ymm8, ymm10
+ ror rax, 14
+ xor rbx, r10
+ vpsrlq ymm11, ymm14, 6
+ add r8, rax
+ mov rcx, r9
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ vpaddq ymm1, ymm8, ymm1
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ ror rax, 23
+ vpblendd ymm12, ymm2, ymm3, 3
+ vpblendd ymm13, ymm0, ymm1, 3
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp+64]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ vpermq ymm12, ymm12, 57
+ ror rax, 4
+ xor rcx, r14
+ vpermq ymm13, ymm13, 57
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ vpsrlq ymm8, ymm12, 1
+ add r15, rax
+ mov rcx, r8
+ vpsllq ymm9, ymm12, 63
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq ymm10, ymm12, 8
+ xor rcx, r8
+ xor rbx, r9
+ vpsllq ymm11, ymm12, 56
+ ror rcx, 6
+ add r11, r15
+ vpor ymm8, ymm8, ymm9
+ xor rcx, r8
+ add r15, rbx
+ vpor ymm10, ymm10, ymm11
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ror rax, 23
+ vpsrlq ymm11, ymm12, 7
+ mov rbx, r15
+ mov rcx, r12
+ vpxor ymm8, ymm8, ymm10
+ add r14, QWORD PTR [rsp+72]
+ xor rcx, r13
+ vpxor ymm8, ymm8, ymm11
+ xor rax, r11
+ and rcx, r11
+ vpaddq ymm2, ymm13, ymm2
+ ror rax, 4
+ xor rcx, r13
+ vpaddq ymm2, ymm8, ymm2
+ xor rax, r11
+ add r14, rcx
+ vperm2I128 ymm14, ymm1, ymm1, 129
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ vpsrlq ymm8, ymm14, 19
+ xor rcx, r15
+ xor rdx, r8
+ vpsllq ymm9, ymm14, 45
+ ror rcx, 6
+ add r10, r14
+ vpsrlq ymm10, ymm14, 61
+ xor rcx, r15
+ add r14, rdx
+ vpsllq ymm11, ymm14, 3
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ror rax, 23
+ vpor ymm8, ymm8, ymm9
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+80]
+ xor rcx, r12
+ vpor ymm10, ymm10, ymm11
+ xor rax, r10
+ and rcx, r10
+ vpxor ymm8, ymm8, ymm10
+ ror rax, 4
+ xor rcx, r12
+ vpsrlq ymm11, ymm14, 6
+ xor rax, r10
+ add r13, rcx
+ vpxor ymm8, ymm8, ymm11
+ ror rax, 14
+ xor rdx, r15
+ vpaddq ymm2, ymm8, ymm2
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ vperm2I128 ymm14, ymm2, ymm2, 8
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ vpsrlq ymm8, ymm14, 19
+ xor rcx, r14
+ add r13, rbx
+ vpsllq ymm9, ymm14, 45
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ror rax, 23
+ vpsrlq ymm10, ymm14, 61
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+88]
+ xor rcx, r11
+ vpsllq ymm11, ymm14, 3
+ xor rax, r9
+ and rcx, r9
+ vpor ymm8, ymm8, ymm9
+ ror rax, 4
+ xor rcx, r11
+ vpor ymm10, ymm10, ymm11
+ xor rax, r9
+ add r12, rcx
+ vpxor ymm8, ymm8, ymm10
+ ror rax, 14
+ xor rbx, r14
+ vpsrlq ymm11, ymm14, 6
+ add r12, rax
+ mov rcx, r13
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ vpaddq ymm2, ymm8, ymm2
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ror rax, 23
+ vpblendd ymm12, ymm3, ymm0, 3
+ vpblendd ymm13, ymm1, ymm2, 3
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+96]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ vpermq ymm12, ymm12, 57
+ ror rax, 4
+ xor rcx, r10
+ vpermq ymm13, ymm13, 57
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ vpsrlq ymm8, ymm12, 1
+ add r11, rax
+ mov rcx, r12
+ vpsllq ymm9, ymm12, 63
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq ymm10, ymm12, 8
+ xor rcx, r12
+ xor rbx, r13
+ vpsllq ymm11, ymm12, 56
+ ror rcx, 6
+ add r15, r11
+ vpor ymm8, ymm8, ymm9
+ xor rcx, r12
+ add r11, rbx
+ vpor ymm10, ymm10, ymm11
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ror rax, 23
+ vpsrlq ymm11, ymm12, 7
+ mov rbx, r11
+ mov rcx, r8
+ vpxor ymm8, ymm8, ymm10
+ add r10, QWORD PTR [rsp+104]
+ xor rcx, r9
+ vpxor ymm8, ymm8, ymm11
+ xor rax, r15
+ and rcx, r15
+ vpaddq ymm3, ymm13, ymm3
+ ror rax, 4
+ xor rcx, r9
+ vpaddq ymm3, ymm8, ymm3
+ xor rax, r15
+ add r10, rcx
+ vperm2I128 ymm14, ymm2, ymm2, 129
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ vpsrlq ymm8, ymm14, 19
+ xor rcx, r11
+ xor rdx, r12
+ vpsllq ymm9, ymm14, 45
+ ror rcx, 6
+ add r14, r10
+ vpsrlq ymm10, ymm14, 61
+ xor rcx, r11
+ add r10, rdx
+ vpsllq ymm11, ymm14, 3
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ror rax, 23
+ vpor ymm8, ymm8, ymm9
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+112]
+ xor rcx, r8
+ vpor ymm10, ymm10, ymm11
+ xor rax, r14
+ and rcx, r14
+ vpxor ymm8, ymm8, ymm10
+ ror rax, 4
+ xor rcx, r8
+ vpsrlq ymm11, ymm14, 6
+ xor rax, r14
+ add r9, rcx
+ vpxor ymm8, ymm8, ymm11
+ ror rax, 14
+ xor rdx, r11
+ vpaddq ymm3, ymm8, ymm3
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ vperm2I128 ymm14, ymm3, ymm3, 8
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ vpsrlq ymm8, ymm14, 19
+ xor rcx, r10
+ add r9, rbx
+ vpsllq ymm9, ymm14, 45
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ror rax, 23
+ vpsrlq ymm10, ymm14, 61
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+120]
+ xor rcx, r15
+ vpsllq ymm11, ymm14, 3
+ xor rax, r13
+ and rcx, r13
+ vpor ymm8, ymm8, ymm9
+ ror rax, 4
+ xor rcx, r15
+ vpor ymm10, ymm10, ymm11
+ xor rax, r13
+ add r8, rcx
+ vpxor ymm8, ymm8, ymm10
+ ror rax, 14
+ xor rbx, r10
+ vpsrlq ymm11, ymm14, 6
+ add r8, rax
+ mov rcx, r9
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ vpaddq ymm3, ymm8, ymm3
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ vpaddq ymm8, ymm0, [rsi] ; newly scheduled message words + next 16 constants, stored for the following 16 rounds
+ vpaddq ymm9, ymm1, [rsi+32]
+ vmovdqu YMMWORD PTR [rsp], ymm8
+ vmovdqu YMMWORD PTR [rsp+32], ymm9
+ vpaddq ymm8, ymm2, [rsi+64]
+ vpaddq ymm9, ymm3, [rsi+96]
+ vmovdqu YMMWORD PTR [rsp+64], ymm8
+ vmovdqu YMMWORD PTR [rsp+96], ymm9
+ sub DWORD PTR [rsp+128], 1 ; one pass done
+ jne L_sha256_avx2_start ; loop until the counter set to 4 above reaches zero
+ ror rax, 23 ; last 16 rounds: scalar only, consuming the values already stored at [rsp..rsp+127]
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ror rax, 23
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rsp+8]
+ xor rcx, r13
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ror rax, 23
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+16]
+ xor rcx, r12
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ror rax, 23
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+24]
+ xor rcx, r11
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ror rax, 23
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+32]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ror rax, 23
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rsp+40]
+ xor rcx, r9
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ror rax, 23
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+48]
+ xor rcx, r8
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ror rax, 23
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+56]
+ xor rcx, r15
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ ror rax, 23
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rsp+64]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ror rax, 23
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rsp+72]
+ xor rcx, r13
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ror rax, 23
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rsp+80]
+ xor rcx, r12
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ror rax, 23
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rsp+88]
+ xor rcx, r11
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ror rax, 23
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rsp+96]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ror rax, 23
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rsp+104]
+ xor rcx, r9
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ror rax, 23
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rsp+112]
+ xor rcx, r8
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ror rax, 23
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rsp+120]
+ xor rcx, r15
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ add QWORD PTR [rdi], r8 ; fold the eight working variables back into the state at [rdi]
+ add QWORD PTR [rdi+8], r9
+ add QWORD PTR [rdi+16], r10
+ add QWORD PTR [rdi+24], r11
+ add QWORD PTR [rdi+32], r12
+ add QWORD PTR [rdi+40], r13
+ add QWORD PTR [rdi+48], r14
+ add QWORD PTR [rdi+56], r15
+ xor rax, rax ; return value 0
+ vzeroupper ; zero the upper halves of the ymm registers ahead of the xmm reloads below
+ vmovdqu xmm6, OWORD PTR [rsp+136] ; reload xmm6-xmm15 from the slots written in the prologue
+ vmovdqu xmm7, OWORD PTR [rsp+152]
+ vmovdqu xmm8, OWORD PTR [rsp+168]
+ vmovdqu xmm9, OWORD PTR [rsp+184]
+ vmovdqu xmm10, OWORD PTR [rsp+200]
+ vmovdqu xmm11, OWORD PTR [rsp+216]
+ vmovdqu xmm14, OWORD PTR [rsp+232]
+ vmovdqu xmm13, OWORD PTR [rsp+248]
+ vmovdqu xmm12, OWORD PTR [rsp+264]
+ vmovdqu xmm15, OWORD PTR [rsp+280]
+ add rsp, 296 ; release the scratch area
+ pop rsi ; restore general-purpose registers in reverse push order
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha512_AVX2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha512_AVX2_Len PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rsi
+ push rdi
+ push rbp
+ mov rdi, rcx
+ mov rbp, rdx
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm14
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm12
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ test bpl, 128
+ je L_sha512_len_avx2_block
+ mov rbx, QWORD PTR [rdi+224]
+ vmovdqu ymm0, YMMWORD PTR [rbx]
+ vmovdqu ymm1, YMMWORD PTR [rbx+32]
+ vmovdqu ymm2, YMMWORD PTR [rbx+64]
+ vmovdqu ymm3, YMMWORD PTR [rbx+96]
+ vmovups YMMWORD PTR [rdi+64], ymm0
+ vmovups YMMWORD PTR [rdi+96], ymm1
+ vmovups YMMWORD PTR [rdi+128], ymm2
+ vmovups YMMWORD PTR [rdi+160], ymm3
+ call Transform_Sha512_AVX2
+ add QWORD PTR [rdi+224], 128
+ sub ebp, 128
+ jz L_sha512_len_avx2_done
+L_sha512_len_avx2_block:
+ sub rsp, 1352
+ mov rcx, QWORD PTR [rdi+224]
+ vmovdqu ymm15, YMMWORD PTR L_avx2_sha512_flip_mask
+ mov r8, QWORD PTR [rdi]
+ mov r9, QWORD PTR [rdi+8]
+ mov r10, QWORD PTR [rdi+16]
+ mov r11, QWORD PTR [rdi+24]
+ mov r12, QWORD PTR [rdi+32]
+ mov r13, QWORD PTR [rdi+40]
+ mov r14, QWORD PTR [rdi+48]
+ mov r15, QWORD PTR [rdi+56]
+ mov QWORD PTR [rsp+1344], rbp
+ ; Start of loop processing two blocks
+L_sha512_len_avx2_begin:
+ mov rbp, rsp
+ mov rsi, QWORD PTR [ptr_L_avx2_sha512_k_2]
+ mov rbx, r9
+ mov rax, r12
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vmovdqu xmm1, OWORD PTR [rcx+16]
+ vinserti128 ymm0, ymm0, OWORD PTR [rcx+128], 1
+ vinserti128 ymm1, ymm1, OWORD PTR [rcx+144], 1
+ vpshufb ymm0, ymm0, ymm15
+ vpshufb ymm1, ymm1, ymm15
+ vmovdqu xmm2, OWORD PTR [rcx+32]
+ vmovdqu xmm3, OWORD PTR [rcx+48]
+ vinserti128 ymm2, ymm2, OWORD PTR [rcx+160], 1
+ vinserti128 ymm3, ymm3, OWORD PTR [rcx+176], 1
+ vpshufb ymm2, ymm2, ymm15
+ vpshufb ymm3, ymm3, ymm15
+ vmovdqu xmm4, OWORD PTR [rcx+64]
+ vmovdqu xmm5, OWORD PTR [rcx+80]
+ vinserti128 ymm4, ymm4, OWORD PTR [rcx+192], 1
+ vinserti128 ymm5, ymm5, OWORD PTR [rcx+208], 1
+ vpshufb ymm4, ymm4, ymm15
+ vpshufb ymm5, ymm5, ymm15
+ vmovdqu xmm6, OWORD PTR [rcx+96]
+ vmovdqu xmm7, OWORD PTR [rcx+112]
+ vinserti128 ymm6, ymm6, OWORD PTR [rcx+224], 1
+ vinserti128 ymm7, ymm7, OWORD PTR [rcx+240], 1
+ vpshufb ymm6, ymm6, ymm15
+ vpshufb ymm7, ymm7, ymm15
+ xor rbx, r10
+ ; Start of 16 rounds
+L_sha512_len_avx2_start:
+ vpaddq ymm8, ymm0, [rsi]
+ vpaddq ymm9, ymm1, [rsi+32]
+ vmovdqu YMMWORD PTR [rbp], ymm8
+ vmovdqu YMMWORD PTR [rbp+32], ymm9
+ vpaddq ymm8, ymm2, [rsi+64]
+ vpaddq ymm9, ymm3, [rsi+96]
+ vmovdqu YMMWORD PTR [rbp+64], ymm8
+ vmovdqu YMMWORD PTR [rbp+96], ymm9
+ vpaddq ymm8, ymm4, [rsi+128]
+ vpaddq ymm9, ymm5, [rsi+160]
+ vmovdqu YMMWORD PTR [rbp+128], ymm8
+ vmovdqu YMMWORD PTR [rbp+160], ymm9
+ vpaddq ymm8, ymm6, [rsi+192]
+ vpaddq ymm9, ymm7, [rsi+224]
+ vmovdqu YMMWORD PTR [rbp+192], ymm8
+ vmovdqu YMMWORD PTR [rbp+224], ymm9
+ ; msg_sched: 0-1
+ ror rax, 23
+ vpalignr ymm12, ymm1, ymm0, 8
+ vpalignr ymm13, ymm5, ymm4, 8
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rbp]
+ xor rcx, r14
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm0, ymm13, ymm0
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ror rax, 23
+ vpaddq ymm0, ymm8, ymm0
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rbp+8]
+ xor rcx, r13
+ vpsrlq ymm8, ymm7, 19
+ vpsllq ymm9, ymm7, 45
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ vpsrlq ymm10, ymm7, 61
+ vpsllq ymm11, ymm7, 3
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm7, 6
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ vpxor ymm8, ymm8, ymm11
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ vpaddq ymm0, ymm8, ymm0
+ ; msg_sched done: 0-1
+ ; msg_sched: 4-5
+ ror rax, 23
+ vpalignr ymm12, ymm2, ymm1, 8
+ vpalignr ymm13, ymm6, ymm5, 8
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rbp+32]
+ xor rcx, r12
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm1, ymm13, ymm1
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ror rax, 23
+ vpaddq ymm1, ymm8, ymm1
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rbp+40]
+ xor rcx, r11
+ vpsrlq ymm8, ymm0, 19
+ vpsllq ymm9, ymm0, 45
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ vpsrlq ymm10, ymm0, 61
+ vpsllq ymm11, ymm0, 3
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm0, 6
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ vpxor ymm8, ymm8, ymm11
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ vpaddq ymm1, ymm8, ymm1
+ ; msg_sched done: 4-5
+ ; msg_sched: 8-9
+ ror rax, 23
+ vpalignr ymm12, ymm3, ymm2, 8
+ vpalignr ymm13, ymm7, ymm6, 8
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rbp+64]
+ xor rcx, r10
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm2, ymm13, ymm2
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ror rax, 23
+ vpaddq ymm2, ymm8, ymm2
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rbp+72]
+ xor rcx, r9
+ vpsrlq ymm8, ymm1, 19
+ vpsllq ymm9, ymm1, 45
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ vpsrlq ymm10, ymm1, 61
+ vpsllq ymm11, ymm1, 3
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm1, 6
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ vpxor ymm8, ymm8, ymm11
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ vpaddq ymm2, ymm8, ymm2
+ ; msg_sched done: 8-9
+ ; msg_sched: 12-13
+ ror rax, 23
+ vpalignr ymm12, ymm4, ymm3, 8
+ vpalignr ymm13, ymm0, ymm7, 8
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rbp+96]
+ xor rcx, r8
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm3, ymm13, ymm3
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ror rax, 23
+ vpaddq ymm3, ymm8, ymm3
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rbp+104]
+ xor rcx, r15
+ vpsrlq ymm8, ymm2, 19
+ vpsllq ymm9, ymm2, 45
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ vpsrlq ymm10, ymm2, 61
+ vpsllq ymm11, ymm2, 3
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm2, 6
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ vpxor ymm8, ymm8, ymm11
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ vpaddq ymm3, ymm8, ymm3
+ ; msg_sched done: 12-13
+ ; msg_sched: 16-17
+ ror rax, 23
+ vpalignr ymm12, ymm5, ymm4, 8
+ vpalignr ymm13, ymm1, ymm0, 8
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rbp+128]
+ xor rcx, r14
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm4, ymm13, ymm4
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ror rax, 23
+ vpaddq ymm4, ymm8, ymm4
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rbp+136]
+ xor rcx, r13
+ vpsrlq ymm8, ymm3, 19
+ vpsllq ymm9, ymm3, 45
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ vpsrlq ymm10, ymm3, 61
+ vpsllq ymm11, ymm3, 3
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm3, 6
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ vpxor ymm8, ymm8, ymm11
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ vpaddq ymm4, ymm8, ymm4
+ ; msg_sched done: 16-17
+ ; msg_sched: 20-21
+ ror rax, 23
+ vpalignr ymm12, ymm6, ymm5, 8
+ vpalignr ymm13, ymm2, ymm1, 8
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rbp+160]
+ xor rcx, r12
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm5, ymm13, ymm5
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ror rax, 23
+ vpaddq ymm5, ymm8, ymm5
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rbp+168]
+ xor rcx, r11
+ vpsrlq ymm8, ymm4, 19
+ vpsllq ymm9, ymm4, 45
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ vpsrlq ymm10, ymm4, 61
+ vpsllq ymm11, ymm4, 3
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm4, 6
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ vpxor ymm8, ymm8, ymm11
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ vpaddq ymm5, ymm8, ymm5
+ ; msg_sched done: 20-21
+ ; msg_sched: 24-25
+ ror rax, 23
+ vpalignr ymm12, ymm7, ymm6, 8
+ vpalignr ymm13, ymm3, ymm2, 8
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rbp+192]
+ xor rcx, r10
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm6, ymm13, ymm6
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ror rax, 23
+ vpaddq ymm6, ymm8, ymm6
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rbp+200]
+ xor rcx, r9
+ vpsrlq ymm8, ymm5, 19
+ vpsllq ymm9, ymm5, 45
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ vpsrlq ymm10, ymm5, 61
+ vpsllq ymm11, ymm5, 3
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm5, 6
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ vpxor ymm8, ymm8, ymm11
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ vpaddq ymm6, ymm8, ymm6
+ ; msg_sched done: 24-25
+ ; msg_sched: 28-29
+ ror rax, 23
+ vpalignr ymm12, ymm0, ymm7, 8
+ vpalignr ymm13, ymm4, ymm3, 8
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rbp+224]
+ xor rcx, r8
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm7, ymm13, ymm7
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ror rax, 23
+ vpaddq ymm7, ymm8, ymm7
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rbp+232]
+ xor rcx, r15
+ vpsrlq ymm8, ymm6, 19
+ vpsllq ymm9, ymm6, 45
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ vpsrlq ymm10, ymm6, 61
+ vpsllq ymm11, ymm6, 3
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm6, 6
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ vpxor ymm8, ymm8, ymm11
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ vpaddq ymm7, ymm8, ymm7
+ ; msg_sched done: 28-29
+ add rsi, 256
+ add rbp, 256
+ cmp rsi, QWORD PTR [L_avx2_sha512_k_2_end]
+ jne L_sha512_len_avx2_start
+ vpaddq ymm8, ymm0, [rsi]
+ vpaddq ymm9, ymm1, [rsi+32]
+ vmovdqu YMMWORD PTR [rbp], ymm8
+ vmovdqu YMMWORD PTR [rbp+32], ymm9
+ vpaddq ymm8, ymm2, [rsi+64]
+ vpaddq ymm9, ymm3, [rsi+96]
+ vmovdqu YMMWORD PTR [rbp+64], ymm8
+ vmovdqu YMMWORD PTR [rbp+96], ymm9
+ vpaddq ymm8, ymm4, [rsi+128]
+ vpaddq ymm9, ymm5, [rsi+160]
+ vmovdqu YMMWORD PTR [rbp+128], ymm8
+ vmovdqu YMMWORD PTR [rbp+160], ymm9
+ vpaddq ymm8, ymm6, [rsi+192]
+ vpaddq ymm9, ymm7, [rsi+224]
+ vmovdqu YMMWORD PTR [rbp+192], ymm8
+ vmovdqu YMMWORD PTR [rbp+224], ymm9
+ ror rax, 23
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rbp]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ror rax, 23
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rbp+8]
+ xor rcx, r13
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ror rax, 23
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rbp+32]
+ xor rcx, r12
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ror rax, 23
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rbp+40]
+ xor rcx, r11
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ror rax, 23
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rbp+64]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ror rax, 23
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rbp+72]
+ xor rcx, r9
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ror rax, 23
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rbp+96]
+ xor rcx, r8
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ror rax, 23
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rbp+104]
+ xor rcx, r15
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ ror rax, 23
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rbp+128]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ror rax, 23
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rbp+136]
+ xor rcx, r13
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ror rax, 23
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rbp+160]
+ xor rcx, r12
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ror rax, 23
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rbp+168]
+ xor rcx, r11
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ror rax, 23
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rbp+192]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ror rax, 23
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rbp+200]
+ xor rcx, r9
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ror rax, 23
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rbp+224]
+ xor rcx, r8
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ror rax, 23
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rbp+232]
+ xor rcx, r15
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ sub rbp, 1024
+ add r8, QWORD PTR [rdi]
+ add r9, QWORD PTR [rdi+8]
+ add r10, QWORD PTR [rdi+16]
+ add r11, QWORD PTR [rdi+24]
+ add r12, QWORD PTR [rdi+32]
+ add r13, QWORD PTR [rdi+40]
+ add r14, QWORD PTR [rdi+48]
+ add r15, QWORD PTR [rdi+56]
+ mov QWORD PTR [rdi], r8
+ mov QWORD PTR [rdi+8], r9
+ mov QWORD PTR [rdi+16], r10
+ mov QWORD PTR [rdi+24], r11
+ mov QWORD PTR [rdi+32], r12
+ mov QWORD PTR [rdi+40], r13
+ mov QWORD PTR [rdi+48], r14
+ mov QWORD PTR [rdi+56], r15
+ mov rbx, r9
+ mov rax, r12
+ xor rbx, r10
+ mov rsi, 5
+L_sha512_len_avx2_tail:
+ ror rax, 23
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rbp+16]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ror rax, 23
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rbp+24]
+ xor rcx, r13
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ror rax, 23
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rbp+48]
+ xor rcx, r12
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ror rax, 23
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rbp+56]
+ xor rcx, r11
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ror rax, 23
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rbp+80]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ror rax, 23
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rbp+88]
+ xor rcx, r9
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ror rax, 23
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rbp+112]
+ xor rcx, r8
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ror rax, 23
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rbp+120]
+ xor rcx, r15
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ ror rax, 23
+ mov rdx, r8
+ mov rcx, r13
+ add r15, QWORD PTR [rbp+144]
+ xor rcx, r14
+ xor rax, r12
+ and rcx, r12
+ ror rax, 4
+ xor rcx, r14
+ xor rax, r12
+ add r15, rcx
+ ror rax, 14
+ xor rdx, r9
+ add r15, rax
+ mov rcx, r8
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r8
+ xor rbx, r9
+ ror rcx, 6
+ add r11, r15
+ xor rcx, r8
+ add r15, rbx
+ ror rcx, 28
+ mov rax, r11
+ add r15, rcx
+ ror rax, 23
+ mov rbx, r15
+ mov rcx, r12
+ add r14, QWORD PTR [rbp+152]
+ xor rcx, r13
+ xor rax, r11
+ and rcx, r11
+ ror rax, 4
+ xor rcx, r13
+ xor rax, r11
+ add r14, rcx
+ ror rax, 14
+ xor rbx, r8
+ add r14, rax
+ mov rcx, r15
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r15
+ xor rdx, r8
+ ror rcx, 6
+ add r10, r14
+ xor rcx, r15
+ add r14, rdx
+ ror rcx, 28
+ mov rax, r10
+ add r14, rcx
+ ror rax, 23
+ mov rdx, r14
+ mov rcx, r11
+ add r13, QWORD PTR [rbp+176]
+ xor rcx, r12
+ xor rax, r10
+ and rcx, r10
+ ror rax, 4
+ xor rcx, r12
+ xor rax, r10
+ add r13, rcx
+ ror rax, 14
+ xor rdx, r15
+ add r13, rax
+ mov rcx, r14
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r14
+ xor rbx, r15
+ ror rcx, 6
+ add r9, r13
+ xor rcx, r14
+ add r13, rbx
+ ror rcx, 28
+ mov rax, r9
+ add r13, rcx
+ ror rax, 23
+ mov rbx, r13
+ mov rcx, r10
+ add r12, QWORD PTR [rbp+184]
+ xor rcx, r11
+ xor rax, r9
+ and rcx, r9
+ ror rax, 4
+ xor rcx, r11
+ xor rax, r9
+ add r12, rcx
+ ror rax, 14
+ xor rbx, r14
+ add r12, rax
+ mov rcx, r13
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r13
+ xor rdx, r14
+ ror rcx, 6
+ add r8, r12
+ xor rcx, r13
+ add r12, rdx
+ ror rcx, 28
+ mov rax, r8
+ add r12, rcx
+ ror rax, 23
+ mov rdx, r12
+ mov rcx, r9
+ add r11, QWORD PTR [rbp+208]
+ xor rcx, r10
+ xor rax, r8
+ and rcx, r8
+ ror rax, 4
+ xor rcx, r10
+ xor rax, r8
+ add r11, rcx
+ ror rax, 14
+ xor rdx, r13
+ add r11, rax
+ mov rcx, r12
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r12
+ xor rbx, r13
+ ror rcx, 6
+ add r15, r11
+ xor rcx, r12
+ add r11, rbx
+ ror rcx, 28
+ mov rax, r15
+ add r11, rcx
+ ror rax, 23
+ mov rbx, r11
+ mov rcx, r8
+ add r10, QWORD PTR [rbp+216]
+ xor rcx, r9
+ xor rax, r15
+ and rcx, r15
+ ror rax, 4
+ xor rcx, r9
+ xor rax, r15
+ add r10, rcx
+ ror rax, 14
+ xor rbx, r12
+ add r10, rax
+ mov rcx, r11
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r11
+ xor rdx, r12
+ ror rcx, 6
+ add r14, r10
+ xor rcx, r11
+ add r10, rdx
+ ror rcx, 28
+ mov rax, r14
+ add r10, rcx
+ ror rax, 23
+ mov rdx, r10
+ mov rcx, r15
+ add r9, QWORD PTR [rbp+240]
+ xor rcx, r8
+ xor rax, r14
+ and rcx, r14
+ ror rax, 4
+ xor rcx, r8
+ xor rax, r14
+ add r9, rcx
+ ror rax, 14
+ xor rdx, r11
+ add r9, rax
+ mov rcx, r10
+ and rbx, rdx
+ ror rcx, 5
+ xor rcx, r10
+ xor rbx, r11
+ ror rcx, 6
+ add r13, r9
+ xor rcx, r10
+ add r9, rbx
+ ror rcx, 28
+ mov rax, r13
+ add r9, rcx
+ ror rax, 23
+ mov rbx, r9
+ mov rcx, r14
+ add r8, QWORD PTR [rbp+248]
+ xor rcx, r15
+ xor rax, r13
+ and rcx, r13
+ ror rax, 4
+ xor rcx, r15
+ xor rax, r13
+ add r8, rcx
+ ror rax, 14
+ xor rbx, r10
+ add r8, rax
+ mov rcx, r9
+ and rdx, rbx
+ ror rcx, 5
+ xor rcx, r9
+ xor rdx, r10
+ ror rcx, 6
+ add r12, r8
+ xor rcx, r9
+ add r8, rdx
+ ror rcx, 28
+ mov rax, r12
+ add r8, rcx
+ add rbp, 256
+ sub rsi, 1
+ jnz L_sha512_len_avx2_tail
+ add r8, QWORD PTR [rdi]
+ add r9, QWORD PTR [rdi+8]
+ add r10, QWORD PTR [rdi+16]
+ add r11, QWORD PTR [rdi+24]
+ add r12, QWORD PTR [rdi+32]
+ add r13, QWORD PTR [rdi+40]
+ add r14, QWORD PTR [rdi+48]
+ add r15, QWORD PTR [rdi+56]
+ mov rcx, QWORD PTR [rdi+224]
+ add rcx, 256
+ sub DWORD PTR [rsp+1344], 256
+ mov QWORD PTR [rdi+224], rcx
+ mov QWORD PTR [rdi], r8
+ mov QWORD PTR [rdi+8], r9
+ mov QWORD PTR [rdi+16], r10
+ mov QWORD PTR [rdi+24], r11
+ mov QWORD PTR [rdi+32], r12
+ mov QWORD PTR [rdi+40], r13
+ mov QWORD PTR [rdi+48], r14
+ mov QWORD PTR [rdi+56], r15
+ jnz L_sha512_len_avx2_begin
+ add rsp, 1352
+L_sha512_len_avx2_done:
+ xor rax, rax
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm14, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm12, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ pop rbp
+ pop rdi
+ pop rsi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha512_AVX2_Len ENDP
+_TEXT ENDS
+_DATA SEGMENT ; data for the RORX variant of the AVX2 SHA-512 code
+ALIGN 16 ; table below starts on a 16-byte boundary
+L_avx2_rorx_sha512_k QWORD 428a2f98d728ae22h, 7137449123ef65cdh ; SHA-512 round constants K[0..79] (FIPS 180-4), two per line in round order; this line is K[0], K[1]
+ QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch
+ QWORD 3956c25bf348b538h, 59f111f1b605d019h
+ QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h
+ QWORD 0d807aa98a3030242h, 12835b0145706fbeh
+ QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h
+ QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h
+ QWORD 9bdc06a725c71235h, 0c19bf174cf692694h
+ QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h ; K[16], K[17]
+ QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h
+ QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h
+ QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h
+ QWORD 983e5152ee66dfabh, 0a831c66d2db43210h
+ QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h
+ QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h
+ QWORD 06ca6351e003826fh, 142929670a0e6e70h
+ QWORD 27b70a8546d22ffch, 2e1b21385c26c926h ; K[32], K[33]
+ QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh
+ QWORD 650a73548baf63deh, 766a0abb3c77b2a8h
+ QWORD 81c2c92e47edaee6h, 92722c851482353bh
+ QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h
+ QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h
+ QWORD 0d192e819d6ef5218h, 0d69906245565a910h
+ QWORD 0f40e35855771202ah, 106aa07032bbd1b8h
+ QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h ; K[48], K[49]
+ QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h
+ QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh
+ QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h
+ QWORD 748f82ee5defb2fch, 78a5636f43172f60h
+ QWORD 84c87814a1f0ab72h, 8cc702081a6439ech
+ QWORD 90befffa23631e28h, 0a4506cebde82bde9h
+ QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh
+ QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h ; K[64], K[65]
+ QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h
+ QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h
+ QWORD 113f9804bef90daeh, 1b710b35131c471bh
+ QWORD 28db77f523047d84h, 32caab7b40c72493h
+ QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch
+ QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah
+ QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h ; K[78], K[79] (end of the 640-byte table)
+ptr_L_avx2_rorx_sha512_k QWORD L_avx2_rorx_sha512_k ; 64-bit slot holding the address of the table above
+_DATA ENDS
+_DATA SEGMENT ; data for the RORX variant of the AVX2 SHA-512 code
+ALIGN 16 ; table below starts on a 16-byte boundary
+L_avx2_rorx_sha512_k_2 QWORD 428a2f98d728ae22h, 7137449123ef65cdh ; SHA-512 round constants with every pair emitted twice: each 32-byte row is K[2i], K[2i+1], K[2i], K[2i+1]
+ QWORD 428a2f98d728ae22h, 7137449123ef65cdh ; NOTE(review): presumably lets one 256-bit add apply the same constants to two blocks held in the two 128-bit lanes, as the non-RORX two-block loop does with its own _k_2 table - confirm in the RORX Len routine
+ QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch
+ QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch
+ QWORD 3956c25bf348b538h, 59f111f1b605d019h
+ QWORD 3956c25bf348b538h, 59f111f1b605d019h
+ QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h
+ QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h
+ QWORD 0d807aa98a3030242h, 12835b0145706fbeh
+ QWORD 0d807aa98a3030242h, 12835b0145706fbeh
+ QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h
+ QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h
+ QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h
+ QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h
+ QWORD 9bdc06a725c71235h, 0c19bf174cf692694h
+ QWORD 9bdc06a725c71235h, 0c19bf174cf692694h
+ QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h ; byte offset 256: K[16], K[17]
+ QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h
+ QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h
+ QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h
+ QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h
+ QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h
+ QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h
+ QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h
+ QWORD 983e5152ee66dfabh, 0a831c66d2db43210h
+ QWORD 983e5152ee66dfabh, 0a831c66d2db43210h
+ QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h
+ QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h
+ QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h
+ QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h
+ QWORD 06ca6351e003826fh, 142929670a0e6e70h
+ QWORD 06ca6351e003826fh, 142929670a0e6e70h
+ QWORD 27b70a8546d22ffch, 2e1b21385c26c926h ; byte offset 512: K[32], K[33]
+ QWORD 27b70a8546d22ffch, 2e1b21385c26c926h
+ QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh
+ QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh
+ QWORD 650a73548baf63deh, 766a0abb3c77b2a8h
+ QWORD 650a73548baf63deh, 766a0abb3c77b2a8h
+ QWORD 81c2c92e47edaee6h, 92722c851482353bh
+ QWORD 81c2c92e47edaee6h, 92722c851482353bh
+ QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h
+ QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h
+ QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h
+ QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h
+ QWORD 0d192e819d6ef5218h, 0d69906245565a910h
+ QWORD 0d192e819d6ef5218h, 0d69906245565a910h
+ QWORD 0f40e35855771202ah, 106aa07032bbd1b8h
+ QWORD 0f40e35855771202ah, 106aa07032bbd1b8h
+ QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h ; byte offset 768: K[48], K[49]
+ QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h
+ QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h
+ QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h
+ QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh
+ QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh
+ QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h
+ QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h
+ QWORD 748f82ee5defb2fch, 78a5636f43172f60h
+ QWORD 748f82ee5defb2fch, 78a5636f43172f60h
+ QWORD 84c87814a1f0ab72h, 8cc702081a6439ech
+ QWORD 84c87814a1f0ab72h, 8cc702081a6439ech
+ QWORD 90befffa23631e28h, 0a4506cebde82bde9h
+ QWORD 90befffa23631e28h, 0a4506cebde82bde9h
+ QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh
+ QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh
+ QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h ; byte offset 1024: K[64], K[65]; L_avx2_rorx_sha512_k_2_end holds the address of this row
+ QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h
+ QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h
+ QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h
+ QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h
+ QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h
+ QWORD 113f9804bef90daeh, 1b710b35131c471bh
+ QWORD 113f9804bef90daeh, 1b710b35131c471bh
+ QWORD 28db77f523047d84h, 32caab7b40c72493h
+ QWORD 28db77f523047d84h, 32caab7b40c72493h
+ QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch
+ QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch
+ QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah
+ QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah
+ QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h ; K[78], K[79]
+ QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h ; last row; table is 1280 bytes in total
+ptr_L_avx2_rorx_sha512_k_2 QWORD L_avx2_rorx_sha512_k_2 ; 64-bit slot holding the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ ; L_avx2_rorx_sha512_k_2_end is a QWORD holding the address 1024 bytes past
+ ; the start of L_avx2_rorx_sha512_k_2. ptr_L_avx2_rorx_sha512_k_2_end is a
+ ; QWORD holding the address of that variable.
+ ; NOTE(review): presumably used as a loop-termination marker while walking
+ ; the k_2 table; the consumer is outside this view - confirm.
+ALIGN 8
+L_avx2_rorx_sha512_k_2_end QWORD 1024+L_avx2_rorx_sha512_k_2
+ptr_L_avx2_rorx_sha512_k_2_end QWORD L_avx2_rorx_sha512_k_2_end
+_DATA ENDS
+_DATA SEGMENT
+ ; 32-byte shuffle-control constant for vpshufb (the same 16 bytes in both
+ ; 128-bit lanes). Used as the index operand, it reverses the byte order
+ ; inside every 64-bit word. ptr_L_avx2_rorx_sha512_flip_mask holds the
+ ; address of the constant.
+ALIGN 16
+L_avx2_rorx_sha512_flip_mask QWORD 0001020304050607h, 08090a0b0c0d0e0fh
+ QWORD 0001020304050607h, 08090a0b0c0d0e0fh
+ptr_L_avx2_rorx_sha512_flip_mask QWORD L_avx2_rorx_sha512_flip_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha512_AVX2_RORX PROC
+ ; Runs the 80-round SHA-512 compression over one 128-byte block, using
+ ; rorx for the scalar rounds and AVX2 for the message schedule.
+ ; In: rcx = base pointer (copied to rdi). The code reads eight 64-bit
+ ; state words at [rdi]..[rdi+56] and the 128-byte block at [rdi+64].
+ ; Out: the eight words at [rdi]..[rdi+56] are updated in place; rax = 0.
+ ; rax, rcx, rdx, r8-r11 and ymm0-ymm3 are left modified; rbx, rsi, rdi,
+ ; r12-r15 and xmm6-xmm15 are saved on entry and restored before ret.
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ mov rdi, rcx
+ ; Frame (296 bytes): [rsp..rsp+127] W[t]+K[t] for the 16 rounds in flight,
+ ; [rsp+128] DWORD loop counter, [rsp+136..rsp+295] saved xmm6-xmm15.
+ sub rsp, 296
+ vmovdqu OWORD PTR [rsp+136], xmm6
+ vmovdqu OWORD PTR [rsp+152], xmm7
+ vmovdqu OWORD PTR [rsp+168], xmm8
+ vmovdqu OWORD PTR [rsp+184], xmm9
+ vmovdqu OWORD PTR [rsp+200], xmm10
+ vmovdqu OWORD PTR [rsp+216], xmm11
+ vmovdqu OWORD PTR [rsp+232], xmm14
+ vmovdqu OWORD PTR [rsp+248], xmm13
+ vmovdqu OWORD PTR [rsp+264], xmm12
+ vmovdqu OWORD PTR [rsp+280], xmm15
+ ; rcx -> the 128-byte message block; ymm15 = per-qword byte-reverse mask.
+ lea rcx, QWORD PTR [rdi+64]
+ vmovdqu ymm15, YMMWORD PTR L_avx2_rorx_sha512_flip_mask
+ ; Load the eight working variables a..h into r8..r15.
+ mov r8, QWORD PTR [rdi]
+ mov r9, QWORD PTR [rdi+8]
+ mov r10, QWORD PTR [rdi+16]
+ mov r11, QWORD PTR [rdi+24]
+ mov r12, QWORD PTR [rdi+32]
+ mov r13, QWORD PTR [rdi+40]
+ mov r14, QWORD PTR [rdi+48]
+ mov r15, QWORD PTR [rdi+56]
+ ; Load W[0..15] into ymm0-ymm3 and byte-reverse each 64-bit word.
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vpshufb ymm0, ymm0, ymm15
+ vpshufb ymm1, ymm1, ymm15
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpshufb ymm2, ymm2, ymm15
+ vpshufb ymm3, ymm3, ymm15
+ ; Four passes of the scheduled 16-round loop below cover rounds 0-63.
+ mov DWORD PTR [rsp+128], 4
+ ; rsi = value stored in ptr_L_avx2_rorx_sha512_k (presumably the address
+ ; of the round-constant table, which is defined outside this view).
+ mov rsi, QWORD PTR [ptr_L_avx2_rorx_sha512_k]
+ ; rbx = b ^ c seeds the first Maj computation; rdx = 0 because no Maj term
+ ; is pending yet when the first round adds rdx into r8.
+ mov rbx, r9
+ xor rdx, rdx
+ xor rbx, r10
+ ; set_w_k: 0
+ vpaddq ymm8, ymm0, [rsi]
+ vpaddq ymm9, ymm1, [rsi+32]
+ vmovdqu YMMWORD PTR [rsp], ymm8
+ vmovdqu YMMWORD PTR [rsp+32], ymm9
+ vpaddq ymm8, ymm2, [rsi+64]
+ vpaddq ymm9, ymm3, [rsi+96]
+ vmovdqu YMMWORD PTR [rsp+64], ymm8
+ vmovdqu YMMWORD PTR [rsp+96], ymm9
+ ; Start of 16 rounds
+L_sha256_len_avx2_rorx_start:
+ ; NOTE(review): the label says "sha256_len" although this is the
+ ; single-block SHA-512 routine; the name is left unchanged here.
+ ; Step rsi to the constants for the following 16 rounds; they are added to
+ ; the schedule words produced during this pass.
+ add rsi, 128
+ ; Scalar round pattern (repeated with the register roles rotated):
+ ; rorx 14/18/41 of e -> Sigma1(e); ((f ^ g) & e) ^ g -> Ch(e,f,g);
+ ; rorx 28/34/39 of a -> Sigma0(a); ((a ^ b) & (b ^ c)) ^ b -> Maj(a,b,c).
+ ; Each round leaves its Maj term in rbx or rdx (alternating) and it is
+ ; added to the new 'a' register at the start of the next round.
+ ; Vector schedule pattern: shifts 1/63 and 8/56 plus vpsrlq 7 -> sigma0;
+ ; shifts 19/45 and 61/3 plus vpsrlq 6 -> sigma1.
+ ; Rounds 0-3 of this pass; ymm0 is replaced by the next four schedule
+ ; words and their W+K sum is stored back to [rsp].
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ vpblendd ymm12, ymm0, ymm1, 3
+ vpblendd ymm13, ymm2, ymm3, 3
+ add r15, QWORD PTR [rsp]
+ mov rdx, r13
+ xor rcx, rax
+ vpermq ymm12, ymm12, 57
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ vpermq ymm13, ymm13, 57
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ vperm2I128 ymm14, ymm3, ymm3, 129
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ vpxor ymm8, ymm8, ymm10
+ add r14, QWORD PTR [rsp+8]
+ mov rbx, r12
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm11
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ vpaddq ymm0, ymm13, ymm0
+ vpaddq ymm0, ymm8, ymm0
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ vpsrlq ymm8, ymm14, 19
+ vpsllq ymm9, ymm14, 45
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ vpsrlq ymm10, ymm14, 61
+ vpsllq ymm11, ymm14, 3
+ vpor ymm8, ymm8, ymm9
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ vpor ymm10, ymm10, ymm11
+ mov rbx, r8
+ lea r10, QWORD PTR [r10+r14]
+ xor rbx, r15
+ vpxor ymm8, ymm8, ymm10
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ vpsrlq ymm11, ymm14, 6
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ vpxor ymm8, ymm8, ymm11
+ add r13, QWORD PTR [rsp+16]
+ mov rdx, r11
+ xor rcx, rax
+ vpaddq ymm0, ymm8, ymm0
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ vperm2I128 ymm14, ymm0, ymm0, 8
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ vpsrlq ymm8, ymm14, 19
+ vpsllq ymm9, ymm14, 45
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ vpsrlq ymm10, ymm14, 61
+ vpsllq ymm11, ymm14, 3
+ vpor ymm8, ymm8, ymm9
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ vpor ymm10, ymm10, ymm11
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ vpxor ymm8, ymm8, ymm10
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ vpsrlq ymm11, ymm14, 6
+ add r12, QWORD PTR [rsp+24]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ vpxor ymm8, ymm8, ymm11
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ vpaddq ymm0, ymm8, ymm0
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ vpaddq ymm8, ymm0, [rsi]
+ mov rbx, r14
+ lea r8, QWORD PTR [r8+r12]
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ vmovdqu YMMWORD PTR [rsp], ymm8
+ ; Rounds 4-7 of this pass; ymm1 is replaced by the next four schedule
+ ; words and their W+K sum is stored back to [rsp+32].
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ vpblendd ymm12, ymm1, ymm2, 3
+ vpblendd ymm13, ymm3, ymm0, 3
+ add r11, QWORD PTR [rsp+32]
+ mov rdx, r9
+ xor rcx, rax
+ vpermq ymm12, ymm12, 57
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ vpermq ymm13, ymm13, 57
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ vperm2I128 ymm14, ymm0, ymm0, 129
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ vpxor ymm8, ymm8, ymm10
+ add r10, QWORD PTR [rsp+40]
+ mov rbx, r8
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm11
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ vpaddq ymm1, ymm13, ymm1
+ vpaddq ymm1, ymm8, ymm1
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ vpsrlq ymm8, ymm14, 19
+ vpsllq ymm9, ymm14, 45
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ vpsrlq ymm10, ymm14, 61
+ vpsllq ymm11, ymm14, 3
+ vpor ymm8, ymm8, ymm9
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ vpor ymm10, ymm10, ymm11
+ mov rbx, r12
+ lea r14, QWORD PTR [r14+r10]
+ xor rbx, r11
+ vpxor ymm8, ymm8, ymm10
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ vpsrlq ymm11, ymm14, 6
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ vpxor ymm8, ymm8, ymm11
+ add r9, QWORD PTR [rsp+48]
+ mov rdx, r15
+ xor rcx, rax
+ vpaddq ymm1, ymm8, ymm1
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ vperm2I128 ymm14, ymm1, ymm1, 8
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ vpsrlq ymm8, ymm14, 19
+ vpsllq ymm9, ymm14, 45
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ vpsrlq ymm10, ymm14, 61
+ vpsllq ymm11, ymm14, 3
+ vpor ymm8, ymm8, ymm9
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ vpor ymm10, ymm10, ymm11
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ vpxor ymm8, ymm8, ymm10
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ vpsrlq ymm11, ymm14, 6
+ add r8, QWORD PTR [rsp+56]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ vpxor ymm8, ymm8, ymm11
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ vpaddq ymm1, ymm8, ymm1
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ vpaddq ymm8, ymm1, [rsi+32]
+ mov rbx, r10
+ lea r12, QWORD PTR [r12+r8]
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ vmovdqu YMMWORD PTR [rsp+32], ymm8
+ ; Rounds 8-11 of this pass; ymm2 is replaced by the next four schedule
+ ; words and their W+K sum is stored back to [rsp+64].
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ vpblendd ymm12, ymm2, ymm3, 3
+ vpblendd ymm13, ymm0, ymm1, 3
+ add r15, QWORD PTR [rsp+64]
+ mov rdx, r13
+ xor rcx, rax
+ vpermq ymm12, ymm12, 57
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ vpermq ymm13, ymm13, 57
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ vperm2I128 ymm14, ymm1, ymm1, 129
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ vpxor ymm8, ymm8, ymm10
+ add r14, QWORD PTR [rsp+72]
+ mov rbx, r12
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm11
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ vpaddq ymm2, ymm13, ymm2
+ vpaddq ymm2, ymm8, ymm2
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ vpsrlq ymm8, ymm14, 19
+ vpsllq ymm9, ymm14, 45
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ vpsrlq ymm10, ymm14, 61
+ vpsllq ymm11, ymm14, 3
+ vpor ymm8, ymm8, ymm9
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ vpor ymm10, ymm10, ymm11
+ mov rbx, r8
+ lea r10, QWORD PTR [r10+r14]
+ xor rbx, r15
+ vpxor ymm8, ymm8, ymm10
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ vpsrlq ymm11, ymm14, 6
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ vpxor ymm8, ymm8, ymm11
+ add r13, QWORD PTR [rsp+80]
+ mov rdx, r11
+ xor rcx, rax
+ vpaddq ymm2, ymm8, ymm2
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ vperm2I128 ymm14, ymm2, ymm2, 8
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ vpsrlq ymm8, ymm14, 19
+ vpsllq ymm9, ymm14, 45
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ vpsrlq ymm10, ymm14, 61
+ vpsllq ymm11, ymm14, 3
+ vpor ymm8, ymm8, ymm9
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ vpor ymm10, ymm10, ymm11
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ vpxor ymm8, ymm8, ymm10
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ vpsrlq ymm11, ymm14, 6
+ add r12, QWORD PTR [rsp+88]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ vpxor ymm8, ymm8, ymm11
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ vpaddq ymm2, ymm8, ymm2
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ vpaddq ymm8, ymm2, [rsi+64]
+ mov rbx, r14
+ lea r8, QWORD PTR [r8+r12]
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ vmovdqu YMMWORD PTR [rsp+64], ymm8
+ ; Rounds 12-15 of this pass; ymm3 is replaced by the next four schedule
+ ; words and their W+K sum is stored back to [rsp+96].
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ vpblendd ymm12, ymm3, ymm0, 3
+ vpblendd ymm13, ymm1, ymm2, 3
+ add r11, QWORD PTR [rsp+96]
+ mov rdx, r9
+ xor rcx, rax
+ vpermq ymm12, ymm12, 57
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ vpermq ymm13, ymm13, 57
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ vperm2I128 ymm14, ymm2, ymm2, 129
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ vpxor ymm8, ymm8, ymm10
+ add r10, QWORD PTR [rsp+104]
+ mov rbx, r8
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm11
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ vpaddq ymm3, ymm13, ymm3
+ vpaddq ymm3, ymm8, ymm3
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ vpsrlq ymm8, ymm14, 19
+ vpsllq ymm9, ymm14, 45
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ vpsrlq ymm10, ymm14, 61
+ vpsllq ymm11, ymm14, 3
+ vpor ymm8, ymm8, ymm9
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ vpor ymm10, ymm10, ymm11
+ mov rbx, r12
+ lea r14, QWORD PTR [r14+r10]
+ xor rbx, r11
+ vpxor ymm8, ymm8, ymm10
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ vpsrlq ymm11, ymm14, 6
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ vpxor ymm8, ymm8, ymm11
+ add r9, QWORD PTR [rsp+112]
+ mov rdx, r15
+ xor rcx, rax
+ vpaddq ymm3, ymm8, ymm3
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ vperm2I128 ymm14, ymm3, ymm3, 8
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ vpsrlq ymm8, ymm14, 19
+ vpsllq ymm9, ymm14, 45
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ vpsrlq ymm10, ymm14, 61
+ vpsllq ymm11, ymm14, 3
+ vpor ymm8, ymm8, ymm9
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ vpor ymm10, ymm10, ymm11
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ vpxor ymm8, ymm8, ymm10
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ vpsrlq ymm11, ymm14, 6
+ add r8, QWORD PTR [rsp+120]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ vpxor ymm8, ymm8, ymm11
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ vpaddq ymm3, ymm8, ymm3
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ vpaddq ymm8, ymm3, [rsi+96]
+ mov rbx, r10
+ lea r12, QWORD PTR [r12+r8]
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ vmovdqu YMMWORD PTR [rsp+96], ymm8
+ ; Count down the pass counter at [rsp+128]; repeat until it reaches zero.
+ sub DWORD PTR [rsp+128], 1
+ jne L_sha256_len_avx2_rorx_start
+ ; Final 16 rounds: consume the W+K values left in [rsp..rsp+127]. Only
+ ; scalar work remains because no further schedule words are needed.
+ ; rnd_all_4: 0-3
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ add r15, QWORD PTR [rsp]
+ mov rdx, r13
+ xor rcx, rax
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ add r14, QWORD PTR [rsp+8]
+ mov rbx, r12
+ xor rcx, rax
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ lea r10, QWORD PTR [r10+r14]
+ xor rbx, r15
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ add r13, QWORD PTR [rsp+16]
+ mov rdx, r11
+ xor rcx, rax
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ add r12, QWORD PTR [rsp+24]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ lea r8, QWORD PTR [r8+r12]
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ ; rnd_all_4: 4-7
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ add r11, QWORD PTR [rsp+32]
+ mov rdx, r9
+ xor rcx, rax
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ add r10, QWORD PTR [rsp+40]
+ mov rbx, r8
+ xor rcx, rax
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ lea r14, QWORD PTR [r14+r10]
+ xor rbx, r11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ add r9, QWORD PTR [rsp+48]
+ mov rdx, r15
+ xor rcx, rax
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ add r8, QWORD PTR [rsp+56]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ lea r12, QWORD PTR [r12+r8]
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ ; rnd_all_4: 8-11
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ add r15, QWORD PTR [rsp+64]
+ mov rdx, r13
+ xor rcx, rax
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ add r14, QWORD PTR [rsp+72]
+ mov rbx, r12
+ xor rcx, rax
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ lea r10, QWORD PTR [r10+r14]
+ xor rbx, r15
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ add r13, QWORD PTR [rsp+80]
+ mov rdx, r11
+ xor rcx, rax
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ add r12, QWORD PTR [rsp+88]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ lea r8, QWORD PTR [r8+r12]
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ ; rnd_all_4: 12-15
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ add r11, QWORD PTR [rsp+96]
+ mov rdx, r9
+ xor rcx, rax
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ add r10, QWORD PTR [rsp+104]
+ mov rbx, r8
+ xor rcx, rax
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ lea r14, QWORD PTR [r14+r10]
+ xor rbx, r11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ add r9, QWORD PTR [rsp+112]
+ mov rdx, r15
+ xor rcx, rax
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ add r8, QWORD PTR [rsp+120]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ lea r12, QWORD PTR [r12+r8]
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ ; Fold in the Maj term still pending in rdx from the last round.
+ add r8, rdx
+ ; Add the working variables back into the eight state words at [rdi].
+ add QWORD PTR [rdi], r8
+ add QWORD PTR [rdi+8], r9
+ add QWORD PTR [rdi+16], r10
+ add QWORD PTR [rdi+24], r11
+ add QWORD PTR [rdi+32], r12
+ add QWORD PTR [rdi+40], r13
+ add QWORD PTR [rdi+48], r14
+ add QWORD PTR [rdi+56], r15
+ ; Return value 0 in rax.
+ xor rax, rax
+ ; Zero the upper halves of the ymm registers before returning.
+ vzeroupper
+ ; Restore xmm6-xmm15 from the slots they were saved to on entry, release
+ ; the frame, then pop the saved registers in reverse push order.
+ vmovdqu xmm6, OWORD PTR [rsp+136]
+ vmovdqu xmm7, OWORD PTR [rsp+152]
+ vmovdqu xmm8, OWORD PTR [rsp+168]
+ vmovdqu xmm9, OWORD PTR [rsp+184]
+ vmovdqu xmm10, OWORD PTR [rsp+200]
+ vmovdqu xmm11, OWORD PTR [rsp+216]
+ vmovdqu xmm14, OWORD PTR [rsp+232]
+ vmovdqu xmm13, OWORD PTR [rsp+248]
+ vmovdqu xmm12, OWORD PTR [rsp+264]
+ vmovdqu xmm15, OWORD PTR [rsp+280]
+ add rsp, 296
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha512_AVX2_RORX ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha512_AVX2_RORX_Len PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ mov rdi, rcx
+ mov rsi, rdx
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm14
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm12
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ test sil, 128
+ je L_sha512_len_avx2_rorx_block
+ mov rax, QWORD PTR [rdi+224]
+ push rsi
+ vmovdqu ymm0, YMMWORD PTR [rax]
+ vmovdqu ymm1, YMMWORD PTR [rax+32]
+ vmovdqu ymm2, YMMWORD PTR [rax+64]
+ vmovdqu ymm3, YMMWORD PTR [rax+96]
+ vmovups YMMWORD PTR [rdi+64], ymm0
+ vmovups YMMWORD PTR [rdi+96], ymm1
+ vmovups YMMWORD PTR [rdi+128], ymm2
+ vmovups YMMWORD PTR [rdi+160], ymm3
+ call Transform_Sha512_AVX2_RORX
+ pop rsi
+ add QWORD PTR [rdi+224], 128
+ sub esi, 128
+ jz L_sha512_len_avx2_rorx_done
+L_sha512_len_avx2_rorx_block:
+ sub rsp, 1352
+ mov rax, QWORD PTR [rdi+224]
+ vmovdqu ymm15, YMMWORD PTR L_avx2_rorx_sha512_flip_mask
+ mov r8, QWORD PTR [rdi]
+ mov r9, QWORD PTR [rdi+8]
+ mov r10, QWORD PTR [rdi+16]
+ mov r11, QWORD PTR [rdi+24]
+ mov r12, QWORD PTR [rdi+32]
+ mov r13, QWORD PTR [rdi+40]
+ mov r14, QWORD PTR [rdi+48]
+ mov r15, QWORD PTR [rdi+56]
+ mov DWORD PTR [rsp+1344], esi
+ ; Start of loop processing two blocks
+L_sha512_len_avx2_rorx_begin:
+ mov rsi, rsp
+ mov rbp, QWORD PTR [ptr_L_avx2_rorx_sha512_k_2]
+ mov rbx, r9
+ xor rdx, rdx
+ vmovdqu xmm0, OWORD PTR [rax]
+ vmovdqu xmm1, OWORD PTR [rax+16]
+ vinserti128 ymm0, ymm0, OWORD PTR [rax+128], 1
+ vinserti128 ymm1, ymm1, OWORD PTR [rax+144], 1
+ vpshufb ymm0, ymm0, ymm15
+ vpshufb ymm1, ymm1, ymm15
+ vmovdqu xmm2, OWORD PTR [rax+32]
+ vmovdqu xmm3, OWORD PTR [rax+48]
+ vinserti128 ymm2, ymm2, OWORD PTR [rax+160], 1
+ vinserti128 ymm3, ymm3, OWORD PTR [rax+176], 1
+ vpshufb ymm2, ymm2, ymm15
+ vpshufb ymm3, ymm3, ymm15
+ vmovdqu xmm4, OWORD PTR [rax+64]
+ vmovdqu xmm5, OWORD PTR [rax+80]
+ vinserti128 ymm4, ymm4, OWORD PTR [rax+192], 1
+ vinserti128 ymm5, ymm5, OWORD PTR [rax+208], 1
+ vpshufb ymm4, ymm4, ymm15
+ vpshufb ymm5, ymm5, ymm15
+ vmovdqu xmm6, OWORD PTR [rax+96]
+ vmovdqu xmm7, OWORD PTR [rax+112]
+ vinserti128 ymm6, ymm6, OWORD PTR [rax+224], 1
+ vinserti128 ymm7, ymm7, OWORD PTR [rax+240], 1
+ vpshufb ymm6, ymm6, ymm15
+ vpshufb ymm7, ymm7, ymm15
+ xor rbx, r10
+ ; Start of 16 rounds
+L_sha512_len_avx2_rorx_start:
+ vpaddq ymm8, ymm0, [rbp]
+ vpaddq ymm9, ymm1, [rbp+32]
+ vmovdqu YMMWORD PTR [rsi], ymm8
+ vmovdqu YMMWORD PTR [rsi+32], ymm9
+ vpaddq ymm8, ymm2, [rbp+64]
+ vpaddq ymm9, ymm3, [rbp+96]
+ vmovdqu YMMWORD PTR [rsi+64], ymm8
+ vmovdqu YMMWORD PTR [rsi+96], ymm9
+ vpaddq ymm8, ymm4, [rbp+128]
+ vpaddq ymm9, ymm5, [rbp+160]
+ vmovdqu YMMWORD PTR [rsi+128], ymm8
+ vmovdqu YMMWORD PTR [rsi+160], ymm9
+ vpaddq ymm8, ymm6, [rbp+192]
+ vpaddq ymm9, ymm7, [rbp+224]
+ vmovdqu YMMWORD PTR [rsi+192], ymm8
+ vmovdqu YMMWORD PTR [rsi+224], ymm9
+ ; msg_sched: 0-1
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ vpalignr ymm12, ymm1, ymm0, 8
+ add r15, QWORD PTR [rsi]
+ mov rdx, r13
+ xor rcx, rax
+ vpalignr ymm13, ymm5, ymm4, 8
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm0, ymm13, ymm0
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ vpaddq ymm0, ymm8, ymm0
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ vpsrlq ymm8, ymm7, 19
+ vpsllq ymm9, ymm7, 45
+ add r14, QWORD PTR [rsi+8]
+ mov rbx, r12
+ xor rcx, rax
+ vpsrlq ymm10, ymm7, 61
+ vpsllq ymm11, ymm7, 3
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm7, 6
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ lea r10, QWORD PTR [r10+r14]
+ xor rbx, r15
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ vpaddq ymm0, ymm8, ymm0
+ ; msg_sched done: 0-1
+ ; msg_sched: 4-5
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ vpalignr ymm12, ymm2, ymm1, 8
+ add r13, QWORD PTR [rsi+32]
+ mov rdx, r11
+ xor rcx, rax
+ vpalignr ymm13, ymm6, ymm5, 8
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm1, ymm13, ymm1
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ vpaddq ymm1, ymm8, ymm1
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ vpsrlq ymm8, ymm0, 19
+ vpsllq ymm9, ymm0, 45
+ add r12, QWORD PTR [rsi+40]
+ mov rbx, r10
+ xor rcx, rax
+ vpsrlq ymm10, ymm0, 61
+ vpsllq ymm11, ymm0, 3
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm0, 6
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ lea r8, QWORD PTR [r8+r12]
+ xor rbx, r13
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ vpaddq ymm1, ymm8, ymm1
+ ; msg_sched done: 4-5
+ ; msg_sched: 8-9
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ vpalignr ymm12, ymm3, ymm2, 8
+ add r11, QWORD PTR [rsi+64]
+ mov rdx, r9
+ xor rcx, rax
+ vpalignr ymm13, ymm7, ymm6, 8
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm2, ymm13, ymm2
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ vpaddq ymm2, ymm8, ymm2
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ vpsrlq ymm8, ymm1, 19
+ vpsllq ymm9, ymm1, 45
+ add r10, QWORD PTR [rsi+72]
+ mov rbx, r8
+ xor rcx, rax
+ vpsrlq ymm10, ymm1, 61
+ vpsllq ymm11, ymm1, 3
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm1, 6
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ lea r14, QWORD PTR [r14+r10]
+ xor rbx, r11
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ vpaddq ymm2, ymm8, ymm2
+ ; msg_sched done: 8-9
+ ; msg_sched: 12-13
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ vpalignr ymm12, ymm4, ymm3, 8
+ add r9, QWORD PTR [rsi+96]
+ mov rdx, r15
+ xor rcx, rax
+ vpalignr ymm13, ymm0, ymm7, 8
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm3, ymm13, ymm3
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ vpaddq ymm3, ymm8, ymm3
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ vpsrlq ymm8, ymm2, 19
+ vpsllq ymm9, ymm2, 45
+ add r8, QWORD PTR [rsi+104]
+ mov rbx, r14
+ xor rcx, rax
+ vpsrlq ymm10, ymm2, 61
+ vpsllq ymm11, ymm2, 3
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm2, 6
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ lea r12, QWORD PTR [r12+r8]
+ xor rbx, r9
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ vpaddq ymm3, ymm8, ymm3
+ ; msg_sched done: 12-13
+ ; msg_sched: 16-17
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ vpalignr ymm12, ymm5, ymm4, 8
+ add r15, QWORD PTR [rsi+128]
+ mov rdx, r13
+ xor rcx, rax
+ vpalignr ymm13, ymm1, ymm0, 8
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm4, ymm13, ymm4
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ vpaddq ymm4, ymm8, ymm4
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ vpsrlq ymm8, ymm3, 19
+ vpsllq ymm9, ymm3, 45
+ add r14, QWORD PTR [rsi+136]
+ mov rbx, r12
+ xor rcx, rax
+ vpsrlq ymm10, ymm3, 61
+ vpsllq ymm11, ymm3, 3
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm3, 6
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ lea r10, QWORD PTR [r10+r14]
+ xor rbx, r15
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ vpaddq ymm4, ymm8, ymm4
+ ; msg_sched done: 16-17
+ ; msg_sched: 20-21
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ vpalignr ymm12, ymm6, ymm5, 8
+ add r13, QWORD PTR [rsi+160]
+ mov rdx, r11
+ xor rcx, rax
+ vpalignr ymm13, ymm2, ymm1, 8
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm5, ymm13, ymm5
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ vpaddq ymm5, ymm8, ymm5
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ vpsrlq ymm8, ymm4, 19
+ vpsllq ymm9, ymm4, 45
+ add r12, QWORD PTR [rsi+168]
+ mov rbx, r10
+ xor rcx, rax
+ vpsrlq ymm10, ymm4, 61
+ vpsllq ymm11, ymm4, 3
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm4, 6
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ lea r8, QWORD PTR [r8+r12]
+ xor rbx, r13
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ vpaddq ymm5, ymm8, ymm5
+ ; msg_sched done: 20-21
+ ; msg_sched: 24-25
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ vpalignr ymm12, ymm7, ymm6, 8
+ add r11, QWORD PTR [rsi+192]
+ mov rdx, r9
+ xor rcx, rax
+ vpalignr ymm13, ymm3, ymm2, 8
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm6, ymm13, ymm6
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ vpaddq ymm6, ymm8, ymm6
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ vpsrlq ymm8, ymm5, 19
+ vpsllq ymm9, ymm5, 45
+ add r10, QWORD PTR [rsi+200]
+ mov rbx, r8
+ xor rcx, rax
+ vpsrlq ymm10, ymm5, 61
+ vpsllq ymm11, ymm5, 3
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm5, 6
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ lea r14, QWORD PTR [r14+r10]
+ xor rbx, r11
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ vpaddq ymm6, ymm8, ymm6
+ ; msg_sched done: 24-25
+ ; msg_sched: 28-29
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ vpalignr ymm12, ymm0, ymm7, 8
+ add r9, QWORD PTR [rsi+224]
+ mov rdx, r15
+ xor rcx, rax
+ vpalignr ymm13, ymm4, ymm3, 8
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ vpsrlq ymm8, ymm12, 1
+ vpsllq ymm9, ymm12, 63
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ vpsrlq ymm10, ymm12, 8
+ vpsllq ymm11, ymm12, 56
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ vpsrlq ymm11, ymm12, 7
+ vpxor ymm8, ymm8, ymm10
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ vpxor ymm8, ymm8, ymm11
+ vpaddq ymm7, ymm13, ymm7
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ vpaddq ymm7, ymm8, ymm7
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ vpsrlq ymm8, ymm6, 19
+ vpsllq ymm9, ymm6, 45
+ add r8, QWORD PTR [rsi+232]
+ mov rbx, r14
+ xor rcx, rax
+ vpsrlq ymm10, ymm6, 61
+ vpsllq ymm11, ymm6, 3
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ vpor ymm8, ymm8, ymm9
+ vpor ymm10, ymm10, ymm11
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ vpxor ymm8, ymm8, ymm10
+ vpsrlq ymm11, ymm6, 6
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ lea r12, QWORD PTR [r12+r8]
+ xor rbx, r9
+ vpxor ymm8, ymm8, ymm11
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ vpaddq ymm7, ymm8, ymm7
+ ; msg_sched done: 28-29
+ add rbp, 256
+ add rsi, 256
+ cmp rbp, QWORD PTR [L_avx2_rorx_sha512_k_2_end]
+ jne L_sha512_len_avx2_rorx_start
+ vpaddq ymm8, ymm0, [rbp]
+ vpaddq ymm9, ymm1, [rbp+32]
+ vmovdqu YMMWORD PTR [rsi], ymm8
+ vmovdqu YMMWORD PTR [rsi+32], ymm9
+ vpaddq ymm8, ymm2, [rbp+64]
+ vpaddq ymm9, ymm3, [rbp+96]
+ vmovdqu YMMWORD PTR [rsi+64], ymm8
+ vmovdqu YMMWORD PTR [rsi+96], ymm9
+ vpaddq ymm8, ymm4, [rbp+128]
+ vpaddq ymm9, ymm5, [rbp+160]
+ vmovdqu YMMWORD PTR [rsi+128], ymm8
+ vmovdqu YMMWORD PTR [rsi+160], ymm9
+ vpaddq ymm8, ymm6, [rbp+192]
+ vpaddq ymm9, ymm7, [rbp+224]
+ vmovdqu YMMWORD PTR [rsi+192], ymm8
+ vmovdqu YMMWORD PTR [rsi+224], ymm9
+ ; rnd_all_2: 0-1
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ add r15, QWORD PTR [rsi]
+ mov rdx, r13
+ xor rcx, rax
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ add r14, QWORD PTR [rsi+8]
+ mov rbx, r12
+ xor rcx, rax
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ lea r10, QWORD PTR [r10+r14]
+ xor rbx, r15
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ ; rnd_all_2: 4-5
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ add r13, QWORD PTR [rsi+32]
+ mov rdx, r11
+ xor rcx, rax
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ add r12, QWORD PTR [rsi+40]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ lea r8, QWORD PTR [r8+r12]
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ ; rnd_all_2: 8-9
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ add r11, QWORD PTR [rsi+64]
+ mov rdx, r9
+ xor rcx, rax
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ add r10, QWORD PTR [rsi+72]
+ mov rbx, r8
+ xor rcx, rax
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ lea r14, QWORD PTR [r14+r10]
+ xor rbx, r11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ ; rnd_all_2: 12-13
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ add r9, QWORD PTR [rsi+96]
+ mov rdx, r15
+ xor rcx, rax
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ add r8, QWORD PTR [rsi+104]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ lea r12, QWORD PTR [r12+r8]
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ ; rnd_all_2: 16-17
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ add r15, QWORD PTR [rsi+128]
+ mov rdx, r13
+ xor rcx, rax
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ add r14, QWORD PTR [rsi+136]
+ mov rbx, r12
+ xor rcx, rax
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ lea r10, QWORD PTR [r10+r14]
+ xor rbx, r15
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ ; rnd_all_2: 20-21
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ add r13, QWORD PTR [rsi+160]
+ mov rdx, r11
+ xor rcx, rax
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ add r12, QWORD PTR [rsi+168]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ lea r8, QWORD PTR [r8+r12]
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ ; rnd_all_2: 24-25
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ add r11, QWORD PTR [rsi+192]
+ mov rdx, r9
+ xor rcx, rax
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ add r10, QWORD PTR [rsi+200]
+ mov rbx, r8
+ xor rcx, rax
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ lea r14, QWORD PTR [r14+r10]
+ xor rbx, r11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ ; rnd_all_2: 28-29
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ add r9, QWORD PTR [rsi+224]
+ mov rdx, r15
+ xor rcx, rax
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ add r8, QWORD PTR [rsi+232]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ lea r12, QWORD PTR [r12+r8]
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ add r8, rdx
+ sub rsi, 1024
+ add r8, QWORD PTR [rdi]
+ add r9, QWORD PTR [rdi+8]
+ add r10, QWORD PTR [rdi+16]
+ add r11, QWORD PTR [rdi+24]
+ add r12, QWORD PTR [rdi+32]
+ add r13, QWORD PTR [rdi+40]
+ add r14, QWORD PTR [rdi+48]
+ add r15, QWORD PTR [rdi+56]
+ mov QWORD PTR [rdi], r8
+ mov QWORD PTR [rdi+8], r9
+ mov QWORD PTR [rdi+16], r10
+ mov QWORD PTR [rdi+24], r11
+ mov QWORD PTR [rdi+32], r12
+ mov QWORD PTR [rdi+40], r13
+ mov QWORD PTR [rdi+48], r14
+ mov QWORD PTR [rdi+56], r15
+ mov rbx, r9
+ xor rdx, rdx
+ xor rbx, r10
+ mov rbp, 5
+L_sha512_len_avx2_rorx_tail:
+ ; rnd_all_2: 2-3
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ add r15, QWORD PTR [rsi+16]
+ mov rdx, r13
+ xor rcx, rax
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ add r14, QWORD PTR [rsi+24]
+ mov rbx, r12
+ xor rcx, rax
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ lea r10, QWORD PTR [r10+r14]
+ xor rbx, r15
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ ; rnd_all_2: 6-7
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ add r13, QWORD PTR [rsi+48]
+ mov rdx, r11
+ xor rcx, rax
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ add r12, QWORD PTR [rsi+56]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ lea r8, QWORD PTR [r8+r12]
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ ; rnd_all_2: 10-11
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ add r11, QWORD PTR [rsi+80]
+ mov rdx, r9
+ xor rcx, rax
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ add r10, QWORD PTR [rsi+88]
+ mov rbx, r8
+ xor rcx, rax
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ lea r14, QWORD PTR [r14+r10]
+ xor rbx, r11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ ; rnd_all_2: 14-15
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ add r9, QWORD PTR [rsi+112]
+ mov rdx, r15
+ xor rcx, rax
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ add r8, QWORD PTR [rsi+120]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ lea r12, QWORD PTR [r12+r8]
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ ; rnd_all_2: 18-19
+ rorx rax, r12, 14
+ rorx rcx, r12, 18
+ add r8, rdx
+ add r15, QWORD PTR [rsi+144]
+ mov rdx, r13
+ xor rcx, rax
+ xor rdx, r14
+ rorx rax, r12, 41
+ xor rax, rcx
+ and rdx, r12
+ add r15, rax
+ rorx rax, r8, 28
+ rorx rcx, r8, 34
+ xor rdx, r14
+ xor rcx, rax
+ rorx rax, r8, 39
+ add r15, rdx
+ xor rax, rcx
+ mov rdx, r9
+ add r11, r15
+ xor rdx, r8
+ and rbx, rdx
+ add r15, rax
+ xor rbx, r9
+ rorx rax, r11, 14
+ rorx rcx, r11, 18
+ add r15, rbx
+ add r14, QWORD PTR [rsi+152]
+ mov rbx, r12
+ xor rcx, rax
+ xor rbx, r13
+ rorx rax, r11, 41
+ xor rax, rcx
+ and rbx, r11
+ add r14, rax
+ rorx rax, r15, 28
+ rorx rcx, r15, 34
+ xor rbx, r13
+ xor rcx, rax
+ rorx rax, r15, 39
+ add r14, rbx
+ xor rax, rcx
+ mov rbx, r8
+ lea r10, QWORD PTR [r10+r14]
+ xor rbx, r15
+ and rdx, rbx
+ add r14, rax
+ xor rdx, r8
+ ; rnd_all_2: 22-23
+ rorx rax, r10, 14
+ rorx rcx, r10, 18
+ add r14, rdx
+ add r13, QWORD PTR [rsi+176]
+ mov rdx, r11
+ xor rcx, rax
+ xor rdx, r12
+ rorx rax, r10, 41
+ xor rax, rcx
+ and rdx, r10
+ add r13, rax
+ rorx rax, r14, 28
+ rorx rcx, r14, 34
+ xor rdx, r12
+ xor rcx, rax
+ rorx rax, r14, 39
+ add r13, rdx
+ xor rax, rcx
+ mov rdx, r15
+ add r9, r13
+ xor rdx, r14
+ and rbx, rdx
+ add r13, rax
+ xor rbx, r15
+ rorx rax, r9, 14
+ rorx rcx, r9, 18
+ add r13, rbx
+ add r12, QWORD PTR [rsi+184]
+ mov rbx, r10
+ xor rcx, rax
+ xor rbx, r11
+ rorx rax, r9, 41
+ xor rax, rcx
+ and rbx, r9
+ add r12, rax
+ rorx rax, r13, 28
+ rorx rcx, r13, 34
+ xor rbx, r11
+ xor rcx, rax
+ rorx rax, r13, 39
+ add r12, rbx
+ xor rax, rcx
+ mov rbx, r14
+ lea r8, QWORD PTR [r8+r12]
+ xor rbx, r13
+ and rdx, rbx
+ add r12, rax
+ xor rdx, r14
+ ; rnd_all_2: 26-27
+ rorx rax, r8, 14
+ rorx rcx, r8, 18
+ add r12, rdx
+ add r11, QWORD PTR [rsi+208]
+ mov rdx, r9
+ xor rcx, rax
+ xor rdx, r10
+ rorx rax, r8, 41
+ xor rax, rcx
+ and rdx, r8
+ add r11, rax
+ rorx rax, r12, 28
+ rorx rcx, r12, 34
+ xor rdx, r10
+ xor rcx, rax
+ rorx rax, r12, 39
+ add r11, rdx
+ xor rax, rcx
+ mov rdx, r13
+ add r15, r11
+ xor rdx, r12
+ and rbx, rdx
+ add r11, rax
+ xor rbx, r13
+ rorx rax, r15, 14
+ rorx rcx, r15, 18
+ add r11, rbx
+ add r10, QWORD PTR [rsi+216]
+ mov rbx, r8
+ xor rcx, rax
+ xor rbx, r9
+ rorx rax, r15, 41
+ xor rax, rcx
+ and rbx, r15
+ add r10, rax
+ rorx rax, r11, 28
+ rorx rcx, r11, 34
+ xor rbx, r9
+ xor rcx, rax
+ rorx rax, r11, 39
+ add r10, rbx
+ xor rax, rcx
+ mov rbx, r12
+ lea r14, QWORD PTR [r14+r10]
+ xor rbx, r11
+ and rdx, rbx
+ add r10, rax
+ xor rdx, r12
+ ; rnd_all_2: 30-31
+ rorx rax, r14, 14
+ rorx rcx, r14, 18
+ add r10, rdx
+ add r9, QWORD PTR [rsi+240]
+ mov rdx, r15
+ xor rcx, rax
+ xor rdx, r8
+ rorx rax, r14, 41
+ xor rax, rcx
+ and rdx, r14
+ add r9, rax
+ rorx rax, r10, 28
+ rorx rcx, r10, 34
+ xor rdx, r8
+ xor rcx, rax
+ rorx rax, r10, 39
+ add r9, rdx
+ xor rax, rcx
+ mov rdx, r11
+ add r13, r9
+ xor rdx, r10
+ and rbx, rdx
+ add r9, rax
+ xor rbx, r11
+ rorx rax, r13, 14
+ rorx rcx, r13, 18
+ add r9, rbx
+ add r8, QWORD PTR [rsi+248]
+ mov rbx, r14
+ xor rcx, rax
+ xor rbx, r15
+ rorx rax, r13, 41
+ xor rax, rcx
+ and rbx, r13
+ add r8, rax
+ rorx rax, r9, 28
+ rorx rcx, r9, 34
+ xor rbx, r15
+ xor rcx, rax
+ rorx rax, r9, 39
+ add r8, rbx
+ xor rax, rcx
+ mov rbx, r10
+ lea r12, QWORD PTR [r12+r8]
+ xor rbx, r9
+ and rdx, rbx
+ add r8, rax
+ xor rdx, r10
+ add rsi, 256
+ sub rbp, 1
+ jnz L_sha512_len_avx2_rorx_tail
+ add r8, rdx
+ add r8, QWORD PTR [rdi]
+ add r9, QWORD PTR [rdi+8]
+ add r10, QWORD PTR [rdi+16]
+ add r11, QWORD PTR [rdi+24]
+ add r12, QWORD PTR [rdi+32]
+ add r13, QWORD PTR [rdi+40]
+ add r14, QWORD PTR [rdi+48]
+ add r15, QWORD PTR [rdi+56]
+ mov rax, QWORD PTR [rdi+224]
+ add rax, 256
+ sub DWORD PTR [rsp+1344], 256
+ mov QWORD PTR [rdi+224], rax
+ mov QWORD PTR [rdi], r8
+ mov QWORD PTR [rdi+8], r9
+ mov QWORD PTR [rdi+16], r10
+ mov QWORD PTR [rdi+24], r11
+ mov QWORD PTR [rdi+32], r12
+ mov QWORD PTR [rdi+40], r13
+ mov QWORD PTR [rdi+48], r14
+ mov QWORD PTR [rdi+56], r15
+ jnz L_sha512_len_avx2_rorx_begin
+ add rsp, 1352
+L_sha512_len_avx2_rorx_done:
+ xor rax, rax
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm14, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm12, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ pop rbp
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+Transform_Sha512_AVX2_RORX_Len ENDP
+_TEXT ENDS
+ENDIF
+END
diff --git a/wolfcrypt/src/wc_mldsa_asm.S b/wolfcrypt/src/wc_mldsa_asm.S
index e1e77a93783..db09680752f 100644
--- a/wolfcrypt/src/wc_mldsa_asm.S
+++ b/wolfcrypt/src/wc_mldsa_asm.S
@@ -22755,7 +22755,7 @@ _wc_mldsa_decode_t0_avx2:
vpsubd %ymm4, %ymm15, %ymm4
vmovdqu %ymm4, 32(%rsi)
# 3/32
- vperm2i128 $0x21, %ymm1, %ymm0, %ymm0
+ vperm2i128 $33, %ymm1, %ymm0, %ymm0
vpermq $0xe9, %ymm0, %ymm4
vpshufb %ymm7, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22770,7 +22770,7 @@ _wc_mldsa_decode_t0_avx2:
vpsubd %ymm4, %ymm15, %ymm4
vmovdqu %ymm4, 96(%rsi)
# 5/32
- vperm2i128 $0x21, %ymm2, %ymm1, %ymm1
+ vperm2i128 $33, %ymm2, %ymm1, %ymm1
vpermq $0x94, %ymm1, %ymm4
vpshufb %ymm9, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22792,7 +22792,7 @@ _wc_mldsa_decode_t0_avx2:
vpsubd %ymm4, %ymm15, %ymm4
vmovdqu %ymm4, 192(%rsi)
# 8/32
- vperm2i128 $0x21, %ymm3, %ymm2, %ymm2
+ vperm2i128 $33, %ymm3, %ymm2, %ymm2
vpermq $0xe9, %ymm2, %ymm4
vpshufb %ymm12, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22809,7 +22809,7 @@ _wc_mldsa_decode_t0_avx2:
# 10/32
vmovdqu 128(%rdi), %ymm0
vmovdqu 160(%rdi), %ymm1
- vperm2i128 $0x21, %ymm0, %ymm3, %ymm3
+ vperm2i128 $33, %ymm0, %ymm3, %ymm3
vpermq $0x94, %ymm3, %ymm4
vpshufb %ymm6, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22831,7 +22831,7 @@ _wc_mldsa_decode_t0_avx2:
vpsubd %ymm4, %ymm15, %ymm4
vmovdqu %ymm4, 352(%rsi)
# 13/32
- vperm2i128 $0x21, %ymm1, %ymm0, %ymm0
+ vperm2i128 $33, %ymm1, %ymm0, %ymm0
vpermq $0xe9, %ymm0, %ymm4
vpshufb %ymm9, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22848,7 +22848,7 @@ _wc_mldsa_decode_t0_avx2:
# 15/32
vmovdqu 192(%rdi), %ymm2
vmovdqu 224(%rdi), %ymm3
- vperm2i128 $0x21, %ymm2, %ymm1, %ymm1
+ vperm2i128 $33, %ymm2, %ymm1, %ymm1
vpermq $0x94, %ymm1, %ymm4
vpshufb %ymm11, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22870,7 +22870,7 @@ _wc_mldsa_decode_t0_avx2:
vpsubd %ymm4, %ymm15, %ymm4
vmovdqu %ymm4, 512(%rsi)
# 18/32
- vperm2i128 $0x21, %ymm3, %ymm2, %ymm2
+ vperm2i128 $33, %ymm3, %ymm2, %ymm2
vpermq $0xe9, %ymm2, %ymm4
vpshufb %ymm6, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22887,7 +22887,7 @@ _wc_mldsa_decode_t0_avx2:
# 20/32
vmovdqu 256(%rdi), %ymm0
vmovdqu 288(%rdi), %ymm1
- vperm2i128 $0x21, %ymm0, %ymm3, %ymm3
+ vperm2i128 $33, %ymm0, %ymm3, %ymm3
vpermq $0x94, %ymm3, %ymm4
vpshufb %ymm8, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22909,7 +22909,7 @@ _wc_mldsa_decode_t0_avx2:
vpsubd %ymm4, %ymm15, %ymm4
vmovdqu %ymm4, 672(%rsi)
# 23/32
- vperm2i128 $0x21, %ymm1, %ymm0, %ymm0
+ vperm2i128 $33, %ymm1, %ymm0, %ymm0
vpermq $0xe9, %ymm0, %ymm4
vpshufb %ymm11, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22926,7 +22926,7 @@ _wc_mldsa_decode_t0_avx2:
# 25/32
vmovdqu 320(%rdi), %ymm2
vmovdqu 352(%rdi), %ymm3
- vperm2i128 $0x21, %ymm2, %ymm1, %ymm1
+ vperm2i128 $33, %ymm2, %ymm1, %ymm1
vpermq $0x99, %ymm1, %ymm4
vpshufb %ymm5, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22941,14 +22941,14 @@ _wc_mldsa_decode_t0_avx2:
vpsubd %ymm4, %ymm15, %ymm4
vmovdqu %ymm4, 800(%rsi)
# 27/32
- vpermq $0x3e, %ymm2, %ymm4
+ vpermq $62, %ymm2, %ymm4
vpshufb %ymm7, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
vpand %ymm14, %ymm4, %ymm4
vpsubd %ymm4, %ymm15, %ymm4
vmovdqu %ymm4, 832(%rsi)
# 28/32
- vperm2i128 $0x21, %ymm3, %ymm2, %ymm2
+ vperm2i128 $33, %ymm3, %ymm2, %ymm2
vpermq $0xe9, %ymm2, %ymm4
vpshufb %ymm8, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22965,7 +22965,7 @@ _wc_mldsa_decode_t0_avx2:
# 30/32
vmovdqu 384(%rdi), %ymm0
vmovdqu 416(%rdi), %ymm1
- vperm2i128 $0x21, %ymm0, %ymm3, %ymm3
+ vperm2i128 $33, %ymm0, %ymm3, %ymm3
vpermq $0x99, %ymm3, %ymm4
vpshufb %ymm10, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
@@ -22980,7 +22980,7 @@ _wc_mldsa_decode_t0_avx2:
vpsubd %ymm4, %ymm15, %ymm4
vmovdqu %ymm4, 960(%rsi)
# 32/32
- vpermq $0x3e, %ymm0, %ymm4
+ vpermq $62, %ymm0, %ymm4
vpshufb %ymm12, %ymm4, %ymm4
vpsrlvd %ymm13, %ymm4, %ymm4
vpand %ymm14, %ymm4, %ymm4
@@ -23115,14 +23115,14 @@ _wc_mldsa_decode_t1_avx2:
vpslld $13, %ymm4, %ymm4
vmovdqu %ymm4, 32(%rsi)
# 3/32
- vpermq $0x3e, %ymm0, %ymm4
+ vpermq $62, %ymm0, %ymm4
vpshufb %ymm7, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
vpand %ymm10, %ymm4, %ymm4
vpslld $13, %ymm4, %ymm4
vmovdqu %ymm4, 64(%rsi)
# 4/32
- vperm2i128 $0x21, %ymm1, %ymm0, %ymm0
+ vperm2i128 $33, %ymm1, %ymm0, %ymm0
vpermq $0xe9, %ymm0, %ymm4
vpshufb %ymm8, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
@@ -23144,7 +23144,7 @@ _wc_mldsa_decode_t1_avx2:
vpslld $13, %ymm4, %ymm4
vmovdqu %ymm4, 160(%rsi)
# 7/32
- vperm2i128 $0x21, %ymm2, %ymm1, %ymm1
+ vperm2i128 $33, %ymm2, %ymm1, %ymm1
vpermq $0xe9, %ymm1, %ymm4
vpshufb %ymm7, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
@@ -23166,7 +23166,7 @@ _wc_mldsa_decode_t1_avx2:
vpslld $13, %ymm4, %ymm4
vmovdqu %ymm4, 256(%rsi)
# 10/32
- vperm2i128 $0x21, %ymm3, %ymm2, %ymm2
+ vperm2i128 $33, %ymm3, %ymm2, %ymm2
vpermq $0x99, %ymm2, %ymm4
vpshufb %ymm6, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
@@ -23190,7 +23190,7 @@ _wc_mldsa_decode_t1_avx2:
# 13/32
vmovdqu 128(%rdi), %ymm0
vmovdqu 160(%rdi), %ymm1
- vperm2i128 $0x21, %ymm0, %ymm3, %ymm3
+ vperm2i128 $33, %ymm0, %ymm3, %ymm3
vpermq $0x99, %ymm3, %ymm4
vpshufb %ymm5, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
@@ -23212,7 +23212,7 @@ _wc_mldsa_decode_t1_avx2:
vpslld $13, %ymm4, %ymm4
vmovdqu %ymm4, 448(%rsi)
# 16/32
- vpermq $0x3e, %ymm0, %ymm4
+ vpermq $62, %ymm0, %ymm4
vpshufb %ymm8, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
vpand %ymm10, %ymm4, %ymm4
@@ -23233,7 +23233,7 @@ _wc_mldsa_decode_t1_avx2:
vpslld $13, %ymm4, %ymm4
vmovdqu %ymm4, 544(%rsi)
# 19/32
- vpermq $0x3e, %ymm1, %ymm4
+ vpermq $62, %ymm1, %ymm4
vpshufb %ymm7, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
vpand %ymm10, %ymm4, %ymm4
@@ -23242,7 +23242,7 @@ _wc_mldsa_decode_t1_avx2:
# 20/32
vmovdqu 192(%rdi), %ymm2
vmovdqu 224(%rdi), %ymm3
- vperm2i128 $0x21, %ymm2, %ymm1, %ymm1
+ vperm2i128 $33, %ymm2, %ymm1, %ymm1
vpermq $0xe9, %ymm1, %ymm4
vpshufb %ymm8, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
@@ -23264,7 +23264,7 @@ _wc_mldsa_decode_t1_avx2:
vpslld $13, %ymm4, %ymm4
vmovdqu %ymm4, 672(%rsi)
# 23/32
- vperm2i128 $0x21, %ymm3, %ymm2, %ymm2
+ vperm2i128 $33, %ymm3, %ymm2, %ymm2
vpermq $0xe9, %ymm2, %ymm4
vpshufb %ymm7, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
@@ -23288,7 +23288,7 @@ _wc_mldsa_decode_t1_avx2:
# 26/32
vmovdqu 256(%rdi), %ymm0
vmovdqu 288(%rdi), %ymm1
- vperm2i128 $0x21, %ymm0, %ymm3, %ymm3
+ vperm2i128 $33, %ymm0, %ymm3, %ymm3
vpermq $0x99, %ymm3, %ymm4
vpshufb %ymm6, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
@@ -23310,7 +23310,7 @@ _wc_mldsa_decode_t1_avx2:
vpslld $13, %ymm4, %ymm4
vmovdqu %ymm4, 864(%rsi)
# 29/32
- vperm2i128 $0x21, %ymm1, %ymm0, %ymm0
+ vperm2i128 $33, %ymm1, %ymm0, %ymm0
vpermq $0x99, %ymm0, %ymm4
vpshufb %ymm5, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
@@ -23332,7 +23332,7 @@ _wc_mldsa_decode_t1_avx2:
vpslld $13, %ymm4, %ymm4
vmovdqu %ymm4, 960(%rsi)
# 32/32
- vpermq $0x3e, %ymm1, %ymm4
+ vpermq $62, %ymm1, %ymm4
vpshufb %ymm8, %ymm4, %ymm4
vpsrlvd %ymm9, %ymm4, %ymm4
vpand %ymm10, %ymm4, %ymm4
diff --git a/wolfcrypt/src/wc_mldsa_asm.asm b/wolfcrypt/src/wc_mldsa_asm.asm
new file mode 100644
index 00000000000..16ad868c23f
--- /dev/null
+++ b/wolfcrypt/src/wc_mldsa_asm.asm
@@ -0,0 +1,34618 @@
+; /* wc_mldsa_asm.asm */
+; /*
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
+
+IF @Version LT 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT
+HAVE_INTEL_AVX2 = 1
+ENDIF
+
+IFNDEF _WIN64
+_WIN64 = 1
+ENDIF
+
+IFDEF WOLFSSL_HAVE_MLDSA
+IFDEF HAVE_INTEL_AVX2
+_DATA SEGMENT
+ALIGN 16
+mldsa_q DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h
+ DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h
+ptr_mldsa_q QWORD mldsa_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+mldsa_qinv DWORD 03802001h, 03802001h, 03802001h, 03802001h
+ DWORD 03802001h, 03802001h, 03802001h, 03802001h
+ptr_mldsa_qinv QWORD mldsa_qinv
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+mldsa_v DWORD 00400000h, 00400000h, 00400000h, 00400000h
+ DWORD 00400000h, 00400000h, 00400000h, 00400000h
+ptr_mldsa_v QWORD mldsa_v
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_avx2_zetas DWORD 000064f7h, 000064f7h, 000064f7h, 000064f7h
+ DWORD 000064f7h, 000064f7h, 000064f7h, 000064f7h
+ DWORD 6d1f44f7h, 6d1f44f7h, 6d1f44f7h, 6d1f44f7h
+ DWORD 6d1f44f7h, 6d1f44f7h, 6d1f44f7h, 6d1f44f7h
+ DWORD 0ffd83102h, 0ffd83102h, 0ffd83102h, 0ffd83102h
+ DWORD 0ffd83102h, 0ffd83102h, 0ffd83102h, 0ffd83102h
+ DWORD 8cf87102h, 8cf87102h, 8cf87102h, 8cf87102h
+ DWORD 8cf87102h, 8cf87102h, 8cf87102h, 8cf87102h
+ DWORD 0fff81503h, 0fff81503h, 0fff81503h, 0fff81503h
+ DWORD 0fff81503h, 0fff81503h, 0fff81503h, 0fff81503h
+ DWORD 8d187503h, 8d187503h, 8d187503h, 8d187503h
+ DWORD 8d187503h, 8d187503h, 8d187503h, 8d187503h
+ DWORD 00039e44h, 00039e44h, 00039e44h, 00039e44h
+ DWORD 00039e44h, 00039e44h, 00039e44h, 00039e44h
+ DWORD 61cc1e44h, 61cc1e44h, 61cc1e44h, 61cc1e44h
+ DWORD 61cc1e44h, 61cc1e44h, 61cc1e44h, 61cc1e44h
+ DWORD 001bde2bh, 001bde2bh, 001bde2bh, 001bde2bh
+ DWORD 001bde2bh, 001bde2bh, 001bde2bh, 001bde2bh
+ DWORD 12613e2bh, 12613e2bh, 12613e2bh, 12613e2bh
+ DWORD 12613e2bh, 12613e2bh, 12613e2bh, 12613e2bh
+ DWORD 0023e92bh, 0023e92bh, 0023e92bh, 0023e92bh
+ DWORD 0023e92bh, 0023e92bh, 0023e92bh, 0023e92bh
+ DWORD 93c9492bh, 93c9492bh, 93c9492bh, 93c9492bh
+ DWORD 93c9492bh, 93c9492bh, 93c9492bh, 93c9492bh
+ DWORD 00299658h, 00299658h, 00299658h, 00299658h
+ DWORD 00299658h, 00299658h, 00299658h, 00299658h
+ DWORD 66f49658h, 66f49658h, 66f49658h, 66f49658h
+ DWORD 66f49658h, 66f49658h, 66f49658h, 66f49658h
+ DWORD 000fa070h, 000fa070h, 000fa070h, 000fa070h
+ DWORD 000fa070h, 000fa070h, 000fa070h, 000fa070h
+ DWORD 7c1da070h, 7c1da070h, 7c1da070h, 7c1da070h
+ DWORD 7c1da070h, 7c1da070h, 7c1da070h, 7c1da070h
+ DWORD 0ffef85a4h, 0ffef85a4h, 0ffef85a4h, 0ffef85a4h
+ DWORD 0ffef85a4h, 0ffef85a4h, 0ffef85a4h, 0ffef85a4h
+ DWORD 0aea405a4h, 0aea405a4h, 0aea405a4h, 0aea405a4h
+ DWORD 0aea405a4h, 0aea405a4h, 0aea405a4h, 0aea405a4h
+ DWORD 0036b788h, 0036b788h, 0036b788h, 0036b788h
+ DWORD 0036b788h, 0036b788h, 0036b788h, 0036b788h
+ DWORD 3327b788h, 3327b788h, 3327b788h, 3327b788h
+ DWORD 3327b788h, 3327b788h, 3327b788h, 3327b788h
+ DWORD 00294a67h, 00294a67h, 00294a67h, 00294a67h
+ DWORD 00017620h, 00017620h, 00017620h, 00017620h
+ DWORD 91f62a67h, 91f62a67h, 91f62a67h, 91f62a67h
+ DWORD 9ec57620h, 9ec57620h, 9ec57620h, 9ec57620h
+ DWORD 002ef4cdh, 002ef4cdh, 002ef4cdh, 002ef4cdh
+ DWORD 0035dec5h, 0035dec5h, 0035dec5h, 0035dec5h
+ DWORD 0ac4894cdh, 0ac4894cdh, 0ac4894cdh, 0ac4894cdh
+ DWORD 6d8e7ec5h, 6d8e7ec5h, 6d8e7ec5h, 6d8e7ec5h
+ DWORD 0ffc406e5h, 0ffc406e5h, 0ffe8ac81h, 0ffe8ac81h
+ DWORD 0ffc7e1cfh, 0ffc7e1cfh, 0ffd19819h, 0ffd19819h
+ DWORD 0a220a6e5h, 0a220a6e5h, 0d8f8cc81h, 0d8f8cc81h
+ DWORD 5081c1cfh, 5081c1cfh, 8a54b819h, 8a54b819h
+ DWORD 0ffe9d65dh, 0ffe9d65dh, 003509eeh, 003509eeh
+ DWORD 002135c7h, 002135c7h, 0ffe7cfbbh, 0ffe7cfbbh
+ DWORD 8035765dh, 8035765dh, 6272c9eeh, 6272c9eeh
+ DWORD 5f5a15c7h, 5f5a15c7h, 085f2fbbh, 085f2fbbh
+ DWORD 0ffe6a503h, 0ffe6a503h, 0ffe6a503h, 0ffe6a503h
+ DWORD 0ffc9302ch, 0ffc9302ch, 0ffc9302ch, 0ffc9302ch
+ DWORD 5f070503h, 5f070503h, 5f070503h, 5f070503h
+ DWORD 0bfceb02ch, 0bfceb02ch, 0bfceb02ch, 0bfceb02ch
+ DWORD 0ffd947d4h, 0ffd947d4h, 0ffd947d4h, 0ffd947d4h
+ DWORD 003bbeafh, 003bbeafh, 003bbeafh, 003bbeafh
+ DWORD 8ed3c7d4h, 8ed3c7d4h, 8ed3c7d4h, 8ed3c7d4h
+ DWORD 0dc919eafh, 0dc919eafh, 0dc919eafh, 0dc919eafh
+ DWORD 0ffeccf75h, 0ffeccf75h, 001d9772h, 001d9772h
+ DWORD 0ffc1b072h, 0ffc1b072h, 0fff0bcf6h, 0fff0bcf6h
+ DWORD 0b35b6f75h, 0b35b6f75h, 0c20bd772h, 0c20bd772h
+ DWORD 0c4cff072h, 0c4cff072h, 748f7cf6h, 748f7cf6h
+ DWORD 0ffcf5280h, 0ffcf5280h, 0ffcfd2aeh, 0ffcfd2aeh
+ DWORD 0ffc890e0h, 0ffc890e0h, 0001efcah, 0001efcah
+ DWORD 0aa1f5280h, 0aa1f5280h, 5b2592aeh, 5b2592aeh
+ DWORD 21e490e0h, 21e490e0h, 80fb2fcah, 80fb2fcah
+ DWORD 001fea93h, 0033ff5ah, 002358d4h, 003a41f8h
+ DWORD 0ffccff72h, 00223dfbh, 0ffdaab9fh, 0ffc9a422h
+ DWORD 0fff24a93h, 3b1f3f5ah, 513dd8d4h, 2c7941f8h
+ DWORD 0aebb3f72h, 36619dfbh, 01ce8b9fh, 0ab4de422h
+ DWORD 000412f5h, 00252587h, 0ffed24f0h, 00359b5dh
+ DWORD 0ffca48a0h, 0ffc6a2fch, 0ffedbb56h, 0ffcf45deh
+ DWORD 0dbe2b2f5h, 0fd560587h, 0ec8b24f0h, 79213b5dh
+ DWORD 78de48a0h, 462622fch, 64587b56h, 718b05deh
+ DWORD 000dbe5eh, 001c5e1ah, 000de0e6h, 000c7f5ah
+ DWORD 00078f83h, 0ffe7628ah, 0ffff5704h, 0fff806fch
+ DWORD 00d97e5eh, 0e6df9e1ah, 0e12aa0e6h, 4af7bf5ah
+ DWORD 3c77ef83h, 0cf38a28ah, 78dfd704h, 72d786fch
+ DWORD 0fff60021h, 0ffd05af6h, 001f0084h, 0030ef86h
+ DWORD 0ffc9b97dh, 0fff7fcd6h, 0fff44592h, 0ffc921c2h
+ DWORD 337a2021h, 682f1af6h, 0ae2f8084h, 7321af86h
+ DWORD 6c79597dh, 0ec92bcd6h, 07a68592h, 4b0161c2h
+ DWORD 0fff42118h, 0fff42118h, 0fff42118h, 0fff42118h
+ DWORD 0fff42118h, 0fff42118h, 0fff42118h, 0fff42118h
+ DWORD 58172118h, 58172118h, 58172118h, 58172118h
+ DWORD 58172118h, 58172118h, 58172118h, 58172118h
+ DWORD 0fffa84adh, 0fffa84adh, 0fffa84adh, 0fffa84adh
+ DWORD 0fffa84adh, 0fffa84adh, 0fffa84adh, 0fffa84adh
+ DWORD 0ae1024adh, 0ae1024adh, 0ae1024adh, 0ae1024adh
+ DWORD 0ae1024adh, 0ae1024adh, 0ae1024adh, 0ae1024adh
+ DWORD 0ffe0147fh, 0ffe0147fh, 0ffe0147fh, 0ffe0147fh
+ DWORD 0ffe0147fh, 0ffe0147fh, 0ffe0147fh, 0ffe0147fh
+ DWORD 0beeff47fh, 0beeff47fh, 0beeff47fh, 0beeff47fh
+ DWORD 0beeff47fh, 0beeff47fh, 0beeff47fh, 0beeff47fh
+ DWORD 0fff79d90h, 0fff79d90h, 0fff79d90h, 0fff79d90h
+ DWORD 0fff79d90h, 0fff79d90h, 0fff79d90h, 0fff79d90h
+ DWORD 6ba99d90h, 6ba99d90h, 6ba99d90h, 6ba99d90h
+ DWORD 6ba99d90h, 6ba99d90h, 6ba99d90h, 6ba99d90h
+ DWORD 0ffeeeaa0h, 0ffeeeaa0h, 0ffeeeaa0h, 0ffeeeaa0h
+ DWORD 0ffeeeaa0h, 0ffeeeaa0h, 0ffeeeaa0h, 0ffeeeaa0h
+ DWORD 0d42eaa0h, 0d42eaa0h, 0d42eaa0h, 0d42eaa0h
+ DWORD 0d42eaa0h, 0d42eaa0h, 0d42eaa0h, 0d42eaa0h
+ DWORD 0027f968h, 0027f968h, 0027f968h, 0027f968h
+ DWORD 0027f968h, 0027f968h, 0027f968h, 0027f968h
+ DWORD 0eb54f968h, 0eb54f968h, 0eb54f968h, 0eb54f968h
+ DWORD 0eb54f968h, 0eb54f968h, 0eb54f968h, 0eb54f968h
+ DWORD 0ffdfd37bh, 0ffdfd37bh, 0ffdfd37bh, 0ffdfd37bh
+ DWORD 0ffdfd37bh, 0ffdfd37bh, 0ffdfd37bh, 0ffdfd37bh
+ DWORD 28cf337bh, 28cf337bh, 28cf337bh, 28cf337bh
+ DWORD 28cf337bh, 28cf337bh, 28cf337bh, 28cf337bh
+ DWORD 0ffc51585h, 0ffc51585h, 0ffc51585h, 0ffc51585h
+ DWORD 0ffd18e7ch, 0ffd18e7ch, 0ffd18e7ch, 0ffd18e7ch
+ DWORD 0f3f5b585h, 0f3f5b585h, 0f3f5b585h, 0f3f5b585h
+ DWORD 0e3a10e7ch, 0e3a10e7ch, 0e3a10e7ch, 0e3a10e7ch
+ DWORD 00368a96h, 00368a96h, 00368a96h, 00368a96h
+ DWORD 0ffd43e41h, 0ffd43e41h, 0ffd43e41h, 0ffd43e41h
+ DWORD 0de894a96h, 0de894a96h, 0de894a96h, 0de894a96h
+ DWORD 6b1c5e41h, 6b1c5e41h, 6b1c5e41h, 6b1c5e41h
+ DWORD 003410f2h, 003410f2h, 0fff0fe85h, 0fff0fe85h
+ DWORD 0020c638h, 0020c638h, 00296e9fh, 00296e9fh
+ DWORD 0d15250f2h, 0d15250f2h, 0f1419e85h, 0f1419e85h
+ DWORD 0dce7c638h, 0dce7c638h, 5a7d4e9fh, 5a7d4e9fh
+ DWORD 0ffd2b7a3h, 0ffd2b7a3h, 0ffc7a44bh, 0ffc7a44bh
+ DWORD 0fff9ba6dh, 0fff9ba6dh, 0ffda3409h, 0ffda3409h
+ DWORD 114717a3h, 114717a3h, 0fad1044bh, 0fad1044bh
+ DWORD 0b4c75a6dh, 0b4c75a6dh, 65db5409h, 65db5409h
+ DWORD 00360400h, 00360400h, 00360400h, 00360400h
+ DWORD 0fffb6a4dh, 0fffb6a4dh, 0fffb6a4dh, 0fffb6a4dh
+ DWORD 0c0b60400h, 0c0b60400h, 0c0b60400h, 0c0b60400h
+ DWORD 7ac50a4dh, 7ac50a4dh, 7ac50a4dh, 7ac50a4dh
+ DWORD 0023d69ch, 0023d69ch, 0023d69ch, 0023d69ch
+ DWORD 0fff7c55dh, 0fff7c55dh, 0fff7c55dh, 0fff7c55dh
+ DWORD 9cf7569ch, 9cf7569ch, 9cf7569ch, 9cf7569ch
+ DWORD 0be23655dh, 0be23655dh, 0be23655dh, 0be23655dh
+ DWORD 0fff5c282h, 0fff5c282h, 0ffed4113h, 0ffed4113h
+ DWORD 0ffffa63bh, 0ffffa63bh, 0ffec09f7h, 0ffec09f7h
+ DWORD 7f460282h, 7f460282h, 6a8fa113h, 6a8fa113h
+ DWORD 0c347063bh, 0c347063bh, 61aae9f7h, 61aae9f7h
+ DWORD 0fffa2bddh, 0fffa2bddh, 001495d4h, 001495d4h
+ DWORD 001c4563h, 001c4563h, 0ffea2c62h, 0ffea2c62h
+ DWORD 0caf5cbddh, 0caf5cbddh, 0f8cf15d4h, 0f8cf15d4h
+ DWORD 6348a563h, 6348a563h, 9c766c62h, 9c766c62h
+ DWORD 00053919h, 0004610ch, 0ffdacd41h, 003eb01bh
+ DWORD 003472e7h, 0ffcd003bh, 001a7cc7h, 00031924h
+ DWORD 7ea85919h, 3625e10ch, 0bd02ed41h, 34c2101bh
+ DWORD 0b71152e7h, 6e54603bh, 08335cc7h, 61279924h
+ DWORD 002b5ee5h, 00291199h, 0ffd87a3ah, 00134d71h
+ DWORD 003de11ch, 00130984h, 0025f051h, 00185a46h
+ DWORD 8d87fee5h, 0b9dc3199h, 0da1fba3ah, 75416d71h
+ DWORD 9e61611ch, 0af438984h, 0d9b01051h, 00611a46h
+ DWORD 0ffc68518h, 001314beh, 00283891h, 0ffc9db90h
+ DWORD 0ffd25089h, 001c853fh, 001d0b4bh, 0ffeff6a6h
+ DWORD 0a4698518h, 0fbaad4beh, 02ba5891h, 0b33bdb90h
+ DWORD 29637089h, 0ed44653fh, 28066b4bh, 43c4b6a6h
+ DWORD 0ffeba8beh, 0012e11bh, 0ffcd5e3eh, 0ffea2d2fh
+ DWORD 0fff91de4h, 001406c7h, 00327283h, 0ffe20d6eh
+ DWORD 0e0368beh, 3ab6411bh, 84951e3eh, 6a100d2fh
+ DWORD 0c1b59de4h, 396ce6c7h, 1902d283h, 428fcd6eh
+ DWORD 0fff2a128h, 0fff2a128h, 0fff2a128h, 0fff2a128h
+ DWORD 0fff2a128h, 0fff2a128h, 0fff2a128h, 0fff2a128h
+ DWORD 6017a128h, 6017a128h, 6017a128h, 6017a128h
+ DWORD 6017a128h, 6017a128h, 6017a128h, 6017a128h
+ DWORD 002f9a75h, 002f9a75h, 002f9a75h, 002f9a75h
+ DWORD 002f9a75h, 002f9a75h, 002f9a75h, 002f9a75h
+ DWORD 8cfe3a75h, 8cfe3a75h, 8cfe3a75h, 8cfe3a75h
+ DWORD 8cfe3a75h, 8cfe3a75h, 8cfe3a75h, 8cfe3a75h
+ DWORD 0ffd3fb09h, 0ffd3fb09h, 0ffd3fb09h, 0ffd3fb09h
+ DWORD 0ffd3fb09h, 0ffd3fb09h, 0ffd3fb09h, 0ffd3fb09h
+ DWORD 1eb51b09h, 1eb51b09h, 1eb51b09h, 1eb51b09h
+ DWORD 1eb51b09h, 1eb51b09h, 1eb51b09h, 1eb51b09h
+ DWORD 0ffdfadd6h, 0ffdfadd6h, 0ffdfadd6h, 0ffdfadd6h
+ DWORD 0ffdfadd6h, 0ffdfadd6h, 0ffdfadd6h, 0ffdfadd6h
+ DWORD 629a6dd6h, 629a6dd6h, 629a6dd6h, 629a6dd6h
+ DWORD 629a6dd6h, 629a6dd6h, 629a6dd6h, 629a6dd6h
+ DWORD 0ffc51ae7h, 0ffc51ae7h, 0ffc51ae7h, 0ffc51ae7h
+ DWORD 0ffc51ae7h, 0ffc51ae7h, 0ffc51ae7h, 0ffc51ae7h
+ DWORD 0cba1fae7h, 0cba1fae7h, 0cba1fae7h, 0cba1fae7h
+ DWORD 0cba1fae7h, 0cba1fae7h, 0cba1fae7h, 0cba1fae7h
+ DWORD 0ffeaa4f7h, 0ffeaa4f7h, 0ffeaa4f7h, 0ffeaa4f7h
+ DWORD 0ffeaa4f7h, 0ffeaa4f7h, 0ffeaa4f7h, 0ffeaa4f7h
+ DWORD 0b50984f7h, 0b50984f7h, 0b50984f7h, 0b50984f7h
+ DWORD 0b50984f7h, 0b50984f7h, 0b50984f7h, 0b50984f7h
+ DWORD 0ffcdfc98h, 0ffcdfc98h, 0ffcdfc98h, 0ffcdfc98h
+ DWORD 0ffcdfc98h, 0ffcdfc98h, 0ffcdfc98h, 0ffcdfc98h
+ DWORD 0d360fc98h, 0d360fc98h, 0d360fc98h, 0d360fc98h
+ DWORD 0d360fc98h, 0d360fc98h, 0d360fc98h, 0d360fc98h
+ DWORD 0ffe6123dh, 0ffe6123dh, 0ffe6123dh, 0ffe6123dh
+ DWORD 0ffe6ead6h, 0ffe6ead6h, 0ffe6ead6h, 0ffe6ead6h
+ DWORD 97adb23dh, 97adb23dh, 97adb23dh, 97adb23dh
+ DWORD 0ca41aad6h, 0ca41aad6h, 0ca41aad6h, 0ca41aad6h
+ DWORD 00357e1eh, 00357e1eh, 00357e1eh, 00357e1eh
+ DWORD 0ffc5af59h, 0ffc5af59h, 0ffc5af59h, 0ffc5af59h
+ DWORD 18f93e1eh, 18f93e1eh, 18f93e1eh, 18f93e1eh
+ DWORD 6d30cf59h, 6d30cf59h, 6d30cf59h, 6d30cf59h
+ DWORD 0ffccfbe9h, 0ffccfbe9h, 00040af0h, 00040af0h
+ DWORD 0007c417h, 0007c417h, 002f4588h, 002f4588h
+ DWORD 4eca1be9h, 4eca1be9h, 0c9620af0h, 0c9620af0h
+ DWORD 490aa417h, 490aa417h, 44e04588h, 44e04588h
+ DWORD 0000ad00h, 0000ad00h, 0ffef36beh, 0ffef36beh
+ DWORD 000dcd44h, 000dcd44h, 003c675ah, 003c675ah
+ DWORD 95a0ad00h, 95a0ad00h, 7fc6f6beh, 7fc6f6beh
+ DWORD 27b64d44h, 27b64d44h, 4827a75ah, 4827a75ah
+ DWORD 0035843fh, 0035843fh, 0035843fh, 0035843fh
+ DWORD 0ffdf5617h, 0ffdf5617h, 0ffdf5617h, 0ffdf5617h
+ DWORD 8d3d643fh, 8d3d643fh, 8d3d643fh, 8d3d643fh
+ DWORD 3b223617h, 3b223617h, 3b223617h, 3b223617h
+ DWORD 0ffe7945ch, 0ffe7945ch, 0ffe7945ch, 0ffe7945ch
+ DWORD 0038738ch, 0038738ch, 0038738ch, 0038738ch
+ DWORD 3473145ch, 3473145ch, 3473145ch, 3473145ch
+ DWORD 78a9f38ch, 78a9f38ch, 78a9f38ch, 78a9f38ch
+ DWORD 0ffc72bcah, 0ffc72bcah, 0ffffde7eh, 0ffffde7eh
+ DWORD 00193948h, 00193948h, 0ffce69c0h, 0ffce69c0h
+ DWORD 28406bcah, 28406bcah, 0b4cf9e7eh, 0b4cf9e7eh
+ DWORD 0a3423948h, 0a3423948h, 0ed0669c0h, 0ed0669c0h
+ DWORD 0024756ch, 0024756ch, 0fffcc7dfh, 0fffcc7dfh
+ DWORD 000b98a1h, 000b98a1h, 0ffebe808h, 0ffebe808h
+ DWORD 88d1f56ch, 88d1f56ch, 2578a7dfh, 2578a7dfh
+ DWORD 0a69fb8a1h, 0a69fb8a1h, 98ece808h, 98ece808h
+ DWORD 0ffec7953h, 001d4099h, 0ffd92578h, 0ffeb05adh
+ DWORD 0016e405h, 000bdbe7h, 00221de8h, 0033f8cfh
+ DWORD 3196d953h, 0bfb06099h, 48882578h, 3e20a5adh
+ DWORD 0ee178405h, 2408bbe7h, 0efdf1de8h, 53cdd8cfh
+ DWORD 0fff7b934h, 0ffd4ca0ch, 0ffe67ff8h, 0ffe3d157h
+ DWORD 0ffd8911bh, 0ffc72c12h, 000910d8h, 0ffc65e1fh
+ DWORD 2d1e3934h, 0c3164a0ch, 0b3e57ff8h, 2a8eb157h
+ DWORD 0f07bf11bh, 24496c12h, 162410d8h, 380a3e1fh
+ DWORD 0ffe14658h, 00251d8bh, 002573b7h, 0fffd7c8fh
+ DWORD 001ddd98h, 00336898h, 0002d4bbh, 0ffed93a7h
+ DWORD 5cac4658h, 0a567d8bh, 0af1c53b7h, 0a40f5c8fh
+ DWORD 4fd0dd98h, 81466898h, 0e91a34bbh, 7ae273a7h
+ DWORD 0ffcf6cbeh, 00027c1ch, 0018aa08h, 002dfd71h
+ DWORD 000c5ca5h, 0019379ah, 0ffc7a167h, 0ffe48c3dh
+ DWORD 86672cbeh, 0b185fc1ch, 3159aa08h, 0cb5c1d71h
+ DWORD 0cd20fca5h, 0c20c779ah, 0dc748167h, 66ec2c3dh
+ DWORD 00071e24h, 00071e24h, 00071e24h, 00071e24h
+ DWORD 00071e24h, 00071e24h, 00071e24h, 00071e24h
+ DWORD 61cb9e24h, 61cb9e24h, 61cb9e24h, 61cb9e24h
+ DWORD 61cb9e24h, 61cb9e24h, 61cb9e24h, 61cb9e24h
+ DWORD 002f7a49h, 002f7a49h, 002f7a49h, 002f7a49h
+ DWORD 002f7a49h, 002f7a49h, 002f7a49h, 002f7a49h
+ DWORD 0eef89a49h, 0eef89a49h, 0eef89a49h, 0eef89a49h
+ DWORD 0eef89a49h, 0eef89a49h, 0eef89a49h, 0eef89a49h
+ DWORD 0028e527h, 0028e527h, 0028e527h, 0028e527h
+ DWORD 0028e527h, 0028e527h, 0028e527h, 0028e527h
+ DWORD 254dc527h, 254dc527h, 254dc527h, 254dc527h
+ DWORD 254dc527h, 254dc527h, 254dc527h, 254dc527h
+ DWORD 001ad035h, 001ad035h, 001ad035h, 001ad035h
+ DWORD 001ad035h, 001ad035h, 001ad035h, 001ad035h
+ DWORD 13a17035h, 13a17035h, 13a17035h, 13a17035h
+ DWORD 13a17035h, 13a17035h, 13a17035h, 13a17035h
+ DWORD 0ffffb422h, 0ffffb422h, 0ffffb422h, 0ffffb422h
+ DWORD 0ffffb422h, 0ffffb422h, 0ffffb422h, 0ffffb422h
+ DWORD 6d83f422h, 6d83f422h, 6d83f422h, 6d83f422h
+ DWORD 6d83f422h, 6d83f422h, 6d83f422h, 6d83f422h
+ DWORD 003d3201h, 003d3201h, 003d3201h, 003d3201h
+ DWORD 003d3201h, 003d3201h, 003d3201h, 003d3201h
+ DWORD 0a9fd5201h, 0a9fd5201h, 0a9fd5201h, 0a9fd5201h
+ DWORD 0a9fd5201h, 0a9fd5201h, 0a9fd5201h, 0a9fd5201h
+ DWORD 000445c5h, 000445c5h, 000445c5h, 000445c5h
+ DWORD 000445c5h, 000445c5h, 000445c5h, 000445c5h
+ DWORD 0ba3ce5c5h, 0ba3ce5c5h, 0ba3ce5c5h, 0ba3ce5c5h
+ DWORD 0ba3ce5c5h, 0ba3ce5c5h, 0ba3ce5c5h, 0ba3ce5c5h
+ DWORD 000c63a8h, 000c63a8h, 000c63a8h, 000c63a8h
+ DWORD 00081b9ah, 00081b9ah, 00081b9ah, 00081b9ah
+ DWORD 588163a8h, 588163a8h, 588163a8h, 588163a8h
+ DWORD 9e7b5b9ah, 9e7b5b9ah, 9e7b5b9ah, 9e7b5b9ah
+ DWORD 000e8f76h, 000e8f76h, 000e8f76h, 000e8f76h
+ DWORD 003b3853h, 003b3853h, 003b3853h, 003b3853h
+ DWORD 0eefd4f76h, 0eefd4f76h, 0eefd4f76h, 0eefd4f76h
+ DWORD 89c59853h, 89c59853h, 89c59853h, 89c59853h
+ DWORD 0002e46ch, 0002e46ch, 0ffc9c808h, 0ffc9c808h
+ DWORD 003036c2h, 003036c2h, 0ffe3bff6h, 0ffe3bff6h
+ DWORD 0d690646ch, 0d690646ch, 54cac808h, 54cac808h
+ DWORD 0ae0876c2h, 0ae0876c2h, 54e27ff6h, 54e27ff6h
+ DWORD 0ffdb3c93h, 0ffdb3c93h, 0fffd4ae0h, 0fffd4ae0h
+ DWORD 00141305h, 00141305h, 00147792h, 00147792h
+ DWORD 69ed9c93h, 69ed9c93h, 0b9594ae0h, 0b9594ae0h
+ DWORD 13f4b305h, 13f4b305h, 0e06b792h, 0e06b792h
+ DWORD 003b8534h, 003b8534h, 003b8534h, 003b8534h
+ DWORD 0ffd8fc30h, 0ffd8fc30h, 0ffd8fc30h, 0ffd8fc30h
+ DWORD 0a6e20534h, 0a6e20534h, 0a6e20534h, 0a6e20534h
+ DWORD 0c75efc30h, 0c75efc30h, 0c75efc30h, 0c75efc30h
+ DWORD 001f9d54h, 001f9d54h, 001f9d54h, 001f9d54h
+ DWORD 0ffd54f2dh, 0ffd54f2dh, 0ffd54f2dh, 0ffd54f2dh
+ DWORD 99ca1d54h, 99ca1d54h, 99ca1d54h, 99ca1d54h
+ DWORD 0c73aef2dh, 0c73aef2dh, 0c73aef2dh, 0c73aef2dh
+ DWORD 00139e25h, 00139e25h, 0ffe7d0e0h, 0ffe7d0e0h
+ DWORD 0fff39944h, 0fff39944h, 0ffea0802h, 0ffea0802h
+ DWORD 0f5583e25h, 0f5583e25h, 0a03d0e0h, 0a03d0e0h
+ DWORD 0e11c1944h, 0e11c1944h, 47ea4802h, 47ea4802h
+ DWORD 0ffd1eea2h, 0ffd1eea2h, 0ffc4c79ch, 0ffc4c79ch
+ DWORD 0ffc8a057h, 0ffc8a057h, 003a97d9h, 003a97d9h
+ DWORD 74a62ea2h, 74a62ea2h, 3ab8479ch, 3ab8479ch
+ DWORD 44538057h, 44538057h, 0cab5b7d9h, 0cab5b7d9h
+ DWORD 0ffd1a13ch, 0035c539h, 003b0115h, 00041dc0h
+ DWORD 0021c4f7h, 0fff11bf4h, 001a35e7h, 0007340eh
+ DWORD 85f9213ch, 005ce539h, 29dda115h, 0a3bc1dc0h
+ DWORD 9940a4f7h, 0f96f9bf4h, 0ef5715e7h, 1788f40eh
+ DWORD 0fff97d45h, 001a4cd0h, 0ffe47caeh, 001d2668h
+ DWORD 0ffe68e98h, 0ffef2633h, 0fffc05dah, 0ffc57fdbh
+ DWORD 0a1221d45h, 21b44cd0h, 0f07a3caeh, 10ea2668h
+ DWORD 0e5b98e98h, 97358633h, 0fbb745dah, 2e40dfdbh
+ DWORD 0ffd32764h, 0ffdde1afh, 0fff993ddh, 0ffdd1d09h
+ DWORD 0002cc93h, 0fff11805h, 00189c2ah, 0ffc9e5a9h
+ DWORD 42bfa764h, 0a093c1afh, 0b7f533ddh, 42fe3d09h
+ DWORD 5c152c93h, 3471b805h, 0a69ddc2ah, 0bff05a9h
+ DWORD 0fff78a50h, 003bcf2ch, 0ffff434eh, 0ffeb36dfh
+ DWORD 003c15cah, 00155e68h, 0fff316b6h, 001e29ceh
+ DWORD 09418a50h, 94214f2ch, 7969034eh, 734716dfh
+ DWORD 0c5f555cah, 17e25e68h, 0dfc9d6b6h, 1657e9ceh
+ptr_L_mldsa_avx2_zetas QWORD L_mldsa_avx2_zetas
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_avx2_zetas_basemul DWORD 0ffc406e5h, 0ffe9d65dh, 003cf91bh, 001729a3h
+ DWORD 0ffe8ac81h, 003509eeh, 0018537fh, 0ffcbf612h
+ DWORD 0ffc7e1cfh, 002135c7h, 00391e31h, 0ffdfca39h
+ DWORD 0ffd19819h, 0ffe7cfbbh, 002f67e7h, 00193045h
+ DWORD 0000a6e5h, 0000765dh, 0000591bh, 000089a3h
+ DWORD 0000cc81h, 0000c9eeh, 0000337fh, 00003612h
+ DWORD 0000c1cfh, 000015c7h, 00003e31h, 0000ea39h
+ DWORD 0000b819h, 00002fbbh, 000047e7h, 0000d045h
+ DWORD 0ffeccf75h, 0ffcf5280h, 0014308bh, 0031ad80h
+ DWORD 001d9772h, 0ffcfd2aeh, 0ffe3688eh, 00312d52h
+ DWORD 0ffc1b072h, 0ffc890e0h, 003f4f8eh, 00386f20h
+ DWORD 0fff0bcf6h, 0001efcah, 0010430ah, 0ffff1036h
+ DWORD 00006f75h, 00005280h, 0000908bh, 0000ad80h
+ DWORD 0000d772h, 000092aeh, 0000288eh, 00006d52h
+ DWORD 0000f072h, 000090e0h, 00000f8eh, 00006f20h
+ DWORD 00007cf6h, 00002fcah, 0000830ah, 0000d036h
+ DWORD 003410f2h, 0ffd2b7a3h, 0ffccef0eh, 002e485dh
+ DWORD 0fff0fe85h, 0ffc7a44bh, 0010017bh, 00395bb5h
+ DWORD 0020c638h, 0fff9ba6dh, 0ffe039c8h, 00074593h
+ DWORD 00296e9fh, 0ffda3409h, 0ffd79161h, 0026cbf7h
+ DWORD 000050f2h, 000017a3h, 0000af0eh, 0000e85dh
+ DWORD 00009e85h, 0000044bh, 0000617bh, 0000fbb5h
+ DWORD 0000c638h, 00005a6dh, 000039c8h, 0000a593h
+ DWORD 00004e9fh, 00005409h, 0000b161h, 0000abf7h
+ DWORD 0fff5c282h, 0fffa2bddh, 000b3d7eh, 0006d423h
+ DWORD 0ffed4113h, 001495d4h, 0013beedh, 0ffec6a2ch
+ DWORD 0ffffa63bh, 001c4563h, 000159c5h, 0ffe4ba9dh
+ DWORD 0ffec09f7h, 0ffea2c62h, 0014f609h, 0016d39eh
+ DWORD 00000282h, 0000cbddh, 0000fd7eh, 00003423h
+ DWORD 0000a113h, 000015d4h, 00005eedh, 0000ea2ch
+ DWORD 0000063bh, 0000a563h, 0000f9c5h, 00005a9dh
+ DWORD 0000e9f7h, 00006c62h, 00001609h, 0000939eh
+ DWORD 0ffccfbe9h, 0000ad00h, 00340417h, 00005300h
+ DWORD 00040af0h, 0ffef36beh, 0fffcf510h, 0011c942h
+ DWORD 0007c417h, 000dcd44h, 0fff93be9h, 0fff332bch
+ DWORD 002f4588h, 003c675ah, 0ffd1ba78h, 0ffc498a6h
+ DWORD 00001be9h, 0000ad00h, 0000e417h, 00005300h
+ DWORD 00000af0h, 0000f6beh, 0000f510h, 00000942h
+ DWORD 0000a417h, 00004d44h, 00005be9h, 0000b2bch
+ DWORD 00004588h, 0000a75ah, 0000ba78h, 000058a6h
+ DWORD 0ffc72bcah, 0024756ch, 0039d436h, 0ffdc8a94h
+ DWORD 0ffffde7eh, 0fffcc7dfh, 00012182h, 00043821h
+ DWORD 00193948h, 000b98a1h, 0ffe7c6b8h, 0fff5675fh
+ DWORD 0ffce69c0h, 0ffebe808h, 00329640h, 001517f8h
+ DWORD 00006bcah, 0000f56ch, 00009436h, 00000a94h
+ DWORD 00009e7eh, 0000a7dfh, 00006182h, 00005821h
+ DWORD 00003948h, 0000b8a1h, 0000c6b8h, 0000475fh
+ DWORD 000069c0h, 0000e808h, 00009640h, 000017f8h
+ DWORD 0002e46ch, 0ffdb3c93h, 0fffe1b94h, 0025c36dh
+ DWORD 0ffc9c808h, 0fffd4ae0h, 003737f8h, 0003b520h
+ DWORD 003036c2h, 00141305h, 0ffd0c93eh, 0ffececfbh
+ DWORD 0ffe3bff6h, 00147792h, 001d400ah, 0ffec886eh
+ DWORD 0000646ch, 00009c93h, 00009b94h, 0000636dh
+ DWORD 0000c808h, 00004ae0h, 000037f8h, 0000b520h
+ DWORD 000076c2h, 0000b305h, 0000893eh, 00004cfbh
+ DWORD 00007ff6h, 0000b792h, 0000800ah, 0000486eh
+ DWORD 00139e25h, 0ffd1eea2h, 0ffed61dbh, 002f115eh
+ DWORD 0ffe7d0e0h, 0ffc4c79ch, 00192f20h, 003c3864h
+ DWORD 0fff39944h, 0ffc8a057h, 000d66bch, 00385fa9h
+ DWORD 0ffea0802h, 003a97d9h, 0016f7feh, 0ffc66827h
+ DWORD 00003e25h, 00002ea2h, 0000c1dbh, 0000d15eh
+ DWORD 0000d0e0h, 0000479ch, 00002f20h, 0000b864h
+ DWORD 00001944h, 00008057h, 0000e6bch, 00007fa9h
+ DWORD 00004802h, 0000b7d9h, 0000b7feh, 00004827h
+ptr_L_mldsa_avx2_zetas_basemul QWORD L_mldsa_avx2_zetas_basemul
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_avx2_zetas_1 DWORD 0ffc97e01h, 0ffc97e01h, 0ffc97e01h, 0ffc97e01h
+ DWORD 0ffc97e01h, 0ffc97e01h, 0ffc97e01h, 0ffc97e01h
+ptr_L_mldsa_avx2_zetas_1 QWORD L_mldsa_avx2_zetas_1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_avx2_zetas_inv DWORD 0ffe1d632h, 000ce94ah, 0ffeaa198h, 0ffc3ea36h
+ DWORD 0014c921h, 0000bcb2h, 0ffc430d4h, 000875b0h
+ DWORD 0e9a81632h, 2036294ah, 0e81da198h, 3a0aaa36h
+ DWORD 8cb8e921h, 8696fcb2h, 6bdeb0d4h, 0f6be75b0h
+ DWORD 00361a57h, 0ffe763d6h, 000ee7fbh, 0fffd336dh
+ DWORD 0022e2f7h, 00066c23h, 00221e51h, 002cd89ch
+ DWORD 0f400fa57h, 596223d6h, 0cb8e47fbh, 0a3ead36dh
+ DWORD 0bd01c2f7h, 480acc23h, 5f6c3e51h, 0bd40589ch
+ DWORD 0ffc56827h, 0ffc56827h, 00375fa9h, 00375fa9h
+ DWORD 003b3864h, 003b3864h, 002e115eh, 002e115eh
+ DWORD 354a4827h, 354a4827h, 0bbac7fa9h, 0bbac7fa9h
+ DWORD 0c547b864h, 0c547b864h, 8b59d15eh, 8b59d15eh
+ DWORD 0015f7feh, 0015f7feh, 000c66bch, 000c66bch
+ DWORD 00182f20h, 00182f20h, 0ffec61dbh, 0ffec61dbh
+ DWORD 0b815b7feh, 0b815b7feh, 1ee3e6bch, 1ee3e6bch
+ DWORD 0f5fc2f20h, 0f5fc2f20h, 0aa7c1dbh, 0aa7c1dbh
+ DWORD 002ab0d3h, 002ab0d3h, 002ab0d3h, 002ab0d3h
+ DWORD 0ffe062ach, 0ffe062ach, 0ffe062ach, 0ffe062ach
+ DWORD 38c510d3h, 38c510d3h, 38c510d3h, 38c510d3h
+ DWORD 6635e2ach, 6635e2ach, 6635e2ach, 6635e2ach
+ DWORD 002703d0h, 002703d0h, 002703d0h, 002703d0h
+ DWORD 0ffc47acch, 0ffc47acch, 0ffc47acch, 0ffc47acch
+ DWORD 38a103d0h, 38a103d0h, 38a103d0h, 38a103d0h
+ DWORD 591dfacch, 591dfacch, 591dfacch, 591dfacch
+ DWORD 0fffbba3bh, 0fffbba3bh, 0fffbba3bh, 0fffbba3bh
+ DWORD 0fffbba3bh, 0fffbba3bh, 0fffbba3bh, 0fffbba3bh
+ DWORD 45c31a3bh, 45c31a3bh, 45c31a3bh, 45c31a3bh
+ DWORD 45c31a3bh, 45c31a3bh, 45c31a3bh, 45c31a3bh
+ DWORD 0ffc2cdffh, 0ffc2cdffh, 0ffc2cdffh, 0ffc2cdffh
+ DWORD 0ffc2cdffh, 0ffc2cdffh, 0ffc2cdffh, 0ffc2cdffh
+ DWORD 5602adffh, 5602adffh, 5602adffh, 5602adffh
+ DWORD 5602adffh, 5602adffh, 5602adffh, 5602adffh
+ DWORD 0ffd71ad9h, 0ffd71ad9h, 0ffd71ad9h, 0ffd71ad9h
+ DWORD 0ffd71ad9h, 0ffd71ad9h, 0ffd71ad9h, 0ffd71ad9h
+ DWORD 0dab23ad9h, 0dab23ad9h, 0dab23ad9h, 0dab23ad9h
+ DWORD 0dab23ad9h, 0dab23ad9h, 0dab23ad9h, 0dab23ad9h
+ DWORD 003a8025h, 0003fa26h, 0010d9cdh, 00197168h
+ DWORD 0ffe2d998h, 001b8352h, 0ffe5b330h, 000682bbh
+ DWORD 0d1bf2025h, 0448ba26h, 68ca79cdh, 1a467168h
+ DWORD 0ef15d998h, 0f85c352h, 0de4bb330h, 5edde2bbh
+ DWORD 0fff8cbf2h, 0ffe5ca19h, 000ee40ch, 0ffde3b09h
+ DWORD 0fffbe240h, 0ffc4feebh, 0ffca3ac7h, 002e5ec4h
+ DWORD 0e8770bf2h, 10a8ea19h, 0690640ch, 66bf5b09h
+ DWORD 5c43e240h, 0d6225eebh, 0ffa31ac7h, 7a06dec4h
+ DWORD 0ffeb886eh, 0ffeb886eh, 0ffebecfbh, 0ffebecfbh
+ DWORD 0002b520h, 0002b520h, 0024c36dh, 0024c36dh
+ DWORD 0f1f9486eh, 0f1f9486eh, 0ec0b4cfbh, 0ec0b4cfbh
+ DWORD 46a6b520h, 46a6b520h, 9612636dh, 9612636dh
+ DWORD 001c400ah, 001c400ah, 0ffcfc93eh, 0ffcfc93eh
+ DWORD 003637f8h, 003637f8h, 0fffd1b94h, 0fffd1b94h
+ DWORD 0ab1d800ah, 0ab1d800ah, 51f7893eh, 51f7893eh
+ DWORD 0ab3537f8h, 0ab3537f8h, 296f9b94h, 296f9b94h
+ DWORD 0ffc4c7adh, 0ffc4c7adh, 0ffc4c7adh, 0ffc4c7adh
+ DWORD 0fff1708ah, 0fff1708ah, 0fff1708ah, 0fff1708ah
+ DWORD 763a67adh, 763a67adh, 763a67adh, 763a67adh
+ DWORD 1102b08ah, 1102b08ah, 1102b08ah, 1102b08ah
+ DWORD 0fff7e466h, 0fff7e466h, 0fff7e466h, 0fff7e466h
+ DWORD 0fff39c58h, 0fff39c58h, 0fff39c58h, 0fff39c58h
+ DWORD 6184a466h, 6184a466h, 6184a466h, 6184a466h
+ DWORD 0a77e9c58h, 0a77e9c58h, 0a77e9c58h, 0a77e9c58h
+ DWORD 00004bdeh, 00004bdeh, 00004bdeh, 00004bdeh
+ DWORD 00004bdeh, 00004bdeh, 00004bdeh, 00004bdeh
+ DWORD 927c0bdeh, 927c0bdeh, 927c0bdeh, 927c0bdeh
+ DWORD 927c0bdeh, 927c0bdeh, 927c0bdeh, 927c0bdeh
+ DWORD 0ffe52fcbh, 0ffe52fcbh, 0ffe52fcbh, 0ffe52fcbh
+ DWORD 0ffe52fcbh, 0ffe52fcbh, 0ffe52fcbh, 0ffe52fcbh
+ DWORD 0ec5e8fcbh, 0ec5e8fcbh, 0ec5e8fcbh, 0ec5e8fcbh
+ DWORD 0ec5e8fcbh, 0ec5e8fcbh, 0ec5e8fcbh, 0ec5e8fcbh
+ DWORD 0ffd085b7h, 0ffd085b7h, 0ffd085b7h, 0ffd085b7h
+ DWORD 0ffd085b7h, 0ffd085b7h, 0ffd085b7h, 0ffd085b7h
+ DWORD 110765b7h, 110765b7h, 110765b7h, 110765b7h
+ DWORD 110765b7h, 110765b7h, 110765b7h, 110765b7h
+ DWORD 0fff8e1dch, 0fff8e1dch, 0fff8e1dch, 0fff8e1dch
+ DWORD 0fff8e1dch, 0fff8e1dch, 0fff8e1dch, 0fff8e1dch
+ DWORD 9e3461dch, 9e3461dch, 9e3461dch, 9e3461dch
+ DWORD 9e3461dch, 9e3461dch, 9e3461dch, 9e3461dch
+ DWORD 001b73c3h, 00385e99h, 0ffe6c866h, 0fff3a35bh
+ DWORD 0ffd2028fh, 0ffe755f8h, 0fffd83e4h, 00309342h
+ DWORD 9913d3c3h, 238b7e99h, 3df38866h, 32df035bh
+ DWORD 34a3e28fh, 0cea655f8h, 4e7a03e4h, 7998d342h
+ DWORD 00126c59h, 0fffd2b45h, 0ffcc9768h, 0ffe22268h
+ DWORD 00028371h, 0ffda8c49h, 0ffdae275h, 001eb9a8h
+ DWORD 851d8c59h, 16e5cb45h, 7eb99768h, 0b02f2268h
+ DWORD 5bf0a371h, 50e3ac49h, 0f5a98275h, 0a353b9a8h
+ DWORD 001417f8h, 001417f8h, 0fff4675fh, 0fff4675fh
+ DWORD 00033821h, 00033821h, 0ffdb8a94h, 0ffdb8a94h
+ DWORD 671317f8h, 671317f8h, 5960475fh, 5960475fh
+ DWORD 0da875821h, 0da875821h, 772e0a94h, 772e0a94h
+ DWORD 00319640h, 00319640h, 0ffe6c6b8h, 0ffe6c6b8h
+ DWORD 00002182h, 00002182h, 0038d436h, 0038d436h
+ DWORD 12f99640h, 12f99640h, 5cbdc6b8h, 5cbdc6b8h
+ DWORD 4b306182h, 4b306182h, 0d7bf9436h, 0d7bf9436h
+ DWORD 0ffc78c74h, 0ffc78c74h, 0ffc78c74h, 0ffc78c74h
+ DWORD 00186ba4h, 00186ba4h, 00186ba4h, 00186ba4h
+ DWORD 87560c74h, 87560c74h, 87560c74h, 87560c74h
+ DWORD 0cb8ceba4h, 0cb8ceba4h, 0cb8ceba4h, 0cb8ceba4h
+ DWORD 0020a9e9h, 0020a9e9h, 0020a9e9h, 0020a9e9h
+ DWORD 0ffca7bc1h, 0ffca7bc1h, 0ffca7bc1h, 0ffca7bc1h
+ DWORD 0c4ddc9e9h, 0c4ddc9e9h, 0c4ddc9e9h, 0c4ddc9e9h
+ DWORD 72c29bc1h, 72c29bc1h, 72c29bc1h, 72c29bc1h
+ DWORD 00320368h, 00320368h, 00320368h, 00320368h
+ DWORD 00320368h, 00320368h, 00320368h, 00320368h
+ DWORD 2c9f0368h, 2c9f0368h, 2c9f0368h, 2c9f0368h
+ DWORD 2c9f0368h, 2c9f0368h, 2c9f0368h, 2c9f0368h
+ DWORD 00155b09h, 00155b09h, 00155b09h, 00155b09h
+ DWORD 00155b09h, 00155b09h, 00155b09h, 00155b09h
+ DWORD 4af67b09h, 4af67b09h, 4af67b09h, 4af67b09h
+ DWORD 4af67b09h, 4af67b09h, 4af67b09h, 4af67b09h
+ DWORD 002c04f7h, 002c04f7h, 002c04f7h, 002c04f7h
+ DWORD 002c04f7h, 002c04f7h, 002c04f7h, 002c04f7h
+ DWORD 0e14ae4f7h, 0e14ae4f7h, 0e14ae4f7h, 0e14ae4f7h
+ DWORD 0e14ae4f7h, 0e14ae4f7h, 0e14ae4f7h, 0e14ae4f7h
+ DWORD 0039a1e1h, 0fff6ef28h, 0038d3eeh, 00276ee5h
+ DWORD 001c2ea9h, 00198008h, 002b35f4h, 000846cch
+ DWORD 0c7f5c1e1h, 0e9dbef28h, 0dbb693eeh, 0f840ee5h
+ DWORD 0d5714ea9h, 4c1a8008h, 3ce9b5f4h, 0d2e1c6cch
+ DWORD 0ffcc0731h, 0ffdde218h, 0fff42419h, 0ffe91bfbh
+ DWORD 0014fa53h, 0026da88h, 0ffe2bf67h, 001386adh
+ DWORD 0ac322731h, 1020e218h, 0dbf74419h, 11e87bfbh
+ DWORD 0c1df5a53h, 0b777da88h, 404f9f67h, 0ce6926adh
+ DWORD 0ffc398a6h, 0ffc398a6h, 0fff232bch, 0fff232bch
+ DWORD 0010c942h, 0010c942h, 0ffff5300h, 0ffff5300h
+ DWORD 0b7d858a6h, 0b7d858a6h, 0d849b2bch, 0d849b2bch
+ DWORD 80390942h, 80390942h, 6a5f5300h, 6a5f5300h
+ DWORD 0ffd0ba78h, 0ffd0ba78h, 0fff83be9h, 0fff83be9h
+ DWORD 0fffbf510h, 0fffbf510h, 00330417h, 00330417h
+ DWORD 0bb1fba78h, 0bb1fba78h, 0b6f55be9h, 0b6f55be9h
+ DWORD 369df510h, 369df510h, 0b135e417h, 0b135e417h
+ DWORD 003a50a7h, 003a50a7h, 003a50a7h, 003a50a7h
+ DWORD 0ffca81e2h, 0ffca81e2h, 0ffca81e2h, 0ffca81e2h
+ DWORD 92cf30a7h, 92cf30a7h, 92cf30a7h, 92cf30a7h
+ DWORD 0e706c1e2h, 0e706c1e2h, 0e706c1e2h, 0e706c1e2h
+ DWORD 0019152ah, 0019152ah, 0019152ah, 0019152ah
+ DWORD 0019edc3h, 0019edc3h, 0019edc3h, 0019edc3h
+ DWORD 35be552ah, 35be552ah, 35be552ah, 35be552ah
+ DWORD 68524dc3h, 68524dc3h, 68524dc3h, 68524dc3h
+ DWORD 003ae519h, 003ae519h, 003ae519h, 003ae519h
+ DWORD 003ae519h, 003ae519h, 003ae519h, 003ae519h
+ DWORD 345e0519h, 345e0519h, 345e0519h, 345e0519h
+ DWORD 345e0519h, 345e0519h, 345e0519h, 345e0519h
+ DWORD 0020522ah, 0020522ah, 0020522ah, 0020522ah
+ DWORD 0020522ah, 0020522ah, 0020522ah, 0020522ah
+ DWORD 9d65922ah, 9d65922ah, 9d65922ah, 9d65922ah
+ DWORD 9d65922ah, 9d65922ah, 9d65922ah, 9d65922ah
+ DWORD 0ffd0658bh, 0ffd0658bh, 0ffd0658bh, 0ffd0658bh
+ DWORD 0ffd0658bh, 0ffd0658bh, 0ffd0658bh, 0ffd0658bh
+ DWORD 7301c58bh, 7301c58bh, 7301c58bh, 7301c58bh
+ DWORD 7301c58bh, 7301c58bh, 7301c58bh, 7301c58bh
+ DWORD 000d5ed8h, 000d5ed8h, 000d5ed8h, 000d5ed8h
+ DWORD 000d5ed8h, 000d5ed8h, 000d5ed8h, 000d5ed8h
+ DWORD 9fe85ed8h, 9fe85ed8h, 9fe85ed8h, 9fe85ed8h
+ DWORD 9fe85ed8h, 9fe85ed8h, 9fe85ed8h, 9fe85ed8h
+ DWORD 001df292h, 0ffcd8d7dh, 0ffebf939h, 0006e21ch
+ DWORD 0015d2d1h, 0032a1c2h, 0ffed1ee5h, 00145742h
+ DWORD 0bd703292h, 0e6fd2d7dh, 0c6931939h, 3e4a621ch
+ DWORD 95eff2d1h, 7b6ae1c2h, 0c549bee5h, 0f1fc9742h
+ DWORD 0010095ah, 0ffe2f4b5h, 0ffe37ac1h, 002daf77h
+ DWORD 00362470h, 0ffd7c76fh, 0ffeceb42h, 00397ae8h
+ DWORD 0bc3b495ah, 0d7f994b5h, 12bb9ac1h, 0d69c8f77h
+ DWORD 4cc42470h, 0fd45a76fh, 04552b42h, 5b967ae8h
+ DWORD 0015d39eh, 0015d39eh, 0ffe3ba9dh, 0ffe3ba9dh
+ DWORD 0ffeb6a2ch, 0ffeb6a2ch, 0005d423h, 0005d423h
+ DWORD 6389939eh, 6389939eh, 9cb75a9dh, 9cb75a9dh
+ DWORD 0730ea2ch, 0730ea2ch, 350a3423h, 350a3423h
+ DWORD 0013f609h, 0013f609h, 000059c5h, 000059c5h
+ DWORD 0012beedh, 0012beedh, 000a3d7eh, 000a3d7eh
+ DWORD 9e551609h, 9e551609h, 3cb8f9c5h, 3cb8f9c5h
+ DWORD 95705eedh, 95705eedh, 80b9fd7eh, 80b9fd7eh
+ DWORD 00083aa3h, 00083aa3h, 00083aa3h, 00083aa3h
+ DWORD 0ffdc2964h, 0ffdc2964h, 0ffdc2964h, 0ffdc2964h
+ DWORD 41dc9aa3h, 41dc9aa3h, 41dc9aa3h, 41dc9aa3h
+ DWORD 6308a964h, 6308a964h, 6308a964h, 6308a964h
+ DWORD 000495b3h, 000495b3h, 000495b3h, 000495b3h
+ DWORD 0ffc9fc00h, 0ffc9fc00h, 0ffc9fc00h, 0ffc9fc00h
+ DWORD 853af5b3h, 853af5b3h, 853af5b3h, 853af5b3h
+ DWORD 3f49fc00h, 3f49fc00h, 3f49fc00h, 3f49fc00h
+ DWORD 00202c85h, 00202c85h, 00202c85h, 00202c85h
+ DWORD 00202c85h, 00202c85h, 00202c85h, 00202c85h
+ DWORD 0d730cc85h, 0d730cc85h, 0d730cc85h, 0d730cc85h
+ DWORD 0d730cc85h, 0d730cc85h, 0d730cc85h, 0d730cc85h
+ DWORD 0ffd80698h, 0ffd80698h, 0ffd80698h, 0ffd80698h
+ DWORD 0ffd80698h, 0ffd80698h, 0ffd80698h, 0ffd80698h
+ DWORD 14ab0698h, 14ab0698h, 14ab0698h, 14ab0698h
+ DWORD 14ab0698h, 14ab0698h, 14ab0698h, 14ab0698h
+ DWORD 001feb81h, 001feb81h, 001feb81h, 001feb81h
+ DWORD 001feb81h, 001feb81h, 001feb81h, 001feb81h
+ DWORD 41100b81h, 41100b81h, 41100b81h, 41100b81h
+ DWORD 41100b81h, 41100b81h, 41100b81h, 41100b81h
+ DWORD 0ffe7a5bah, 0ffda0fafh, 0ffecf67ch, 0ffc21ee4h
+ DWORD 0ffecb28fh, 002785c6h, 0ffd6ee67h, 0ffd4a11bh
+ DWORD 0ff9ee5bah, 264fefafh, 50bc767ch, 619e9ee4h
+ DWORD 8abe928fh, 25e045c6h, 4623ce67h, 7278011bh
+ DWORD 0fffce6dch, 0ffe58339h, 0032ffc5h, 0ffcb8d19h
+ DWORD 0ffc14fe5h, 002532bfh, 0fffb9ef4h, 0fffac6e7h
+ DWORD 9ed866dch, 0f7cca339h, 91ab9fc5h, 48eead19h
+ DWORD 0cb3defe5h, 42fd12bfh, 0c9da1ef4h, 8157a6e7h
+ DWORD 0025cbf7h, 0025cbf7h, 00064593h, 00064593h
+ DWORD 00385bb5h, 00385bb5h, 002d485dh, 002d485dh
+ DWORD 9a24abf7h, 9a24abf7h, 4b38a593h, 4b38a593h
+ DWORD 052efbb5h, 052efbb5h, 0eeb8e85dh, 0eeb8e85dh
+ DWORD 0ffd69161h, 0ffd69161h, 0ffdf39c8h, 0ffdf39c8h
+ DWORD 000f017bh, 000f017bh, 0ffcbef0eh, 0ffcbef0eh
+ DWORD 0a582b161h, 0a582b161h, 231839c8h, 231839c8h
+ DWORD 0ebe617bh, 0ebe617bh, 2eadaf0eh, 2eadaf0eh
+ DWORD 002bc1bfh, 002bc1bfh, 002bc1bfh, 002bc1bfh
+ DWORD 0ffc9756ah, 0ffc9756ah, 0ffc9756ah, 0ffc9756ah
+ DWORD 94e3a1bfh, 94e3a1bfh, 94e3a1bfh, 94e3a1bfh
+ DWORD 2176b56ah, 2176b56ah, 2176b56ah, 2176b56ah
+ DWORD 002e7184h, 002e7184h, 002e7184h, 002e7184h
+ DWORD 003aea7bh, 003aea7bh, 003aea7bh, 003aea7bh
+ DWORD 1c5ef184h, 1c5ef184h, 1c5ef184h, 1c5ef184h
+ DWORD 0c0a4a7bh, 0c0a4a7bh, 0c0a4a7bh, 0c0a4a7bh
+ DWORD 00111560h, 00111560h, 00111560h, 00111560h
+ DWORD 00111560h, 00111560h, 00111560h, 00111560h
+ DWORD 0f2bd1560h, 0f2bd1560h, 0f2bd1560h, 0f2bd1560h
+ DWORD 0f2bd1560h, 0f2bd1560h, 0f2bd1560h, 0f2bd1560h
+ DWORD 00086270h, 00086270h, 00086270h, 00086270h
+ DWORD 00086270h, 00086270h, 00086270h, 00086270h
+ DWORD 94566270h, 94566270h, 94566270h, 94566270h
+ DWORD 94566270h, 94566270h, 94566270h, 94566270h
+ DWORD 00057b53h, 00057b53h, 00057b53h, 00057b53h
+ DWORD 00057b53h, 00057b53h, 00057b53h, 00057b53h
+ DWORD 51efdb53h, 51efdb53h, 51efdb53h, 51efdb53h
+ DWORD 51efdb53h, 51efdb53h, 51efdb53h, 51efdb53h
+ DWORD 000bdee8h, 000bdee8h, 000bdee8h, 000bdee8h
+ DWORD 000bdee8h, 000bdee8h, 000bdee8h, 000bdee8h
+ DWORD 0a7e8dee8h, 0a7e8dee8h, 0a7e8dee8h, 0a7e8dee8h
+ DWORD 0a7e8dee8h, 0a7e8dee8h, 0a7e8dee8h, 0a7e8dee8h
+ DWORD 0036de3eh, 000bba6eh, 0008032ah, 00364683h
+ DWORD 0ffcf107ah, 0ffe0ff7ch, 002fa50ah, 0009ffdfh
+ DWORD 0b4fe9e3eh, 0f8597a6eh, 136d432ah, 9386a683h
+ DWORD 8cde507ah, 51d07f7ch, 97d0e50ah, 0cc85dfdfh
+ DWORD 0007f904h, 0000a8fch, 00189d76h, 0fff8707dh
+ DWORD 0fff380a6h, 0fff21f1ah, 0ffe3a1e6h, 0fff241a2h
+ DWORD 8d287904h, 872028fch, 30c75d76h, 0c388107dh
+ DWORD 0b50840a6h, 1ed55f1ah, 192061e6h, 0ff2681a2h
+ DWORD 0fffe1036h, 0fffe1036h, 00376f20h, 00376f20h
+ DWORD 00302d52h, 00302d52h, 0030ad80h, 0030ad80h
+ DWORD 7f04d036h, 7f04d036h, 0de1b6f20h, 0de1b6f20h
+ DWORD 0a4da6d52h, 0a4da6d52h, 55e0ad80h, 55e0ad80h
+ DWORD 000f430ah, 000f430ah, 003e4f8eh, 003e4f8eh
+ DWORD 0ffe2688eh, 0ffe2688eh, 0013308bh, 0013308bh
+ DWORD 8b70830ah, 8b70830ah, 3b300f8eh, 3b300f8eh
+ DWORD 3df4288eh, 3df4288eh, 4ca4908bh, 4ca4908bh
+ DWORD 0ffc44151h, 0ffc44151h, 0ffc44151h, 0ffc44151h
+ DWORD 0026b82ch, 0026b82ch, 0026b82ch, 0026b82ch
+ DWORD 236e6151h, 236e6151h, 236e6151h, 236e6151h
+ DWORD 712c382ch, 712c382ch, 712c382ch, 712c382ch
+ DWORD 0036cfd4h, 0036cfd4h, 0036cfd4h, 0036cfd4h
+ DWORD 00195afdh, 00195afdh, 00195afdh, 00195afdh
+ DWORD 40314fd4h, 40314fd4h, 40314fd4h, 40314fd4h
+ DWORD 0a0f8fafdh, 0a0f8fafdh, 0a0f8fafdh, 0a0f8fafdh
+ DWORD 0ffc94878h, 0ffc94878h, 0ffc94878h, 0ffc94878h
+ DWORD 0ffc94878h, 0ffc94878h, 0ffc94878h, 0ffc94878h
+ DWORD 0ccd84878h, 0ccd84878h, 0ccd84878h, 0ccd84878h
+ DWORD 0ccd84878h, 0ccd84878h, 0ccd84878h, 0ccd84878h
+ DWORD 00107a5ch, 00107a5ch, 00107a5ch, 00107a5ch
+ DWORD 00107a5ch, 00107a5ch, 00107a5ch, 00107a5ch
+ DWORD 515bfa5ch, 515bfa5ch, 515bfa5ch, 515bfa5ch
+ DWORD 515bfa5ch, 515bfa5ch, 515bfa5ch, 515bfa5ch
+ DWORD 0ffdc16d5h, 0ffdc16d5h, 0ffdc16d5h, 0ffdc16d5h
+ DWORD 0ffdc16d5h, 0ffdc16d5h, 0ffdc16d5h, 0ffdc16d5h
+ DWORD 6c36b6d5h, 6c36b6d5h, 6c36b6d5h, 6c36b6d5h
+ DWORD 6c36b6d5h, 6c36b6d5h, 6c36b6d5h, 6c36b6d5h
+ DWORD 0030ba22h, 001244aah, 00395d04h, 0035b760h
+ DWORD 0ffca64a3h, 0012db10h, 0ffdada79h, 0fffbed0bh
+ DWORD 8e74fa22h, 9ba784aah, 0b9d9dd04h, 8721b760h
+ DWORD 86dec4a3h, 1374db10h, 02a9fa79h, 241d4d0bh
+ DWORD 00365bdeh, 00255461h, 0ffddc205h, 0033008eh
+ DWORD 0ffc5be08h, 0ffdca72ch, 0ffcc00a6h, 0ffe0156dh
+ DWORD 54b21bdeh, 0fe317461h, 0c99e6205h, 5144c08eh
+ DWORD 0d386be08h, 0aec2272ch, 0c4e0c0a6h, 000db56dh
+ DWORD 00183045h, 00183045h, 0ffdeca39h, 0ffdeca39h
+ DWORD 0ffcaf612h, 0ffcaf612h, 001629a3h, 001629a3h
+ DWORD 0f7a0d045h, 0f7a0d045h, 0a0a5ea39h, 0a0a5ea39h
+ DWORD 9d8d3612h, 9d8d3612h, 7fca89a3h, 7fca89a3h
+ DWORD 002e67e7h, 002e67e7h, 00381e31h, 00381e31h
+ DWORD 0017537fh, 0017537fh, 003bf91bh, 003bf91bh
+ DWORD 75ab47e7h, 75ab47e7h, 0af7e3e31h, 0af7e3e31h
+ DWORD 2707337fh, 2707337fh, 5ddf591bh, 5ddf591bh
+ DWORD 0ffca213bh, 0ffca213bh, 0ffca213bh, 0ffca213bh
+ DWORD 0ffd10b33h, 0ffd10b33h, 0ffd10b33h, 0ffd10b33h
+ DWORD 9271813bh, 9271813bh, 9271813bh, 9271813bh
+ DWORD 53b76b33h, 53b76b33h, 53b76b33h, 53b76b33h
+ DWORD 0fffe89e0h, 0fffe89e0h, 0fffe89e0h, 0fffe89e0h
+ DWORD 0ffd6b599h, 0ffd6b599h, 0ffd6b599h, 0ffd6b599h
+ DWORD 613a89e0h, 613a89e0h, 613a89e0h, 613a89e0h
+ DWORD 6e09d599h, 6e09d599h, 6e09d599h, 6e09d599h
+ DWORD 0fff05f90h, 0fff05f90h, 0fff05f90h, 0fff05f90h
+ DWORD 0fff05f90h, 0fff05f90h, 0fff05f90h, 0fff05f90h
+ DWORD 83e25f90h, 83e25f90h, 83e25f90h, 83e25f90h
+ DWORD 83e25f90h, 83e25f90h, 83e25f90h, 83e25f90h
+ DWORD 0ffd669a8h, 0ffd669a8h, 0ffd669a8h, 0ffd669a8h
+ DWORD 0ffd669a8h, 0ffd669a8h, 0ffd669a8h, 0ffd669a8h
+ DWORD 990b69a8h, 990b69a8h, 990b69a8h, 990b69a8h
+ DWORD 990b69a8h, 990b69a8h, 990b69a8h, 990b69a8h
+ DWORD 0ffe421d5h, 0ffe421d5h, 0ffe421d5h, 0ffe421d5h
+ DWORD 0ffe421d5h, 0ffe421d5h, 0ffe421d5h, 0ffe421d5h
+ DWORD 0ed9ec1d5h, 0ed9ec1d5h, 0ed9ec1d5h, 0ed9ec1d5h
+ DWORD 0ed9ec1d5h, 0ed9ec1d5h, 0ed9ec1d5h, 0ed9ec1d5h
+ DWORD 0fffc61bch, 0fffc61bch, 0fffc61bch, 0fffc61bch
+ DWORD 0fffc61bch, 0fffc61bch, 0fffc61bch, 0fffc61bch
+ DWORD 9e33e1bch, 9e33e1bch, 9e33e1bch, 9e33e1bch
+ DWORD 9e33e1bch, 9e33e1bch, 9e33e1bch, 9e33e1bch
+ DWORD 0007eafdh, 0007eafdh, 0007eafdh, 0007eafdh
+ DWORD 0007eafdh, 0007eafdh, 0007eafdh, 0007eafdh
+ DWORD 72e78afdh, 72e78afdh, 72e78afdh, 72e78afdh
+ DWORD 72e78afdh, 72e78afdh, 72e78afdh, 72e78afdh
+ DWORD 0027cefeh, 0027cefeh, 0027cefeh, 0027cefeh
+ DWORD 0027cefeh, 0027cefeh, 0027cefeh, 0027cefeh
+ DWORD 73078efeh, 73078efeh, 73078efeh, 73078efeh
+ DWORD 73078efeh, 73078efeh, 73078efeh, 73078efeh
+ DWORD 0ffff9b09h, 0ffff9b09h, 0ffff9b09h, 0ffff9b09h
+ DWORD 0ffff9b09h, 0ffff9b09h, 0ffff9b09h, 0ffff9b09h
+ DWORD 92e0bb09h, 92e0bb09h, 92e0bb09h, 92e0bb09h
+ DWORD 92e0bb09h, 92e0bb09h, 92e0bb09h, 92e0bb09h
+ DWORD 0000a3fah, 0000a3fah, 0000a3fah, 0000a3fah
+ DWORD 0000a3fah, 0000a3fah, 0000a3fah, 0000a3fah
+ DWORD 0ff7fe3fah, 0ff7fe3fah, 0ff7fe3fah, 0ff7fe3fah
+ DWORD 0ff7fe3fah, 0ff7fe3fah, 0ff7fe3fah, 0ff7fe3fah
+ptr_L_mldsa_avx2_zetas_inv QWORD L_mldsa_avx2_zetas_inv ; 8-byte data item initialised with the address of label L_mldsa_avx2_zetas_inv
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_poly_red_avx2 PROC ; Reduce in place the 256 signed 32-bit lanes (1024 bytes) at [rcx]: a -= ((a + mldsa_v) >> 23) * mldsa_q
+ sub rsp, 96 ; reserve 6 x 16 bytes of stack spill space for xmm6-xmm11
+ vmovdqu OWORD PTR [rsp], xmm6 ; save xmm6-xmm11 (non-volatile in the Windows x64 ABI) because ymm6-ymm11 are used below
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vpxor ymm10, ymm10, ymm10 ; NOTE(review): dead write - ymm10 is overwritten by the load on the next line
+ vmovdqu ymm10, YMMWORD PTR mldsa_q ; ymm10 = multiplier vector (mldsa_q is defined outside this view; presumably 8 copies of the modulus q - TODO confirm)
+ vmovdqu ymm11, YMMWORD PTR mldsa_v ; ymm11 = addend vector (mldsa_v is defined outside this view; presumably the rounding constant 1 << 22 - TODO confirm)
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; pass 1 of 4: load bytes 0-255 of the buffer at rcx (first integer argument register on Windows x64)
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ vpaddd ymm8, ymm0, ymm11 ; t = a + mldsa_v (two vectors are processed per step, temporaries in ymm8/ymm9)
+ vpaddd ymm9, ymm1, ymm11
+ vpsrad ymm8, ymm8, 23 ; t >>= 23 (arithmetic shift, sign is kept)
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10 ; t *= mldsa_q (low 32 bits of each product)
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm0, ymm0, ymm8 ; a -= t
+ vpsubd ymm1, ymm1, ymm9
+ vpaddd ymm8, ymm2, ymm11 ; same four steps for ymm2/ymm3
+ vpaddd ymm9, ymm3, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm2, ymm2, ymm8
+ vpsubd ymm3, ymm3, ymm9
+ vpaddd ymm8, ymm4, ymm11 ; same four steps for ymm4/ymm5
+ vpaddd ymm9, ymm5, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm4, ymm4, ymm8
+ vpsubd ymm5, ymm5, ymm9
+ vpaddd ymm8, ymm6, ymm11 ; same four steps for ymm6/ymm7
+ vpaddd ymm9, ymm7, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm6, ymm6, ymm8
+ vpsubd ymm7, ymm7, ymm9
+ vmovdqu YMMWORD PTR [rcx], ymm0 ; write the pass 1 results back over the input
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ vmovdqu YMMWORD PTR [rcx+160], ymm5
+ vmovdqu YMMWORD PTR [rcx+192], ymm6
+ vmovdqu YMMWORD PTR [rcx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+256] ; pass 2 of 4: bytes 256-511, same sequence as pass 1
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ vpaddd ymm8, ymm0, ymm11
+ vpaddd ymm9, ymm1, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm0, ymm0, ymm8
+ vpsubd ymm1, ymm1, ymm9
+ vpaddd ymm8, ymm2, ymm11
+ vpaddd ymm9, ymm3, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm2, ymm2, ymm8
+ vpsubd ymm3, ymm3, ymm9
+ vpaddd ymm8, ymm4, ymm11
+ vpaddd ymm9, ymm5, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm4, ymm4, ymm8
+ vpsubd ymm5, ymm5, ymm9
+ vpaddd ymm8, ymm6, ymm11
+ vpaddd ymm9, ymm7, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm6, ymm6, ymm8
+ vpsubd ymm7, ymm7, ymm9
+ vmovdqu YMMWORD PTR [rcx+256], ymm0 ; write the pass 2 results back
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vmovdqu YMMWORD PTR [rcx+384], ymm4
+ vmovdqu YMMWORD PTR [rcx+416], ymm5
+ vmovdqu YMMWORD PTR [rcx+448], ymm6
+ vmovdqu YMMWORD PTR [rcx+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+512] ; pass 3 of 4: bytes 512-767, same sequence as pass 1
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vmovdqu ymm4, YMMWORD PTR [rcx+640]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+704]
+ vmovdqu ymm7, YMMWORD PTR [rcx+736]
+ vpaddd ymm8, ymm0, ymm11
+ vpaddd ymm9, ymm1, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm0, ymm0, ymm8
+ vpsubd ymm1, ymm1, ymm9
+ vpaddd ymm8, ymm2, ymm11
+ vpaddd ymm9, ymm3, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm2, ymm2, ymm8
+ vpsubd ymm3, ymm3, ymm9
+ vpaddd ymm8, ymm4, ymm11
+ vpaddd ymm9, ymm5, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm4, ymm4, ymm8
+ vpsubd ymm5, ymm5, ymm9
+ vpaddd ymm8, ymm6, ymm11
+ vpaddd ymm9, ymm7, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm6, ymm6, ymm8
+ vpsubd ymm7, ymm7, ymm9
+ vmovdqu YMMWORD PTR [rcx+512], ymm0 ; write the pass 3 results back
+ vmovdqu YMMWORD PTR [rcx+544], ymm1
+ vmovdqu YMMWORD PTR [rcx+576], ymm2
+ vmovdqu YMMWORD PTR [rcx+608], ymm3
+ vmovdqu YMMWORD PTR [rcx+640], ymm4
+ vmovdqu YMMWORD PTR [rcx+672], ymm5
+ vmovdqu YMMWORD PTR [rcx+704], ymm6
+ vmovdqu YMMWORD PTR [rcx+736], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+768] ; pass 4 of 4: bytes 768-1023, same sequence as pass 1
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vmovdqu ymm4, YMMWORD PTR [rcx+896]
+ vmovdqu ymm5, YMMWORD PTR [rcx+928]
+ vmovdqu ymm6, YMMWORD PTR [rcx+960]
+ vmovdqu ymm7, YMMWORD PTR [rcx+992]
+ vpaddd ymm8, ymm0, ymm11
+ vpaddd ymm9, ymm1, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm0, ymm0, ymm8
+ vpsubd ymm1, ymm1, ymm9
+ vpaddd ymm8, ymm2, ymm11
+ vpaddd ymm9, ymm3, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm2, ymm2, ymm8
+ vpsubd ymm3, ymm3, ymm9
+ vpaddd ymm8, ymm4, ymm11
+ vpaddd ymm9, ymm5, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm4, ymm4, ymm8
+ vpsubd ymm5, ymm5, ymm9
+ vpaddd ymm8, ymm6, ymm11
+ vpaddd ymm9, ymm7, ymm11
+ vpsrad ymm8, ymm8, 23
+ vpsrad ymm9, ymm9, 23
+ vpmulld ymm8, ymm8, ymm10
+ vpmulld ymm9, ymm9, ymm10
+ vpsubd ymm6, ymm6, ymm8
+ vpsubd ymm7, ymm7, ymm9
+ vmovdqu YMMWORD PTR [rcx+768], ymm0 ; write the pass 4 results back
+ vmovdqu YMMWORD PTR [rcx+800], ymm1
+ vmovdqu YMMWORD PTR [rcx+832], ymm2
+ vmovdqu YMMWORD PTR [rcx+864], ymm3
+ vmovdqu YMMWORD PTR [rcx+896], ymm4
+ vmovdqu YMMWORD PTR [rcx+928], ymm5
+ vmovdqu YMMWORD PTR [rcx+960], ymm6
+ vmovdqu YMMWORD PTR [rcx+992], ymm7
+ vzeroupper ; zero the upper 128 bits of all YMM registers before returning
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the caller's xmm6-xmm11 from the spill area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96 ; release the spill area reserved at entry
+ ret
+wc_mldsa_poly_red_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_ntt_avx2 PROC
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vpxor ymm14, ymm14, ymm14
+ vmovdqu ymm14, YMMWORD PTR mldsa_q
+ ; ntt
+ mov rdx, QWORD PTR [ptr_L_mldsa_avx2_zetas]
+ vmovdqu ymm11, YMMWORD PTR [rdx+64]
+ vmovdqu ymm13, YMMWORD PTR [rdx+96]
+ ; 128: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx]
+ vmovdqu ymm12, YMMWORD PTR [rdx+32]
+ vmovdqu ymm0, YMMWORD PTR [rcx+96]
+ vmovdqu ymm1, YMMWORD PTR [rcx+224]
+ vmovdqu ymm2, YMMWORD PTR [rcx+352]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vmovdqu ymm4, YMMWORD PTR [rcx+608]
+ vmovdqu ymm5, YMMWORD PTR [rcx+736]
+ vmovdqu ymm6, YMMWORD PTR [rcx+864]
+ vmovdqu ymm7, YMMWORD PTR [rcx+992]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 64: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+128]
+ vmovdqu ymm12, YMMWORD PTR [rdx+160]
+ vpmulld ymm8, ymm2, ymm13
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+224], ymm1
+ vmovdqu YMMWORD PTR [rcx+352], ymm2
+ vmovdqu YMMWORD PTR [rcx+480], ymm3
+ vmovdqu YMMWORD PTR [rcx+608], ymm4
+ vmovdqu YMMWORD PTR [rcx+736], ymm5
+ vmovdqu YMMWORD PTR [rcx+864], ymm6
+ vmovdqu YMMWORD PTR [rcx+992], ymm7
+ ; 128: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx]
+ vmovdqu ymm12, YMMWORD PTR [rdx+32]
+ vmovdqu ymm0, YMMWORD PTR [rcx+64]
+ vmovdqu ymm1, YMMWORD PTR [rcx+192]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+448]
+ vmovdqu ymm4, YMMWORD PTR [rcx+576]
+ vmovdqu ymm5, YMMWORD PTR [rcx+704]
+ vmovdqu ymm6, YMMWORD PTR [rcx+832]
+ vmovdqu ymm7, YMMWORD PTR [rcx+960]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 64: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+128]
+ vmovdqu ymm12, YMMWORD PTR [rdx+160]
+ vpmulld ymm8, ymm2, ymm13
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+192], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+448], ymm3
+ vmovdqu YMMWORD PTR [rcx+576], ymm4
+ vmovdqu YMMWORD PTR [rcx+704], ymm5
+ vmovdqu YMMWORD PTR [rcx+832], ymm6
+ vmovdqu YMMWORD PTR [rcx+960], ymm7
+ ; 128: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx]
+ vmovdqu ymm12, YMMWORD PTR [rdx+32]
+ vmovdqu ymm0, YMMWORD PTR [rcx+32]
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vmovdqu ymm2, YMMWORD PTR [rcx+288]
+ vmovdqu ymm3, YMMWORD PTR [rcx+416]
+ vmovdqu ymm4, YMMWORD PTR [rcx+544]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+800]
+ vmovdqu ymm7, YMMWORD PTR [rcx+928]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 64: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+128]
+ vmovdqu ymm12, YMMWORD PTR [rdx+160]
+ vpmulld ymm8, ymm2, ymm13
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+160], ymm1
+ vmovdqu YMMWORD PTR [rcx+288], ymm2
+ vmovdqu YMMWORD PTR [rcx+416], ymm3
+ vmovdqu YMMWORD PTR [rcx+544], ymm4
+ vmovdqu YMMWORD PTR [rcx+672], ymm5
+ vmovdqu YMMWORD PTR [rcx+800], ymm6
+ vmovdqu YMMWORD PTR [rcx+928], ymm7
+ ; 128: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx]
+ vmovdqu ymm12, YMMWORD PTR [rdx+32]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+128]
+ vmovdqu ymm2, YMMWORD PTR [rcx+256]
+ vmovdqu ymm3, YMMWORD PTR [rcx+384]
+ vmovdqu ymm4, YMMWORD PTR [rcx+512]
+ vmovdqu ymm5, YMMWORD PTR [rcx+640]
+ vmovdqu ymm6, YMMWORD PTR [rcx+768]
+ vmovdqu ymm7, YMMWORD PTR [rcx+896]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 64: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+128]
+ vmovdqu ymm12, YMMWORD PTR [rdx+160]
+ vpmulld ymm8, ymm2, ymm13
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ vmovdqu YMMWORD PTR [rcx+256], ymm2
+ vmovdqu YMMWORD PTR [rcx+384], ymm3
+ vmovdqu YMMWORD PTR [rcx+512], ymm4
+ vmovdqu YMMWORD PTR [rcx+640], ymm5
+ vmovdqu YMMWORD PTR [rcx+768], ymm6
+ vmovdqu YMMWORD PTR [rcx+896], ymm7
+ vmovdqu ymm4, ymm1
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ ; 32: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+192]
+ vmovdqu ymm12, YMMWORD PTR [rdx+224]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 16: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+256]
+ vmovdqu ymm12, YMMWORD PTR [rdx+288]
+ vmovdqu ymm11, YMMWORD PTR [rdx+320]
+ vmovdqu ymm13, YMMWORD PTR [rdx+352]
+ vpmulld ymm8, ymm2, ymm12
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm12
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm13
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ ; 8: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+384]
+ vmovdqu ymm12, YMMWORD PTR [rdx+416]
+ vmovdqu ymm11, YMMWORD PTR [rdx+448]
+ vmovdqu ymm13, YMMWORD PTR [rdx+480]
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vmovdqu ymm10, YMMWORD PTR [rdx+512]
+ vmovdqu ymm12, YMMWORD PTR [rdx+544]
+ vmovdqu ymm11, YMMWORD PTR [rdx+576]
+ vmovdqu ymm13, YMMWORD PTR [rdx+608]
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 4: 1/4
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+640]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+672]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+704]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+736]
+ vpmulld ymm0, ymm1, ymm12
+ vmovshdup ymm2, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm1, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm1, ymm8, ymm0
+ vpaddd ymm8, ymm8, ymm0
+ vpmulld ymm0, ymm3, ymm13
+ vmovshdup ymm2, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm3, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm3, ymm9, ymm0
+ vpaddd ymm9, ymm9, ymm0
+ ; 2: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+768]
+ vmovdqu ymm12, YMMWORD PTR [rdx+800]
+ vmovdqu ymm11, YMMWORD PTR [rdx+832]
+ vmovdqu ymm13, YMMWORD PTR [rdx+864]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 4: 1/4
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+896]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+928]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+960]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+992]
+ vpmulld ymm4, ymm5, ymm12
+ vmovshdup ymm6, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm5, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm5, ymm8, ymm4
+ vpaddd ymm8, ymm8, ymm4
+ vpmulld ymm4, ymm7, ymm13
+ vmovshdup ymm6, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm7, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm7, ymm9, ymm4
+ vpaddd ymm9, ymm9, ymm4
+ ; 2: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1024]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1056]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1088]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1120]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 1: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1152]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1184]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1216]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1248]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 1: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1280]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1312]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1344]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1376]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ vmovdqu YMMWORD PTR [rcx+160], ymm5
+ vmovdqu YMMWORD PTR [rcx+192], ymm6
+ vmovdqu YMMWORD PTR [rcx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ ; 32: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1408]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1440]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 16: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1472]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1504]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1536]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1568]
+ vpmulld ymm8, ymm2, ymm12
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm12
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm13
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ ; 8: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1600]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1632]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1664]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1696]
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vmovdqu ymm10, YMMWORD PTR [rdx+1728]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1760]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1792]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1824]
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 4: 2/4
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+1856]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+1888]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+1920]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+1952]
+ vpmulld ymm0, ymm1, ymm12
+ vmovshdup ymm2, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm1, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm1, ymm8, ymm0
+ vpaddd ymm8, ymm8, ymm0
+ vpmulld ymm0, ymm3, ymm13
+ vmovshdup ymm2, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm3, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm3, ymm9, ymm0
+ vpaddd ymm9, ymm9, ymm0
+ ; 2: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1984]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2016]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2048]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2080]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 4: 2/4
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+2112]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+2144]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+2176]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+2208]
+ vpmulld ymm4, ymm5, ymm12
+ vmovshdup ymm6, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm5, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm5, ymm8, ymm4
+ vpaddd ymm8, ymm8, ymm4
+ vpmulld ymm4, ymm7, ymm13
+ vmovshdup ymm6, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm7, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm7, ymm9, ymm4
+ vpaddd ymm9, ymm9, ymm4
+ ; 2: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2240]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2272]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2304]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2336]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 1: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2368]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2400]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2432]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2464]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 1: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2496]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2528]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2560]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2592]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vmovdqu YMMWORD PTR [rcx+384], ymm4
+ vmovdqu YMMWORD PTR [rcx+416], ymm5
+ vmovdqu YMMWORD PTR [rcx+448], ymm6
+ vmovdqu YMMWORD PTR [rcx+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vmovdqu ymm4, YMMWORD PTR [rcx+640]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+704]
+ vmovdqu ymm7, YMMWORD PTR [rcx+736]
+ ; 32: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2624]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2656]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 16: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2688]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2720]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2752]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2784]
+ vpmulld ymm8, ymm2, ymm12
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm12
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm13
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ ; 8: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2816]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2848]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2880]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2912]
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vmovdqu ymm10, YMMWORD PTR [rdx+2944]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2976]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3008]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3040]
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 4: 3/4
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+3072]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+3104]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+3136]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+3168]
+ vpmulld ymm0, ymm1, ymm12
+ vmovshdup ymm2, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm1, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm1, ymm8, ymm0
+ vpaddd ymm8, ymm8, ymm0
+ vpmulld ymm0, ymm3, ymm13
+ vmovshdup ymm2, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm3, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm3, ymm9, ymm0
+ vpaddd ymm9, ymm9, ymm0
+ ; 2: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3200]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3232]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3264]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3296]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 4: 3/4
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+3328]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+3360]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+3392]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+3424]
+ vpmulld ymm4, ymm5, ymm12
+ vmovshdup ymm6, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm5, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm5, ymm8, ymm4
+ vpaddd ymm8, ymm8, ymm4
+ vpmulld ymm4, ymm7, ymm13
+ vmovshdup ymm6, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm7, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm7, ymm9, ymm4
+ vpaddd ymm9, ymm9, ymm4
+ ; 2: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3456]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3488]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3520]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3552]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 1: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3584]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3616]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3648]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3680]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 1: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3712]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3744]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3776]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3808]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ vmovdqu YMMWORD PTR [rcx+512], ymm0
+ vmovdqu YMMWORD PTR [rcx+544], ymm1
+ vmovdqu YMMWORD PTR [rcx+576], ymm2
+ vmovdqu YMMWORD PTR [rcx+608], ymm3
+ vmovdqu YMMWORD PTR [rcx+640], ymm4
+ vmovdqu YMMWORD PTR [rcx+672], ymm5
+ vmovdqu YMMWORD PTR [rcx+704], ymm6
+ vmovdqu YMMWORD PTR [rcx+736], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vmovdqu ymm4, YMMWORD PTR [rcx+896]
+ vmovdqu ymm5, YMMWORD PTR [rcx+928]
+ vmovdqu ymm6, YMMWORD PTR [rcx+960]
+ vmovdqu ymm7, YMMWORD PTR [rcx+992]
+ ; 32: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3840]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3872]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 16: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3904]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3936]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3968]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4000]
+ vpmulld ymm8, ymm2, ymm12
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm12
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm13
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ ; 8: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4032]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4064]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4096]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4128]
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vmovdqu ymm10, YMMWORD PTR [rdx+4160]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4192]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4224]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4256]
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 4: 4/4
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+4288]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+4320]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+4352]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+4384]
+ vpmulld ymm0, ymm1, ymm12
+ vmovshdup ymm2, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm1, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm1, ymm8, ymm0
+ vpaddd ymm8, ymm8, ymm0
+ vpmulld ymm0, ymm3, ymm13
+ vmovshdup ymm2, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm3, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm3, ymm9, ymm0
+ vpaddd ymm9, ymm9, ymm0
+ ; 2: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4416]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4448]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4480]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4512]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 4: 4/4
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+4544]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+4576]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+4608]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+4640]
+ vpmulld ymm4, ymm5, ymm12
+ vmovshdup ymm6, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm5, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm5, ymm8, ymm4
+ vpaddd ymm8, ymm8, ymm4
+ vpmulld ymm4, ymm7, ymm13
+ vmovshdup ymm6, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm7, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm7, ymm9, ymm4
+ vpaddd ymm9, ymm9, ymm4
+ ; 2: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4672]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4704]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4736]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4768]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 1: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4800]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4832]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4896]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 1: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm13, YMMWORD PTR [rdx+5024]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ vmovdqu YMMWORD PTR [rcx+768], ymm0
+ vmovdqu YMMWORD PTR [rcx+800], ymm1
+ vmovdqu YMMWORD PTR [rcx+832], ymm2
+ vmovdqu YMMWORD PTR [rcx+864], ymm3
+ vmovdqu YMMWORD PTR [rcx+896], ymm4
+ vmovdqu YMMWORD PTR [rcx+928], ymm5
+ vmovdqu YMMWORD PTR [rcx+960], ymm6
+ vmovdqu YMMWORD PTR [rcx+992], ymm7
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+wc_mldsa_ntt_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_ntt_full_avx2 PROC
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vpxor ymm14, ymm14, ymm14
+ vmovdqu ymm14, YMMWORD PTR mldsa_q
+ ; ntt
+ mov rdx, QWORD PTR [ptr_L_mldsa_avx2_zetas]
+ vmovdqu ymm11, YMMWORD PTR [rdx+64]
+ vmovdqu ymm13, YMMWORD PTR [rdx+96]
+ ; 128: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx]
+ vmovdqu ymm12, YMMWORD PTR [rdx+32]
+ vmovdqu ymm0, YMMWORD PTR [rcx+96]
+ vmovdqu ymm1, YMMWORD PTR [rcx+224]
+ vmovdqu ymm2, YMMWORD PTR [rcx+352]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vmovdqu ymm4, YMMWORD PTR [rcx+608]
+ vmovdqu ymm5, YMMWORD PTR [rcx+736]
+ vmovdqu ymm6, YMMWORD PTR [rcx+864]
+ vmovdqu ymm7, YMMWORD PTR [rcx+992]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 64: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+128]
+ vmovdqu ymm12, YMMWORD PTR [rdx+160]
+ vpmulld ymm8, ymm2, ymm13
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+224], ymm1
+ vmovdqu YMMWORD PTR [rcx+352], ymm2
+ vmovdqu YMMWORD PTR [rcx+480], ymm3
+ vmovdqu YMMWORD PTR [rcx+608], ymm4
+ vmovdqu YMMWORD PTR [rcx+736], ymm5
+ vmovdqu YMMWORD PTR [rcx+864], ymm6
+ vmovdqu YMMWORD PTR [rcx+992], ymm7
+ ; 128: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx]
+ vmovdqu ymm12, YMMWORD PTR [rdx+32]
+ vmovdqu ymm0, YMMWORD PTR [rcx+64]
+ vmovdqu ymm1, YMMWORD PTR [rcx+192]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+448]
+ vmovdqu ymm4, YMMWORD PTR [rcx+576]
+ vmovdqu ymm5, YMMWORD PTR [rcx+704]
+ vmovdqu ymm6, YMMWORD PTR [rcx+832]
+ vmovdqu ymm7, YMMWORD PTR [rcx+960]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 64: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+128]
+ vmovdqu ymm12, YMMWORD PTR [rdx+160]
+ vpmulld ymm8, ymm2, ymm13
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+192], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+448], ymm3
+ vmovdqu YMMWORD PTR [rcx+576], ymm4
+ vmovdqu YMMWORD PTR [rcx+704], ymm5
+ vmovdqu YMMWORD PTR [rcx+832], ymm6
+ vmovdqu YMMWORD PTR [rcx+960], ymm7
+ ; 128: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx]
+ vmovdqu ymm12, YMMWORD PTR [rdx+32]
+ vmovdqu ymm0, YMMWORD PTR [rcx+32]
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vmovdqu ymm2, YMMWORD PTR [rcx+288]
+ vmovdqu ymm3, YMMWORD PTR [rcx+416]
+ vmovdqu ymm4, YMMWORD PTR [rcx+544]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+800]
+ vmovdqu ymm7, YMMWORD PTR [rcx+928]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 64: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+128]
+ vmovdqu ymm12, YMMWORD PTR [rdx+160]
+ vpmulld ymm8, ymm2, ymm13
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+160], ymm1
+ vmovdqu YMMWORD PTR [rcx+288], ymm2
+ vmovdqu YMMWORD PTR [rcx+416], ymm3
+ vmovdqu YMMWORD PTR [rcx+544], ymm4
+ vmovdqu YMMWORD PTR [rcx+672], ymm5
+ vmovdqu YMMWORD PTR [rcx+800], ymm6
+ vmovdqu YMMWORD PTR [rcx+928], ymm7
+ ; 128: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx]
+ vmovdqu ymm12, YMMWORD PTR [rdx+32]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+128]
+ vmovdqu ymm2, YMMWORD PTR [rcx+256]
+ vmovdqu ymm3, YMMWORD PTR [rcx+384]
+ vmovdqu ymm4, YMMWORD PTR [rcx+512]
+ vmovdqu ymm5, YMMWORD PTR [rcx+640]
+ vmovdqu ymm6, YMMWORD PTR [rcx+768]
+ vmovdqu ymm7, YMMWORD PTR [rcx+896]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 64: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+128]
+ vmovdqu ymm12, YMMWORD PTR [rdx+160]
+ vpmulld ymm8, ymm2, ymm13
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ vmovdqu YMMWORD PTR [rcx+256], ymm2
+ vmovdqu YMMWORD PTR [rcx+384], ymm3
+ vmovdqu YMMWORD PTR [rcx+512], ymm4
+ vmovdqu YMMWORD PTR [rcx+640], ymm5
+ vmovdqu YMMWORD PTR [rcx+768], ymm6
+ vmovdqu YMMWORD PTR [rcx+896], ymm7
+ vmovdqu ymm4, ymm1
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ ; 32: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+192]
+ vmovdqu ymm12, YMMWORD PTR [rdx+224]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 16: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+256]
+ vmovdqu ymm12, YMMWORD PTR [rdx+288]
+ vmovdqu ymm11, YMMWORD PTR [rdx+320]
+ vmovdqu ymm13, YMMWORD PTR [rdx+352]
+ vpmulld ymm8, ymm2, ymm12
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm12
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm13
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ ; 8: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+384]
+ vmovdqu ymm12, YMMWORD PTR [rdx+416]
+ vmovdqu ymm11, YMMWORD PTR [rdx+448]
+ vmovdqu ymm13, YMMWORD PTR [rdx+480]
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vmovdqu ymm10, YMMWORD PTR [rdx+512]
+ vmovdqu ymm12, YMMWORD PTR [rdx+544]
+ vmovdqu ymm11, YMMWORD PTR [rdx+576]
+ vmovdqu ymm13, YMMWORD PTR [rdx+608]
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 4: 1/4
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+640]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+672]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+704]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+736]
+ vpmulld ymm0, ymm1, ymm12
+ vmovshdup ymm2, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm1, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm1, ymm8, ymm0
+ vpaddd ymm8, ymm8, ymm0
+ vpmulld ymm0, ymm3, ymm13
+ vmovshdup ymm2, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm3, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm3, ymm9, ymm0
+ vpaddd ymm9, ymm9, ymm0
+ ; 2: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+768]
+ vmovdqu ymm12, YMMWORD PTR [rdx+800]
+ vmovdqu ymm11, YMMWORD PTR [rdx+832]
+ vmovdqu ymm13, YMMWORD PTR [rdx+864]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 4: 1/4
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+896]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+928]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+960]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+992]
+ vpmulld ymm4, ymm5, ymm12
+ vmovshdup ymm6, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm5, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm5, ymm8, ymm4
+ vpaddd ymm8, ymm8, ymm4
+ vpmulld ymm4, ymm7, ymm13
+ vmovshdup ymm6, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm7, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm7, ymm9, ymm4
+ vpaddd ymm9, ymm9, ymm4
+ ; 2: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1024]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1056]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1088]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1120]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 1: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1152]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1184]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1216]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1248]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 1: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1280]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1312]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1344]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1376]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ vmovdqu YMMWORD PTR [rcx+160], ymm5
+ vmovdqu YMMWORD PTR [rcx+192], ymm6
+ vmovdqu YMMWORD PTR [rcx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ ; 32: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1408]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1440]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 16: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1472]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1504]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1536]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1568]
+ vpmulld ymm8, ymm2, ymm12
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm12
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm13
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ ; 8: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1600]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1632]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1664]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1696]
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vmovdqu ymm10, YMMWORD PTR [rdx+1728]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1760]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1792]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1824]
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 4: 2/4
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+1856]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+1888]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+1920]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+1952]
+ vpmulld ymm0, ymm1, ymm12
+ vmovshdup ymm2, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm1, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm1, ymm8, ymm0
+ vpaddd ymm8, ymm8, ymm0
+ vpmulld ymm0, ymm3, ymm13
+ vmovshdup ymm2, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm3, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm3, ymm9, ymm0
+ vpaddd ymm9, ymm9, ymm0
+ ; 2: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1984]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2016]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2048]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2080]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 4: 2/4
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+2112]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+2144]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+2176]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+2208]
+ vpmulld ymm4, ymm5, ymm12
+ vmovshdup ymm6, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm5, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm5, ymm8, ymm4
+ vpaddd ymm8, ymm8, ymm4
+ vpmulld ymm4, ymm7, ymm13
+ vmovshdup ymm6, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm7, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm7, ymm9, ymm4
+ vpaddd ymm9, ymm9, ymm4
+ ; 2: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2240]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2272]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2304]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2336]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 1: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2368]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2400]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2432]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2464]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 1: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2496]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2528]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2560]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2592]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vmovdqu YMMWORD PTR [rcx+384], ymm4
+ vmovdqu YMMWORD PTR [rcx+416], ymm5
+ vmovdqu YMMWORD PTR [rcx+448], ymm6
+ vmovdqu YMMWORD PTR [rcx+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vmovdqu ymm4, YMMWORD PTR [rcx+640]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+704]
+ vmovdqu ymm7, YMMWORD PTR [rcx+736]
+ ; 32: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2624]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2656]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 16: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2688]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2720]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2752]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2784]
+ vpmulld ymm8, ymm2, ymm12
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm12
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm13
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ ; 8: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2816]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2848]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2880]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2912]
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vmovdqu ymm10, YMMWORD PTR [rdx+2944]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2976]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3008]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3040]
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 4: 3/4
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+3072]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+3104]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+3136]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+3168]
+ vpmulld ymm0, ymm1, ymm12
+ vmovshdup ymm2, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm1, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm1, ymm8, ymm0
+ vpaddd ymm8, ymm8, ymm0
+ vpmulld ymm0, ymm3, ymm13
+ vmovshdup ymm2, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm3, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm3, ymm9, ymm0
+ vpaddd ymm9, ymm9, ymm0
+ ; 2: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3200]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3232]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3264]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3296]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 4: 3/4
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+3328]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+3360]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+3392]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+3424]
+ vpmulld ymm4, ymm5, ymm12
+ vmovshdup ymm6, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm5, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm5, ymm8, ymm4
+ vpaddd ymm8, ymm8, ymm4
+ vpmulld ymm4, ymm7, ymm13
+ vmovshdup ymm6, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm7, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm7, ymm9, ymm4
+ vpaddd ymm9, ymm9, ymm4
+ ; 2: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3456]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3488]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3520]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3552]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 1: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3584]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3616]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3648]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3680]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 1: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3712]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3744]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3776]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3808]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vmovdqu YMMWORD PTR [rcx+512], ymm0
+ vmovdqu YMMWORD PTR [rcx+544], ymm1
+ vmovdqu YMMWORD PTR [rcx+576], ymm2
+ vmovdqu YMMWORD PTR [rcx+608], ymm3
+ vmovdqu YMMWORD PTR [rcx+640], ymm4
+ vmovdqu YMMWORD PTR [rcx+672], ymm5
+ vmovdqu YMMWORD PTR [rcx+704], ymm6
+ vmovdqu YMMWORD PTR [rcx+736], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vmovdqu ymm4, YMMWORD PTR [rcx+896]
+ vmovdqu ymm5, YMMWORD PTR [rcx+928]
+ vmovdqu ymm6, YMMWORD PTR [rcx+960]
+ vmovdqu ymm7, YMMWORD PTR [rcx+992]
+ ; 32: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3840]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3872]
+ vpmulld ymm8, ymm4, ymm12
+ vmovshdup ymm9, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm4, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm12
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vpmulld ymm8, ymm7, ymm12
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm3, ymm8
+ vpaddd ymm3, ymm3, ymm8
+ ; 16: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3904]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3936]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3968]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4000]
+ vpmulld ymm8, ymm2, ymm12
+ vmovshdup ymm9, ymm2
+ vpmuldq ymm2, ymm2, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm2, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm12
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm1, ymm8
+ vpaddd ymm1, ymm1, ymm8
+ vpmulld ymm8, ymm6, ymm13
+ vmovshdup ymm9, ymm6
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm6, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm5, ymm8
+ vpaddd ymm5, ymm5, ymm8
+ ; 8: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4032]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4064]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4096]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4128]
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ vmovdqu ymm10, YMMWORD PTR [rdx+4160]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4192]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4224]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4256]
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 4: 4/4
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+4288]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+4320]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+4352]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+4384]
+ vpmulld ymm0, ymm1, ymm12
+ vmovshdup ymm2, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm1, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm1, ymm8, ymm0
+ vpaddd ymm8, ymm8, ymm0
+ vpmulld ymm0, ymm3, ymm13
+ vmovshdup ymm2, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm0
+ vpmuldq ymm0, ymm0, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm0, ymm3, ymm0
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm0, ymm0
+ vpblendd ymm0, ymm0, ymm15, 170
+ vpsubd ymm3, ymm9, ymm0
+ vpaddd ymm9, ymm9, ymm0
+ ; 2: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4416]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4448]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4480]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4512]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 4: 4/4
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+4544]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+4576]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+4608]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+4640]
+ vpmulld ymm4, ymm5, ymm12
+ vmovshdup ymm6, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm5, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm5, ymm8, ymm4
+ vpaddd ymm8, ymm8, ymm4
+ vpmulld ymm4, ymm7, ymm13
+ vmovshdup ymm6, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm7, ymm4
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm7, ymm9, ymm4
+ vpaddd ymm9, ymm9, ymm4
+ ; 2: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4672]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4704]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4736]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4768]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ ; 1: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4800]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4832]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4896]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmulld ymm8, ymm1, ymm12
+ vmovshdup ymm9, ymm1
+ vpmuldq ymm1, ymm1, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm1, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm1, ymm0, ymm8
+ vpaddd ymm0, ymm0, ymm8
+ vpmulld ymm8, ymm3, ymm13
+ vmovshdup ymm9, ymm3
+ vpmuldq ymm3, ymm3, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm3, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm3, ymm2, ymm8
+ vpaddd ymm2, ymm2, ymm8
+ ; 1: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm13, YMMWORD PTR [rdx+5024]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmulld ymm8, ymm5, ymm12
+ vmovshdup ymm9, ymm5
+ vpmuldq ymm5, ymm5, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm5, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm5, ymm4, ymm8
+ vpaddd ymm4, ymm4, ymm8
+ vpmulld ymm8, ymm7, ymm13
+ vmovshdup ymm9, ymm7
+ vpmuldq ymm7, ymm7, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm7, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm8, ymm8
+ vpblendd ymm8, ymm8, ymm15, 170
+ vpsubd ymm7, ymm6, ymm8
+ vpaddd ymm6, ymm6, ymm8
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vmovdqu YMMWORD PTR [rcx+768], ymm0
+ vmovdqu YMMWORD PTR [rcx+800], ymm1
+ vmovdqu YMMWORD PTR [rcx+832], ymm2
+ vmovdqu YMMWORD PTR [rcx+864], ymm3
+ vmovdqu YMMWORD PTR [rcx+896], ymm4
+ vmovdqu YMMWORD PTR [rcx+928], ymm5
+ vmovdqu YMMWORD PTR [rcx+960], ymm6
+ vmovdqu YMMWORD PTR [rcx+992], ymm7
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+wc_mldsa_ntt_full_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_invntt_avx2 PROC
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vpxor ymm14, ymm14, ymm14
+ vmovdqu ymm14, YMMWORD PTR mldsa_q
+ ; invntt
+ mov rdx, QWORD PTR [ptr_L_mldsa_avx2_zetas_inv]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ ; 1: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx]
+ vmovdqu ymm12, YMMWORD PTR [rdx+32]
+ vmovdqu ymm11, YMMWORD PTR [rdx+64]
+ vmovdqu ymm13, YMMWORD PTR [rdx+96]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 2: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+128]
+ vmovdqu ymm12, YMMWORD PTR [rdx+160]
+ vmovdqu ymm11, YMMWORD PTR [rdx+192]
+ vmovdqu ymm13, YMMWORD PTR [rdx+224]
+ vpshufd ymm8, ymm0, 216
+ vpshufd ymm9, ymm1, 216
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpshufd ymm8, ymm2, 216
+ vpshufd ymm9, ymm3, 216
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 4: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+256]
+ vmovdqu ymm12, YMMWORD PTR [rdx+288]
+ vmovdqu ymm11, YMMWORD PTR [rdx+320]
+ vmovdqu ymm13, YMMWORD PTR [rdx+352]
+ vpunpcklqdq ymm8, ymm0, ymm1
+ vpunpckhqdq ymm1, ymm0, ymm1
+ vpunpcklqdq ymm9, ymm2, ymm3
+ vpunpckhqdq ymm3, ymm2, ymm3
+ vpsubd ymm0, ymm8, ymm1
+ vpaddd ymm8, ymm8, ymm1
+ vpmulld ymm1, ymm0, ymm12
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm0, ymm1
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm0, ymm9, ymm3
+ vpaddd ymm9, ymm9, ymm3
+ vpmulld ymm3, ymm0, ymm13
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm0, ymm3
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 8: 1/4
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+384]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+416]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+448]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+480]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 16: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+512]
+ vmovdqu ymm12, YMMWORD PTR [rdx+544]
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 1: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+576]
+ vmovdqu ymm12, YMMWORD PTR [rdx+608]
+ vmovdqu ymm11, YMMWORD PTR [rdx+640]
+ vmovdqu ymm13, YMMWORD PTR [rdx+672]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 2: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+704]
+ vmovdqu ymm12, YMMWORD PTR [rdx+736]
+ vmovdqu ymm11, YMMWORD PTR [rdx+768]
+ vmovdqu ymm13, YMMWORD PTR [rdx+800]
+ vpshufd ymm8, ymm4, 216
+ vpshufd ymm9, ymm5, 216
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpshufd ymm8, ymm6, 216
+ vpshufd ymm9, ymm7, 216
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 4: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+832]
+ vmovdqu ymm12, YMMWORD PTR [rdx+864]
+ vmovdqu ymm11, YMMWORD PTR [rdx+896]
+ vmovdqu ymm13, YMMWORD PTR [rdx+928]
+ vpunpcklqdq ymm8, ymm4, ymm5
+ vpunpckhqdq ymm5, ymm4, ymm5
+ vpunpcklqdq ymm9, ymm6, ymm7
+ vpunpckhqdq ymm7, ymm6, ymm7
+ vpsubd ymm4, ymm8, ymm5
+ vpaddd ymm8, ymm8, ymm5
+ vpmulld ymm5, ymm4, ymm12
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm4, ymm5
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm4, ymm9, ymm7
+ vpaddd ymm9, ymm9, ymm7
+ vpmulld ymm7, ymm4, ymm13
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm4, ymm7
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 8: 1/4
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+960]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+992]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+1024]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+1056]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 16: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1088]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1120]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 32: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1152]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1184]
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ vmovdqu YMMWORD PTR [rcx+160], ymm5
+ vmovdqu YMMWORD PTR [rcx+192], ymm6
+ vmovdqu YMMWORD PTR [rcx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ ; 1: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1216]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1248]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1280]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1312]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 2: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1344]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1376]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1408]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1440]
+ vpshufd ymm8, ymm0, 216
+ vpshufd ymm9, ymm1, 216
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpshufd ymm8, ymm2, 216
+ vpshufd ymm9, ymm3, 216
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 4: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1472]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1504]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1536]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1568]
+ vpunpcklqdq ymm8, ymm0, ymm1
+ vpunpckhqdq ymm1, ymm0, ymm1
+ vpunpcklqdq ymm9, ymm2, ymm3
+ vpunpckhqdq ymm3, ymm2, ymm3
+ vpsubd ymm0, ymm8, ymm1
+ vpaddd ymm8, ymm8, ymm1
+ vpmulld ymm1, ymm0, ymm12
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm0, ymm1
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm0, ymm9, ymm3
+ vpaddd ymm9, ymm9, ymm3
+ vpmulld ymm3, ymm0, ymm13
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm0, ymm3
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 8: 2/4
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+1600]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+1632]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+1664]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+1696]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 16: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1728]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1760]
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 1: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1792]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1824]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1856]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1888]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 2: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1920]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1952]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1984]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2016]
+ vpshufd ymm8, ymm4, 216
+ vpshufd ymm9, ymm5, 216
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpshufd ymm8, ymm6, 216
+ vpshufd ymm9, ymm7, 216
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 4: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2048]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2080]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2112]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2144]
+ vpunpcklqdq ymm8, ymm4, ymm5
+ vpunpckhqdq ymm5, ymm4, ymm5
+ vpunpcklqdq ymm9, ymm6, ymm7
+ vpunpckhqdq ymm7, ymm6, ymm7
+ vpsubd ymm4, ymm8, ymm5
+ vpaddd ymm8, ymm8, ymm5
+ vpmulld ymm5, ymm4, ymm12
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm4, ymm5
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm4, ymm9, ymm7
+ vpaddd ymm9, ymm9, ymm7
+ vpmulld ymm7, ymm4, ymm13
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm4, ymm7
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 8: 2/4
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+2176]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+2208]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+2240]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+2272]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 16: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2304]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2336]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 32: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2368]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2400]
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vmovdqu YMMWORD PTR [rcx+384], ymm4
+ vmovdqu YMMWORD PTR [rcx+416], ymm5
+ vmovdqu YMMWORD PTR [rcx+448], ymm6
+ vmovdqu YMMWORD PTR [rcx+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vmovdqu ymm4, YMMWORD PTR [rcx+640]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+704]
+ vmovdqu ymm7, YMMWORD PTR [rcx+736]
+ ; 1: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2432]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2464]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2496]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2528]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 2: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2560]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2592]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2624]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2656]
+ vpshufd ymm8, ymm0, 216
+ vpshufd ymm9, ymm1, 216
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpshufd ymm8, ymm2, 216
+ vpshufd ymm9, ymm3, 216
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 4: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2688]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2720]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2752]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2784]
+ vpunpcklqdq ymm8, ymm0, ymm1
+ vpunpckhqdq ymm1, ymm0, ymm1
+ vpunpcklqdq ymm9, ymm2, ymm3
+ vpunpckhqdq ymm3, ymm2, ymm3
+ vpsubd ymm0, ymm8, ymm1
+ vpaddd ymm8, ymm8, ymm1
+ vpmulld ymm1, ymm0, ymm12
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm0, ymm1
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm0, ymm9, ymm3
+ vpaddd ymm9, ymm9, ymm3
+ vpmulld ymm3, ymm0, ymm13
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm0, ymm3
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 8: 3/4
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+2816]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+2848]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+2880]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+2912]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 16: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2944]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2976]
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 1: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3008]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3040]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3072]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3104]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 2: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3136]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3168]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3200]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3232]
+ vpshufd ymm8, ymm4, 216
+ vpshufd ymm9, ymm5, 216
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpshufd ymm8, ymm6, 216
+ vpshufd ymm9, ymm7, 216
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 4: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3264]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3296]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3328]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3360]
+ vpunpcklqdq ymm8, ymm4, ymm5
+ vpunpckhqdq ymm5, ymm4, ymm5
+ vpunpcklqdq ymm9, ymm6, ymm7
+ vpunpckhqdq ymm7, ymm6, ymm7
+ vpsubd ymm4, ymm8, ymm5
+ vpaddd ymm8, ymm8, ymm5
+ vpmulld ymm5, ymm4, ymm12
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm4, ymm5
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm4, ymm9, ymm7
+ vpaddd ymm9, ymm9, ymm7
+ vpmulld ymm7, ymm4, ymm13
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm4, ymm7
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 8: 3/4
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+3392]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+3424]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+3456]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+3488]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 16: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3520]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3552]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 32: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3584]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3616]
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vmovdqu YMMWORD PTR [rcx+512], ymm0
+ vmovdqu YMMWORD PTR [rcx+544], ymm1
+ vmovdqu YMMWORD PTR [rcx+576], ymm2
+ vmovdqu YMMWORD PTR [rcx+608], ymm3
+ vmovdqu YMMWORD PTR [rcx+640], ymm4
+ vmovdqu YMMWORD PTR [rcx+672], ymm5
+ vmovdqu YMMWORD PTR [rcx+704], ymm6
+ vmovdqu YMMWORD PTR [rcx+736], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vmovdqu ymm4, YMMWORD PTR [rcx+896]
+ vmovdqu ymm5, YMMWORD PTR [rcx+928]
+ vmovdqu ymm6, YMMWORD PTR [rcx+960]
+ vmovdqu ymm7, YMMWORD PTR [rcx+992]
+ ; 1: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3648]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3680]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3712]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3744]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 2: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3776]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3808]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3840]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3872]
+ vpshufd ymm8, ymm0, 216
+ vpshufd ymm9, ymm1, 216
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpshufd ymm8, ymm2, 216
+ vpshufd ymm9, ymm3, 216
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 4: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3904]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3936]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3968]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4000]
+ vpunpcklqdq ymm8, ymm0, ymm1
+ vpunpckhqdq ymm1, ymm0, ymm1
+ vpunpcklqdq ymm9, ymm2, ymm3
+ vpunpckhqdq ymm3, ymm2, ymm3
+ vpsubd ymm0, ymm8, ymm1
+ vpaddd ymm8, ymm8, ymm1
+ vpmulld ymm1, ymm0, ymm12
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm0, ymm1
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm0, ymm9, ymm3
+ vpaddd ymm9, ymm9, ymm3
+ vpmulld ymm3, ymm0, ymm13
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm0, ymm3
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 8: 4/4
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+4032]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+4064]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+4096]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+4128]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 16: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4160]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4192]
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 1: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4224]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4256]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4288]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4320]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 2: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4352]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4384]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4416]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4448]
+ vpshufd ymm8, ymm4, 216
+ vpshufd ymm9, ymm5, 216
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpshufd ymm8, ymm6, 216
+ vpshufd ymm9, ymm7, 216
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 4: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4480]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4512]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4544]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4576]
+ vpunpcklqdq ymm8, ymm4, ymm5
+ vpunpckhqdq ymm5, ymm4, ymm5
+ vpunpcklqdq ymm9, ymm6, ymm7
+ vpunpckhqdq ymm7, ymm6, ymm7
+ vpsubd ymm4, ymm8, ymm5
+ vpaddd ymm8, ymm8, ymm5
+ vpmulld ymm5, ymm4, ymm12
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm4, ymm5
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm4, ymm9, ymm7
+ vpaddd ymm9, ymm9, ymm7
+ vpmulld ymm7, ymm4, ymm13
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm4, ymm7
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 8: 4/4
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+4608]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+4640]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+4672]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+4704]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 16: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4736]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4768]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 32: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4800]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4832]
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vmovdqu YMMWORD PTR [rcx+768], ymm0
+ vmovdqu YMMWORD PTR [rcx+800], ymm1
+ vmovdqu YMMWORD PTR [rcx+832], ymm2
+ vmovdqu YMMWORD PTR [rcx+896], ymm4
+ vmovdqu YMMWORD PTR [rcx+928], ymm5
+ vmovdqu YMMWORD PTR [rcx+960], ymm6
+ vmovdqu ymm10, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4896]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm6, ymm3
+ vmovdqu ymm0, YMMWORD PTR [rcx+96]
+ vmovdqu ymm1, YMMWORD PTR [rcx+224]
+ vmovdqu ymm2, YMMWORD PTR [rcx+352]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vmovdqu ymm4, YMMWORD PTR [rcx+608]
+ vmovdqu ymm5, YMMWORD PTR [rcx+736]
+ ; 64: 4/4
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vmovdqu ymm10, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm12, YMMWORD PTR [rdx+5024]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 128: 4/4
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vmovdqu ymm11, YMMWORD PTR [rdx+5056]
+ vmovdqu ymm13, YMMWORD PTR [rdx+5088]
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vpmulld ymm8, ymm0, ymm13
+ vpmulld ymm10, ymm1, ymm13
+ vmovshdup ymm9, ymm0
+ vmovshdup ymm12, ymm1
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm1, ymm1, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm0, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm1, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm0, ymm8, ymm15, 170
+ vpblendd ymm1, ymm10, ymm9, 170
+ vpmulld ymm8, ymm2, ymm13
+ vpmulld ymm10, ymm3, ymm13
+ vmovshdup ymm9, ymm2
+ vmovshdup ymm12, ymm3
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm3, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm2, ymm8, ymm15, 170
+ vpblendd ymm3, ymm10, ymm9, 170
+ vpmulld ymm8, ymm4, ymm13
+ vpmulld ymm10, ymm5, ymm13
+ vmovshdup ymm9, ymm4
+ vmovshdup ymm12, ymm5
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm5, ymm5, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm5, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm4, ymm8, ymm15, 170
+ vpblendd ymm5, ymm10, ymm9, 170
+ vpmulld ymm8, ymm6, ymm13
+ vpmulld ymm10, ymm7, ymm13
+ vmovshdup ymm9, ymm6
+ vmovshdup ymm12, ymm7
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm7, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm6, ymm8, ymm15, 170
+ vpblendd ymm7, ymm10, ymm9, 170
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+224], ymm1
+ vmovdqu YMMWORD PTR [rcx+352], ymm2
+ vmovdqu YMMWORD PTR [rcx+480], ymm3
+ vmovdqu YMMWORD PTR [rcx+608], ymm4
+ vmovdqu YMMWORD PTR [rcx+736], ymm5
+ vmovdqu YMMWORD PTR [rcx+864], ymm6
+ vmovdqu YMMWORD PTR [rcx+992], ymm7
+ vmovdqu ymm10, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4896]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm0, YMMWORD PTR [rcx+64]
+ vmovdqu ymm1, YMMWORD PTR [rcx+192]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+448]
+ vmovdqu ymm4, YMMWORD PTR [rcx+576]
+ vmovdqu ymm5, YMMWORD PTR [rcx+704]
+ vmovdqu ymm6, YMMWORD PTR [rcx+832]
+ vmovdqu ymm7, YMMWORD PTR [rcx+960]
+ ; 64: 3/4
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vmovdqu ymm10, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm12, YMMWORD PTR [rdx+5024]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 128: 3/4
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vmovdqu ymm11, YMMWORD PTR [rdx+5056]
+ vmovdqu ymm13, YMMWORD PTR [rdx+5088]
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vpmulld ymm8, ymm0, ymm13
+ vpmulld ymm10, ymm1, ymm13
+ vmovshdup ymm9, ymm0
+ vmovshdup ymm12, ymm1
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm1, ymm1, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm0, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm1, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm0, ymm8, ymm15, 170
+ vpblendd ymm1, ymm10, ymm9, 170
+ vpmulld ymm8, ymm2, ymm13
+ vpmulld ymm10, ymm3, ymm13
+ vmovshdup ymm9, ymm2
+ vmovshdup ymm12, ymm3
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm3, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm2, ymm8, ymm15, 170
+ vpblendd ymm3, ymm10, ymm9, 170
+ vpmulld ymm8, ymm4, ymm13
+ vpmulld ymm10, ymm5, ymm13
+ vmovshdup ymm9, ymm4
+ vmovshdup ymm12, ymm5
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm5, ymm5, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm5, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm4, ymm8, ymm15, 170
+ vpblendd ymm5, ymm10, ymm9, 170
+ vpmulld ymm8, ymm6, ymm13
+ vpmulld ymm10, ymm7, ymm13
+ vmovshdup ymm9, ymm6
+ vmovshdup ymm12, ymm7
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm7, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm6, ymm8, ymm15, 170
+ vpblendd ymm7, ymm10, ymm9, 170
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+192], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+448], ymm3
+ vmovdqu YMMWORD PTR [rcx+576], ymm4
+ vmovdqu YMMWORD PTR [rcx+704], ymm5
+ vmovdqu YMMWORD PTR [rcx+832], ymm6
+ vmovdqu YMMWORD PTR [rcx+960], ymm7
+ vmovdqu ymm10, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4896]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm0, YMMWORD PTR [rcx+32]
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vmovdqu ymm2, YMMWORD PTR [rcx+288]
+ vmovdqu ymm3, YMMWORD PTR [rcx+416]
+ vmovdqu ymm4, YMMWORD PTR [rcx+544]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+800]
+ vmovdqu ymm7, YMMWORD PTR [rcx+928]
+ ; 64: 2/4
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vmovdqu ymm10, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm12, YMMWORD PTR [rdx+5024]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 128: 2/4
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vmovdqu ymm11, YMMWORD PTR [rdx+5056]
+ vmovdqu ymm13, YMMWORD PTR [rdx+5088]
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vpmulld ymm8, ymm0, ymm13
+ vpmulld ymm10, ymm1, ymm13
+ vmovshdup ymm9, ymm0
+ vmovshdup ymm12, ymm1
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm1, ymm1, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm0, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm1, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm0, ymm8, ymm15, 170
+ vpblendd ymm1, ymm10, ymm9, 170
+ vpmulld ymm8, ymm2, ymm13
+ vpmulld ymm10, ymm3, ymm13
+ vmovshdup ymm9, ymm2
+ vmovshdup ymm12, ymm3
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm3, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm2, ymm8, ymm15, 170
+ vpblendd ymm3, ymm10, ymm9, 170
+ vpmulld ymm8, ymm4, ymm13
+ vpmulld ymm10, ymm5, ymm13
+ vmovshdup ymm9, ymm4
+ vmovshdup ymm12, ymm5
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm5, ymm5, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm5, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm4, ymm8, ymm15, 170
+ vpblendd ymm5, ymm10, ymm9, 170
+ vpmulld ymm8, ymm6, ymm13
+ vpmulld ymm10, ymm7, ymm13
+ vmovshdup ymm9, ymm6
+ vmovshdup ymm12, ymm7
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm7, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm6, ymm8, ymm15, 170
+ vpblendd ymm7, ymm10, ymm9, 170
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+160], ymm1
+ vmovdqu YMMWORD PTR [rcx+288], ymm2
+ vmovdqu YMMWORD PTR [rcx+416], ymm3
+ vmovdqu YMMWORD PTR [rcx+544], ymm4
+ vmovdqu YMMWORD PTR [rcx+672], ymm5
+ vmovdqu YMMWORD PTR [rcx+800], ymm6
+ vmovdqu YMMWORD PTR [rcx+928], ymm7
+ vmovdqu ymm10, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4896]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+128]
+ vmovdqu ymm2, YMMWORD PTR [rcx+256]
+ vmovdqu ymm3, YMMWORD PTR [rcx+384]
+ vmovdqu ymm4, YMMWORD PTR [rcx+512]
+ vmovdqu ymm5, YMMWORD PTR [rcx+640]
+ vmovdqu ymm6, YMMWORD PTR [rcx+768]
+ vmovdqu ymm7, YMMWORD PTR [rcx+896]
+ ; 64: 1/4
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vmovdqu ymm10, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm12, YMMWORD PTR [rdx+5024]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 128: 1/4
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vmovdqu ymm11, YMMWORD PTR [rdx+5056]
+ vmovdqu ymm13, YMMWORD PTR [rdx+5088]
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vpmulld ymm8, ymm0, ymm13
+ vpmulld ymm10, ymm1, ymm13
+ vmovshdup ymm9, ymm0
+ vmovshdup ymm12, ymm1
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm1, ymm1, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm0, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm1, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm0, ymm8, ymm15, 170
+ vpblendd ymm1, ymm10, ymm9, 170
+ vpmulld ymm8, ymm2, ymm13
+ vpmulld ymm10, ymm3, ymm13
+ vmovshdup ymm9, ymm2
+ vmovshdup ymm12, ymm3
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm3, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm2, ymm8, ymm15, 170
+ vpblendd ymm3, ymm10, ymm9, 170
+ vpmulld ymm8, ymm4, ymm13
+ vpmulld ymm10, ymm5, ymm13
+ vmovshdup ymm9, ymm4
+ vmovshdup ymm12, ymm5
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm5, ymm5, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm5, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm4, ymm8, ymm15, 170
+ vpblendd ymm5, ymm10, ymm9, 170
+ vpmulld ymm8, ymm6, ymm13
+ vpmulld ymm10, ymm7, ymm13
+ vmovshdup ymm9, ymm6
+ vmovshdup ymm12, ymm7
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm7, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm6, ymm8, ymm15, 170
+ vpblendd ymm7, ymm10, ymm9, 170
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+256], ymm2
+ vmovdqu YMMWORD PTR [rcx+384], ymm3
+ vmovdqu YMMWORD PTR [rcx+512], ymm4
+ vmovdqu YMMWORD PTR [rcx+640], ymm5
+ vmovdqu YMMWORD PTR [rcx+768], ymm6
+ vmovdqu YMMWORD PTR [rcx+896], ymm7
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+wc_mldsa_invntt_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_invntt_full_avx2 PROC
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vpxor ymm14, ymm14, ymm14
+ vmovdqu ymm14, YMMWORD PTR mldsa_q
+ ; invntt
+ mov rdx, QWORD PTR [ptr_L_mldsa_avx2_zetas_inv]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpermq ymm2, ymm2, 216
+ vpermq ymm3, ymm3, 216
+ ; 1: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx]
+ vmovdqu ymm12, YMMWORD PTR [rdx+32]
+ vmovdqu ymm11, YMMWORD PTR [rdx+64]
+ vmovdqu ymm13, YMMWORD PTR [rdx+96]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 2: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+128]
+ vmovdqu ymm12, YMMWORD PTR [rdx+160]
+ vmovdqu ymm11, YMMWORD PTR [rdx+192]
+ vmovdqu ymm13, YMMWORD PTR [rdx+224]
+ vpshufd ymm8, ymm0, 216
+ vpshufd ymm9, ymm1, 216
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpshufd ymm8, ymm2, 216
+ vpshufd ymm9, ymm3, 216
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 4: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+256]
+ vmovdqu ymm12, YMMWORD PTR [rdx+288]
+ vmovdqu ymm11, YMMWORD PTR [rdx+320]
+ vmovdqu ymm13, YMMWORD PTR [rdx+352]
+ vpunpcklqdq ymm8, ymm0, ymm1
+ vpunpckhqdq ymm1, ymm0, ymm1
+ vpunpcklqdq ymm9, ymm2, ymm3
+ vpunpckhqdq ymm3, ymm2, ymm3
+ vpsubd ymm0, ymm8, ymm1
+ vpaddd ymm8, ymm8, ymm1
+ vpmulld ymm1, ymm0, ymm12
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm0, ymm1
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm0, ymm9, ymm3
+ vpaddd ymm9, ymm9, ymm3
+ vpmulld ymm3, ymm0, ymm13
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm0, ymm3
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 8: 1/4
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+384]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+416]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+448]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+480]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 16: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+512]
+ vmovdqu ymm12, YMMWORD PTR [rdx+544]
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpermq ymm4, ymm4, 216
+ vpermq ymm5, ymm5, 216
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpermq ymm6, ymm6, 216
+ vpermq ymm7, ymm7, 216
+ ; 1: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+576]
+ vmovdqu ymm12, YMMWORD PTR [rdx+608]
+ vmovdqu ymm11, YMMWORD PTR [rdx+640]
+ vmovdqu ymm13, YMMWORD PTR [rdx+672]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 2: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+704]
+ vmovdqu ymm12, YMMWORD PTR [rdx+736]
+ vmovdqu ymm11, YMMWORD PTR [rdx+768]
+ vmovdqu ymm13, YMMWORD PTR [rdx+800]
+ vpshufd ymm8, ymm4, 216
+ vpshufd ymm9, ymm5, 216
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpshufd ymm8, ymm6, 216
+ vpshufd ymm9, ymm7, 216
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 4: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+832]
+ vmovdqu ymm12, YMMWORD PTR [rdx+864]
+ vmovdqu ymm11, YMMWORD PTR [rdx+896]
+ vmovdqu ymm13, YMMWORD PTR [rdx+928]
+ vpunpcklqdq ymm8, ymm4, ymm5
+ vpunpckhqdq ymm5, ymm4, ymm5
+ vpunpcklqdq ymm9, ymm6, ymm7
+ vpunpckhqdq ymm7, ymm6, ymm7
+ vpsubd ymm4, ymm8, ymm5
+ vpaddd ymm8, ymm8, ymm5
+ vpmulld ymm5, ymm4, ymm12
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm4, ymm5
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm4, ymm9, ymm7
+ vpaddd ymm9, ymm9, ymm7
+ vpmulld ymm7, ymm4, ymm13
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm4, ymm7
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 8: 1/4
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+960]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+992]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+1024]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+1056]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 16: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1088]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1120]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 32: 1/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1152]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1184]
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ vmovdqu YMMWORD PTR [rcx+160], ymm5
+ vmovdqu YMMWORD PTR [rcx+192], ymm6
+ vmovdqu YMMWORD PTR [rcx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpermq ymm2, ymm2, 216
+ vpermq ymm3, ymm3, 216
+ ; 1: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1216]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1248]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1280]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1312]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 2: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1344]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1376]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1408]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1440]
+ vpshufd ymm8, ymm0, 216
+ vpshufd ymm9, ymm1, 216
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpshufd ymm8, ymm2, 216
+ vpshufd ymm9, ymm3, 216
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 4: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1472]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1504]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1536]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1568]
+ vpunpcklqdq ymm8, ymm0, ymm1
+ vpunpckhqdq ymm1, ymm0, ymm1
+ vpunpcklqdq ymm9, ymm2, ymm3
+ vpunpckhqdq ymm3, ymm2, ymm3
+ vpsubd ymm0, ymm8, ymm1
+ vpaddd ymm8, ymm8, ymm1
+ vpmulld ymm1, ymm0, ymm12
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm0, ymm1
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm0, ymm9, ymm3
+ vpaddd ymm9, ymm9, ymm3
+ vpmulld ymm3, ymm0, ymm13
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm0, ymm3
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 8: 2/4
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+1600]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+1632]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+1664]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+1696]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 16: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1728]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1760]
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpermq ymm4, ymm4, 216
+ vpermq ymm5, ymm5, 216
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpermq ymm6, ymm6, 216
+ vpermq ymm7, ymm7, 216
+ ; 1: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1792]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1824]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1856]
+ vmovdqu ymm13, YMMWORD PTR [rdx+1888]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 2: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+1920]
+ vmovdqu ymm12, YMMWORD PTR [rdx+1952]
+ vmovdqu ymm11, YMMWORD PTR [rdx+1984]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2016]
+ vpshufd ymm8, ymm4, 216
+ vpshufd ymm9, ymm5, 216
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpshufd ymm8, ymm6, 216
+ vpshufd ymm9, ymm7, 216
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 4: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2048]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2080]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2112]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2144]
+ vpunpcklqdq ymm8, ymm4, ymm5
+ vpunpckhqdq ymm5, ymm4, ymm5
+ vpunpcklqdq ymm9, ymm6, ymm7
+ vpunpckhqdq ymm7, ymm6, ymm7
+ vpsubd ymm4, ymm8, ymm5
+ vpaddd ymm8, ymm8, ymm5
+ vpmulld ymm5, ymm4, ymm12
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm4, ymm5
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm4, ymm9, ymm7
+ vpaddd ymm9, ymm9, ymm7
+ vpmulld ymm7, ymm4, ymm13
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm4, ymm7
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 8: 2/4
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+2176]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+2208]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+2240]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+2272]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 16: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2304]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2336]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 32: 2/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2368]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2400]
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vmovdqu YMMWORD PTR [rcx+384], ymm4
+ vmovdqu YMMWORD PTR [rcx+416], ymm5
+ vmovdqu YMMWORD PTR [rcx+448], ymm6
+ vmovdqu YMMWORD PTR [rcx+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vmovdqu ymm4, YMMWORD PTR [rcx+640]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+704]
+ vmovdqu ymm7, YMMWORD PTR [rcx+736]
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpermq ymm2, ymm2, 216
+ vpermq ymm3, ymm3, 216
+ ; 1: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2432]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2464]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2496]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2528]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 2: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2560]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2592]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2624]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2656]
+ vpshufd ymm8, ymm0, 216
+ vpshufd ymm9, ymm1, 216
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpshufd ymm8, ymm2, 216
+ vpshufd ymm9, ymm3, 216
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 4: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2688]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2720]
+ vmovdqu ymm11, YMMWORD PTR [rdx+2752]
+ vmovdqu ymm13, YMMWORD PTR [rdx+2784]
+ vpunpcklqdq ymm8, ymm0, ymm1
+ vpunpckhqdq ymm1, ymm0, ymm1
+ vpunpcklqdq ymm9, ymm2, ymm3
+ vpunpckhqdq ymm3, ymm2, ymm3
+ vpsubd ymm0, ymm8, ymm1
+ vpaddd ymm8, ymm8, ymm1
+ vpmulld ymm1, ymm0, ymm12
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm0, ymm1
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm0, ymm9, ymm3
+ vpaddd ymm9, ymm9, ymm3
+ vpmulld ymm3, ymm0, ymm13
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm0, ymm3
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 8: 3/4
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+2816]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+2848]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+2880]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+2912]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 16: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+2944]
+ vmovdqu ymm12, YMMWORD PTR [rdx+2976]
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpermq ymm4, ymm4, 216
+ vpermq ymm5, ymm5, 216
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpermq ymm6, ymm6, 216
+ vpermq ymm7, ymm7, 216
+ ; 1: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3008]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3040]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3072]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3104]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 2: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3136]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3168]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3200]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3232]
+ vpshufd ymm8, ymm4, 216
+ vpshufd ymm9, ymm5, 216
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpshufd ymm8, ymm6, 216
+ vpshufd ymm9, ymm7, 216
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 4: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3264]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3296]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3328]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3360]
+ vpunpcklqdq ymm8, ymm4, ymm5
+ vpunpckhqdq ymm5, ymm4, ymm5
+ vpunpcklqdq ymm9, ymm6, ymm7
+ vpunpckhqdq ymm7, ymm6, ymm7
+ vpsubd ymm4, ymm8, ymm5
+ vpaddd ymm8, ymm8, ymm5
+ vpmulld ymm5, ymm4, ymm12
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm4, ymm5
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm4, ymm9, ymm7
+ vpaddd ymm9, ymm9, ymm7
+ vpmulld ymm7, ymm4, ymm13
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm4, ymm7
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 8: 3/4
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+3392]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+3424]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+3456]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+3488]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 16: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3520]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3552]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 32: 3/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3584]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3616]
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vmovdqu YMMWORD PTR [rcx+512], ymm0
+ vmovdqu YMMWORD PTR [rcx+544], ymm1
+ vmovdqu YMMWORD PTR [rcx+576], ymm2
+ vmovdqu YMMWORD PTR [rcx+608], ymm3
+ vmovdqu YMMWORD PTR [rcx+640], ymm4
+ vmovdqu YMMWORD PTR [rcx+672], ymm5
+ vmovdqu YMMWORD PTR [rcx+704], ymm6
+ vmovdqu YMMWORD PTR [rcx+736], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vmovdqu ymm4, YMMWORD PTR [rcx+896]
+ vmovdqu ymm5, YMMWORD PTR [rcx+928]
+ vmovdqu ymm6, YMMWORD PTR [rcx+960]
+ vmovdqu ymm7, YMMWORD PTR [rcx+992]
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpermq ymm2, ymm2, 216
+ vpermq ymm3, ymm3, 216
+ ; 1: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3648]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3680]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3712]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3744]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 2: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3776]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3808]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3840]
+ vmovdqu ymm13, YMMWORD PTR [rdx+3872]
+ vpshufd ymm8, ymm0, 216
+ vpshufd ymm9, ymm1, 216
+ vpunpckldq ymm0, ymm8, ymm9
+ vpunpckhdq ymm1, ymm8, ymm9
+ vpshufd ymm8, ymm2, 216
+ vpshufd ymm9, ymm3, 216
+ vpunpckldq ymm2, ymm8, ymm9
+ vpunpckhdq ymm3, ymm8, ymm9
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 4: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+3904]
+ vmovdqu ymm12, YMMWORD PTR [rdx+3936]
+ vmovdqu ymm11, YMMWORD PTR [rdx+3968]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4000]
+ vpunpcklqdq ymm8, ymm0, ymm1
+ vpunpckhqdq ymm1, ymm0, ymm1
+ vpunpcklqdq ymm9, ymm2, ymm3
+ vpunpckhqdq ymm3, ymm2, ymm3
+ vpsubd ymm0, ymm8, ymm1
+ vpaddd ymm8, ymm8, ymm1
+ vpmulld ymm1, ymm0, ymm12
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm10
+ vpmuldq ymm2, ymm2, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm0, ymm1
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm0, ymm9, ymm3
+ vpaddd ymm9, ymm9, ymm3
+ vpmulld ymm3, ymm0, ymm13
+ vmovshdup ymm2, ymm0
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm2, ymm2, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm0, ymm3
+ vpsubq ymm15, ymm2, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 8: 4/4
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+4032]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+4064]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+4096]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+4128]
+ vpsubd ymm8, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpmulld ymm1, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm1
+ vpmuldq ymm1, ymm1, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm1, ymm8, ymm1
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm1, ymm1
+ vpblendd ymm1, ymm1, ymm15, 170
+ vpsubd ymm8, ymm2, ymm3
+ vpaddd ymm2, ymm2, ymm3
+ vpmulld ymm3, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ ; 16: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4160]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4192]
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpermq ymm4, ymm4, 216
+ vpermq ymm5, ymm5, 216
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpermq ymm6, ymm6, 216
+ vpermq ymm7, ymm7, 216
+ ; 1: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4224]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4256]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4288]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4320]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vmovshdup ymm10, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vmovshdup ymm11, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 2: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4352]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4384]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4416]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4448]
+ vpshufd ymm8, ymm4, 216
+ vpshufd ymm9, ymm5, 216
+ vpunpckldq ymm4, ymm8, ymm9
+ vpunpckhdq ymm5, ymm8, ymm9
+ vpshufd ymm8, ymm6, 216
+ vpshufd ymm9, ymm7, 216
+ vpunpckldq ymm6, ymm8, ymm9
+ vpunpckhdq ymm7, ymm8, ymm9
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 4: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4480]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4512]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4544]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4576]
+ vpunpcklqdq ymm8, ymm4, ymm5
+ vpunpckhqdq ymm5, ymm4, ymm5
+ vpunpcklqdq ymm9, ymm6, ymm7
+ vpunpckhqdq ymm7, ymm6, ymm7
+ vpsubd ymm4, ymm8, ymm5
+ vpaddd ymm8, ymm8, ymm5
+ vpmulld ymm5, ymm4, ymm12
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm6, ymm6, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm4, ymm5
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm4, ymm9, ymm7
+ vpaddd ymm9, ymm9, ymm7
+ vpmulld ymm7, ymm4, ymm13
+ vmovshdup ymm6, ymm4
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm6, ymm6, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm4, ymm7
+ vpsubq ymm15, ymm6, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 8: 4/4
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rdx+4608]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rdx+4640]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rdx+4672]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rdx+4704]
+ vpsubd ymm8, ymm4, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm6, ymm7
+ vpaddd ymm6, ymm6, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 16: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4736]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4768]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 32: 4/4
+ vmovdqu ymm10, YMMWORD PTR [rdx+4800]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4832]
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vmovdqu YMMWORD PTR [rcx+768], ymm0
+ vmovdqu YMMWORD PTR [rcx+800], ymm1
+ vmovdqu YMMWORD PTR [rcx+832], ymm2
+ vmovdqu YMMWORD PTR [rcx+896], ymm4
+ vmovdqu YMMWORD PTR [rcx+928], ymm5
+ vmovdqu YMMWORD PTR [rcx+960], ymm6
+ vmovdqu ymm10, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4896]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm6, ymm3
+ vmovdqu ymm0, YMMWORD PTR [rcx+96]
+ vmovdqu ymm1, YMMWORD PTR [rcx+224]
+ vmovdqu ymm2, YMMWORD PTR [rcx+352]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vmovdqu ymm4, YMMWORD PTR [rcx+608]
+ vmovdqu ymm5, YMMWORD PTR [rcx+736]
+ ; 64: 4/4
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vmovdqu ymm10, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm12, YMMWORD PTR [rdx+5024]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 128: 4/4
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vmovdqu ymm11, YMMWORD PTR [rdx+5056]
+ vmovdqu ymm13, YMMWORD PTR [rdx+5088]
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vpmulld ymm8, ymm0, ymm13
+ vpmulld ymm10, ymm1, ymm13
+ vmovshdup ymm9, ymm0
+ vmovshdup ymm12, ymm1
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm1, ymm1, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm0, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm1, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm0, ymm8, ymm15, 170
+ vpblendd ymm1, ymm10, ymm9, 170
+ vpmulld ymm8, ymm2, ymm13
+ vpmulld ymm10, ymm3, ymm13
+ vmovshdup ymm9, ymm2
+ vmovshdup ymm12, ymm3
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm3, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm2, ymm8, ymm15, 170
+ vpblendd ymm3, ymm10, ymm9, 170
+ vpmulld ymm8, ymm4, ymm13
+ vpmulld ymm10, ymm5, ymm13
+ vmovshdup ymm9, ymm4
+ vmovshdup ymm12, ymm5
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm5, ymm5, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm5, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm4, ymm8, ymm15, 170
+ vpblendd ymm5, ymm10, ymm9, 170
+ vpmulld ymm8, ymm6, ymm13
+ vpmulld ymm10, ymm7, ymm13
+ vmovshdup ymm9, ymm6
+ vmovshdup ymm12, ymm7
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm7, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm6, ymm8, ymm15, 170
+ vpblendd ymm7, ymm10, ymm9, 170
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vmovdqu YMMWORD PTR [rcx+224], ymm1
+ vmovdqu YMMWORD PTR [rcx+352], ymm2
+ vmovdqu YMMWORD PTR [rcx+480], ymm3
+ vmovdqu YMMWORD PTR [rcx+608], ymm4
+ vmovdqu YMMWORD PTR [rcx+736], ymm5
+ vmovdqu YMMWORD PTR [rcx+864], ymm6
+ vmovdqu YMMWORD PTR [rcx+992], ymm7
+ vmovdqu ymm10, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4896]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm0, YMMWORD PTR [rcx+64]
+ vmovdqu ymm1, YMMWORD PTR [rcx+192]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+448]
+ vmovdqu ymm4, YMMWORD PTR [rcx+576]
+ vmovdqu ymm5, YMMWORD PTR [rcx+704]
+ vmovdqu ymm6, YMMWORD PTR [rcx+832]
+ vmovdqu ymm7, YMMWORD PTR [rcx+960]
+ ; 64: 3/4
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vmovdqu ymm10, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm12, YMMWORD PTR [rdx+5024]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 128: 3/4
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vmovdqu ymm11, YMMWORD PTR [rdx+5056]
+ vmovdqu ymm13, YMMWORD PTR [rdx+5088]
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vpmulld ymm8, ymm0, ymm13
+ vpmulld ymm10, ymm1, ymm13
+ vmovshdup ymm9, ymm0
+ vmovshdup ymm12, ymm1
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm1, ymm1, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm0, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm1, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm0, ymm8, ymm15, 170
+ vpblendd ymm1, ymm10, ymm9, 170
+ vpmulld ymm8, ymm2, ymm13
+ vpmulld ymm10, ymm3, ymm13
+ vmovshdup ymm9, ymm2
+ vmovshdup ymm12, ymm3
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm3, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm2, ymm8, ymm15, 170
+ vpblendd ymm3, ymm10, ymm9, 170
+ vpmulld ymm8, ymm4, ymm13
+ vpmulld ymm10, ymm5, ymm13
+ vmovshdup ymm9, ymm4
+ vmovshdup ymm12, ymm5
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm5, ymm5, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm5, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm4, ymm8, ymm15, 170
+ vpblendd ymm5, ymm10, ymm9, 170
+ vpmulld ymm8, ymm6, ymm13
+ vpmulld ymm10, ymm7, ymm13
+ vmovshdup ymm9, ymm6
+ vmovshdup ymm12, ymm7
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm7, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm6, ymm8, ymm15, 170
+ vpblendd ymm7, ymm10, ymm9, 170
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+192], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+448], ymm3
+ vmovdqu YMMWORD PTR [rcx+576], ymm4
+ vmovdqu YMMWORD PTR [rcx+704], ymm5
+ vmovdqu YMMWORD PTR [rcx+832], ymm6
+ vmovdqu YMMWORD PTR [rcx+960], ymm7
+ vmovdqu ymm10, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4896]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm0, YMMWORD PTR [rcx+32]
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vmovdqu ymm2, YMMWORD PTR [rcx+288]
+ vmovdqu ymm3, YMMWORD PTR [rcx+416]
+ vmovdqu ymm4, YMMWORD PTR [rcx+544]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+800]
+ vmovdqu ymm7, YMMWORD PTR [rcx+928]
+ ; 64: 2/4
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vmovdqu ymm10, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm12, YMMWORD PTR [rdx+5024]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 128: 2/4
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vmovdqu ymm11, YMMWORD PTR [rdx+5056]
+ vmovdqu ymm13, YMMWORD PTR [rdx+5088]
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vpmulld ymm8, ymm0, ymm13
+ vpmulld ymm10, ymm1, ymm13
+ vmovshdup ymm9, ymm0
+ vmovshdup ymm12, ymm1
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm1, ymm1, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm0, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm1, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm0, ymm8, ymm15, 170
+ vpblendd ymm1, ymm10, ymm9, 170
+ vpmulld ymm8, ymm2, ymm13
+ vpmulld ymm10, ymm3, ymm13
+ vmovshdup ymm9, ymm2
+ vmovshdup ymm12, ymm3
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm3, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm2, ymm8, ymm15, 170
+ vpblendd ymm3, ymm10, ymm9, 170
+ vpmulld ymm8, ymm4, ymm13
+ vpmulld ymm10, ymm5, ymm13
+ vmovshdup ymm9, ymm4
+ vmovshdup ymm12, ymm5
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm5, ymm5, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm5, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm4, ymm8, ymm15, 170
+ vpblendd ymm5, ymm10, ymm9, 170
+ vpmulld ymm8, ymm6, ymm13
+ vpmulld ymm10, ymm7, ymm13
+ vmovshdup ymm9, ymm6
+ vmovshdup ymm12, ymm7
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm7, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm6, ymm8, ymm15, 170
+ vpblendd ymm7, ymm10, ymm9, 170
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vmovdqu YMMWORD PTR [rcx+160], ymm1
+ vmovdqu YMMWORD PTR [rcx+288], ymm2
+ vmovdqu YMMWORD PTR [rcx+416], ymm3
+ vmovdqu YMMWORD PTR [rcx+544], ymm4
+ vmovdqu YMMWORD PTR [rcx+672], ymm5
+ vmovdqu YMMWORD PTR [rcx+800], ymm6
+ vmovdqu YMMWORD PTR [rcx+928], ymm7
+ vmovdqu ymm10, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm12, YMMWORD PTR [rdx+4896]
+ vmovdqu ymm11, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm13, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+128]
+ vmovdqu ymm2, YMMWORD PTR [rcx+256]
+ vmovdqu ymm3, YMMWORD PTR [rcx+384]
+ vmovdqu ymm4, YMMWORD PTR [rcx+512]
+ vmovdqu ymm5, YMMWORD PTR [rcx+640]
+ vmovdqu ymm6, YMMWORD PTR [rcx+768]
+ vmovdqu ymm7, YMMWORD PTR [rcx+896]
+ ; 64: 1/4
+ vpsubd ymm8, ymm0, ymm2
+ vpaddd ymm0, ymm0, ymm2
+ vpmulld ymm2, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm2
+ vpmuldq ymm2, ymm2, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm2, ymm8, ymm2
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm2, ymm2
+ vpblendd ymm2, ymm2, ymm15, 170
+ vpsubd ymm8, ymm1, ymm3
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm3, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm3
+ vpmuldq ymm3, ymm3, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm3, ymm8, ymm3
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm3, ymm3
+ vpblendd ymm3, ymm3, ymm15, 170
+ vmovdqu ymm10, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm12, YMMWORD PTR [rdx+5024]
+ vpsubd ymm8, ymm4, ymm6
+ vpaddd ymm4, ymm4, ymm6
+ vpmulld ymm6, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm5, ymm7
+ vpaddd ymm5, ymm5, ymm7
+ vpmulld ymm7, ymm8, ymm13
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ ; 128: 1/4
+ vpsubd ymm8, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm4
+ vpmulld ymm4, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm4
+ vpmuldq ymm4, ymm4, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm4, ymm8, ymm4
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm4, ymm4
+ vpblendd ymm4, ymm4, ymm15, 170
+ vpsubd ymm8, ymm1, ymm5
+ vpaddd ymm1, ymm1, ymm5
+ vpmulld ymm5, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm5
+ vpmuldq ymm5, ymm5, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm5, ymm8, ymm5
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm5, ymm5
+ vpblendd ymm5, ymm5, ymm15, 170
+ vmovdqu ymm11, YMMWORD PTR [rdx+5056]
+ vmovdqu ymm13, YMMWORD PTR [rdx+5088]
+ vpsubd ymm8, ymm2, ymm6
+ vpaddd ymm2, ymm2, ymm6
+ vpmulld ymm6, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm6
+ vpmuldq ymm6, ymm6, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm6, ymm8, ymm6
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm6, ymm6
+ vpblendd ymm6, ymm6, ymm15, 170
+ vpsubd ymm8, ymm3, ymm7
+ vpaddd ymm3, ymm3, ymm7
+ vpmulld ymm7, ymm8, ymm12
+ vmovshdup ymm9, ymm8
+ vpmuldq ymm8, ymm8, ymm10
+ vpmuldq ymm9, ymm9, ymm10
+ vmovshdup ymm15, ymm7
+ vpmuldq ymm7, ymm7, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm7, ymm8, ymm7
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm7, ymm7
+ vpblendd ymm7, ymm7, ymm15, 170
+ vpmulld ymm8, ymm0, ymm13
+ vpmulld ymm10, ymm1, ymm13
+ vmovshdup ymm9, ymm0
+ vmovshdup ymm12, ymm1
+ vpmuldq ymm0, ymm0, ymm11
+ vpmuldq ymm1, ymm1, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm0, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm1, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm0, ymm8, ymm15, 170
+ vpblendd ymm1, ymm10, ymm9, 170
+ vpmulld ymm8, ymm2, ymm13
+ vpmulld ymm10, ymm3, ymm13
+ vmovshdup ymm9, ymm2
+ vmovshdup ymm12, ymm3
+ vpmuldq ymm2, ymm2, ymm11
+ vpmuldq ymm3, ymm3, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm2, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm3, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm2, ymm8, ymm15, 170
+ vpblendd ymm3, ymm10, ymm9, 170
+ vpmulld ymm8, ymm4, ymm13
+ vpmulld ymm10, ymm5, ymm13
+ vmovshdup ymm9, ymm4
+ vmovshdup ymm12, ymm5
+ vpmuldq ymm4, ymm4, ymm11
+ vpmuldq ymm5, ymm5, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm4, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm5, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm4, ymm8, ymm15, 170
+ vpblendd ymm5, ymm10, ymm9, 170
+ vpmulld ymm8, ymm6, ymm13
+ vpmulld ymm10, ymm7, ymm13
+ vmovshdup ymm9, ymm6
+ vmovshdup ymm12, ymm7
+ vpmuldq ymm6, ymm6, ymm11
+ vpmuldq ymm7, ymm7, ymm11
+ vpmuldq ymm9, ymm9, ymm11
+ vpmuldq ymm12, ymm12, ymm11
+ vmovshdup ymm15, ymm8
+ vpmuldq ymm8, ymm8, ymm14
+ vpmuldq ymm15, ymm15, ymm14
+ vpsubq ymm8, ymm6, ymm8
+ vpsubq ymm15, ymm9, ymm15
+ vmovshdup ymm9, ymm10
+ vpmuldq ymm10, ymm10, ymm14
+ vpmuldq ymm9, ymm9, ymm14
+ vpsubq ymm10, ymm7, ymm10
+ vpsubq ymm9, ymm12, ymm9
+ vmovshdup ymm8, ymm8
+ vmovshdup ymm10, ymm10
+ vpblendd ymm6, ymm8, ymm15, 170
+ vpblendd ymm7, ymm10, ymm9, 170
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+256], ymm2
+ vmovdqu YMMWORD PTR [rcx+384], ymm3
+ vmovdqu YMMWORD PTR [rcx+512], ymm4
+ vmovdqu YMMWORD PTR [rcx+640], ymm5
+ vmovdqu YMMWORD PTR [rcx+768], ymm6
+ vmovdqu YMMWORD PTR [rcx+896], ymm7
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+wc_mldsa_invntt_full_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_mul_avx2 PROC ; element-wise product of 256 32-bit values: reads [rdx] and [r8], reduces each product, writes [rcx]
+ sub rsp, 64 ; reserve 4 x 16 bytes of stack for the xmm6-xmm9 spill below
+ vmovdqu OWORD PTR [rsp], xmm6 ; save xmm6-xmm9 (nonvolatile in the Microsoft x64 calling convention); ymm6-ymm9 are used below
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vpxor ymm8, ymm8, ymm8 ; NOTE(review): ymm8 is overwritten by the next instruction, so this zeroing looks redundant
+ vmovdqu ymm8, YMMWORD PTR mldsa_q ; ymm8 = mldsa_q constant (presumably the modulus q in every lane - confirm at its definition)
+ vmovdqu ymm9, YMMWORD PTR mldsa_qinv ; ymm9 = mldsa_qinv constant (presumably q^-1 mod 2^32 in every lane - confirm at its definition)
+ ; 32-bit elements 0..15 (two 32-byte vectors from each input)
+ vmovdqu ymm0, YMMWORD PTR [rdx] ; first input, elements 0..7
+ vmovdqu ymm2, YMMWORD PTR [rdx+32] ; first input, elements 8..15
+ vmovdqu ymm4, YMMWORD PTR [r8] ; second input, elements 0..7
+ vmovdqu ymm6, YMMWORD PTR [r8+32] ; second input, elements 8..15
+ vpshufd ymm1, ymm0, 245 ; imm 245 selects dwords [1,1,3,3] per 128-bit lane: odd elements copied into the even slots
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4 ; signed 32x32 -> 64-bit products of the even-indexed elements
+ vpmuldq ymm1, ymm1, ymm5 ; signed 32x32 -> 64-bit products of the odd-indexed elements
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2: reduce the 64-bit products in ymm0..ymm3 back to 32-bit values
+ vpmulld ymm4, ymm0, ymm9 ; low 32 bits of (product dword * mldsa_qinv)
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8 ; 64-bit product of that factor (low dword of each qword) with mldsa_q
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4 ; dword-wise subtract; NOTE(review): presumably the low dwords cancel to zero, leaving the result in the high dword - confirm
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32 ; move the high dword of each qword (even-element result) down into the low dword
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1 ; merge even results (low dwords) with odd results (high dwords of ymm1); assumes ymm1 low dwords are zero
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx], ymm0 ; store output elements 0..7
+ vmovdqu YMMWORD PTR [rcx+32], ymm2 ; store output elements 8..15
+ ; 32-bit elements 16..31 (byte offset 64): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+64]
+ vmovdqu ymm2, YMMWORD PTR [rdx+96]
+ vmovdqu ymm4, YMMWORD PTR [r8+64]
+ vmovdqu ymm6, YMMWORD PTR [r8+96]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ ; 32-bit elements 32..47 (byte offset 128): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+128]
+ vmovdqu ymm2, YMMWORD PTR [rdx+160]
+ vmovdqu ymm4, YMMWORD PTR [r8+128]
+ vmovdqu ymm6, YMMWORD PTR [r8+160]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+160], ymm2
+ ; 32-bit elements 48..63 (byte offset 192): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+192]
+ vmovdqu ymm2, YMMWORD PTR [rdx+224]
+ vmovdqu ymm4, YMMWORD PTR [r8+192]
+ vmovdqu ymm6, YMMWORD PTR [r8+224]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+192], ymm0
+ vmovdqu YMMWORD PTR [rcx+224], ymm2
+ ; 32-bit elements 64..79 (byte offset 256): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+256]
+ vmovdqu ymm2, YMMWORD PTR [rdx+288]
+ vmovdqu ymm4, YMMWORD PTR [r8+256]
+ vmovdqu ymm6, YMMWORD PTR [r8+288]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm2
+ ; 32-bit elements 80..95 (byte offset 320): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+320]
+ vmovdqu ymm2, YMMWORD PTR [rdx+352]
+ vmovdqu ymm4, YMMWORD PTR [r8+320]
+ vmovdqu ymm6, YMMWORD PTR [r8+352]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+320], ymm0
+ vmovdqu YMMWORD PTR [rcx+352], ymm2
+ ; 32-bit elements 96..111 (byte offset 384): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+384]
+ vmovdqu ymm2, YMMWORD PTR [rdx+416]
+ vmovdqu ymm4, YMMWORD PTR [r8+384]
+ vmovdqu ymm6, YMMWORD PTR [r8+416]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+384], ymm0
+ vmovdqu YMMWORD PTR [rcx+416], ymm2
+ ; 32-bit elements 112..127 (byte offset 448): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+448]
+ vmovdqu ymm2, YMMWORD PTR [rdx+480]
+ vmovdqu ymm4, YMMWORD PTR [r8+448]
+ vmovdqu ymm6, YMMWORD PTR [r8+480]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+448], ymm0
+ vmovdqu YMMWORD PTR [rcx+480], ymm2
+ ; 32-bit elements 128..143 (byte offset 512): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+512]
+ vmovdqu ymm2, YMMWORD PTR [rdx+544]
+ vmovdqu ymm4, YMMWORD PTR [r8+512]
+ vmovdqu ymm6, YMMWORD PTR [r8+544]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+512], ymm0
+ vmovdqu YMMWORD PTR [rcx+544], ymm2
+ ; 32-bit elements 144..159 (byte offset 576): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+576]
+ vmovdqu ymm2, YMMWORD PTR [rdx+608]
+ vmovdqu ymm4, YMMWORD PTR [r8+576]
+ vmovdqu ymm6, YMMWORD PTR [r8+608]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+576], ymm0
+ vmovdqu YMMWORD PTR [rcx+608], ymm2
+ ; 32-bit elements 160..175 (byte offset 640): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+640]
+ vmovdqu ymm2, YMMWORD PTR [rdx+672]
+ vmovdqu ymm4, YMMWORD PTR [r8+640]
+ vmovdqu ymm6, YMMWORD PTR [r8+672]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+640], ymm0
+ vmovdqu YMMWORD PTR [rcx+672], ymm2
+ ; 32-bit elements 176..191 (byte offset 704): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+704]
+ vmovdqu ymm2, YMMWORD PTR [rdx+736]
+ vmovdqu ymm4, YMMWORD PTR [r8+704]
+ vmovdqu ymm6, YMMWORD PTR [r8+736]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+704], ymm0
+ vmovdqu YMMWORD PTR [rcx+736], ymm2
+ ; 32-bit elements 192..207 (byte offset 768): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+768]
+ vmovdqu ymm2, YMMWORD PTR [rdx+800]
+ vmovdqu ymm4, YMMWORD PTR [r8+768]
+ vmovdqu ymm6, YMMWORD PTR [r8+800]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+768], ymm0
+ vmovdqu YMMWORD PTR [rcx+800], ymm2
+ ; 32-bit elements 208..223 (byte offset 832): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+832]
+ vmovdqu ymm2, YMMWORD PTR [rdx+864]
+ vmovdqu ymm4, YMMWORD PTR [r8+832]
+ vmovdqu ymm6, YMMWORD PTR [r8+864]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+832], ymm0
+ vmovdqu YMMWORD PTR [rcx+864], ymm2
+ ; 32-bit elements 224..239 (byte offset 896): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+896]
+ vmovdqu ymm2, YMMWORD PTR [rdx+928]
+ vmovdqu ymm4, YMMWORD PTR [r8+896]
+ vmovdqu ymm6, YMMWORD PTR [r8+928]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+896], ymm0
+ vmovdqu YMMWORD PTR [rcx+928], ymm2
+ ; 32-bit elements 240..255 (byte offset 960): same multiply/reduce sequence as 0..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+960]
+ vmovdqu ymm2, YMMWORD PTR [rdx+992]
+ vmovdqu ymm4, YMMWORD PTR [r8+960]
+ vmovdqu ymm6, YMMWORD PTR [r8+992]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpmuldq ymm0, ymm0, ymm4
+ vpmuldq ymm1, ymm1, ymm5
+ vpmuldq ymm2, ymm2, ymm6
+ vpmuldq ymm3, ymm3, ymm7
+ ; Mont Reduce 2
+ vpmulld ymm4, ymm0, ymm9
+ vpmulld ymm5, ymm1, ymm9
+ vpmulld ymm6, ymm2, ymm9
+ vpmulld ymm7, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm8
+ vpmuldq ymm5, ymm5, ymm8
+ vpmuldq ymm6, ymm6, ymm8
+ vpmuldq ymm7, ymm7, ymm8
+ vpsubd ymm0, ymm0, ymm4
+ vpsubd ymm1, ymm1, ymm5
+ vpsubd ymm2, ymm2, ymm6
+ vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpsrlq ymm2, ymm2, 32
+ vpor ymm0, ymm0, ymm1
+ vpor ymm2, ymm2, ymm3
+ vmovdqu YMMWORD PTR [rcx+960], ymm0
+ vmovdqu YMMWORD PTR [rcx+992], ymm2
+ vzeroupper ; zero the upper 128 bits of every ymm register before returning
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore xmm6-xmm9 from the slots written in the prologue
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ add rsp, 64 ; release the 64-byte spill area
+ ret
+wc_mldsa_mul_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_mul_vec_4_avx2 PROC
+ sub rsp, 128
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vpxor ymm12, ymm12, ymm12
+ vmovdqu ymm12, YMMWORD PTR mldsa_q
+ vmovdqu ymm13, YMMWORD PTR mldsa_qinv
+ ; 0..7
+ vmovdqu ymm0, YMMWORD PTR [rdx]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1024]
+ vmovdqu ymm6, YMMWORD PTR [r8]
+ vmovdqu ymm8, YMMWORD PTR [r8+1024]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2048]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3072]
+ vmovdqu ymm8, YMMWORD PTR [r8+2048]
+ vmovdqu ymm10, YMMWORD PTR [r8+3072]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ ; 8..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+32]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1056]
+ vmovdqu ymm6, YMMWORD PTR [r8+32]
+ vmovdqu ymm8, YMMWORD PTR [r8+1056]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2080]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3104]
+ vmovdqu ymm8, YMMWORD PTR [r8+2080]
+ vmovdqu ymm10, YMMWORD PTR [r8+3104]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ ; 16..23
+ vmovdqu ymm0, YMMWORD PTR [rdx+64]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1088]
+ vmovdqu ymm6, YMMWORD PTR [r8+64]
+ vmovdqu ymm8, YMMWORD PTR [r8+1088]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2112]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3136]
+ vmovdqu ymm8, YMMWORD PTR [r8+2112]
+ vmovdqu ymm10, YMMWORD PTR [r8+3136]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ ; 24..31
+ vmovdqu ymm0, YMMWORD PTR [rdx+96]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1120]
+ vmovdqu ymm6, YMMWORD PTR [r8+96]
+ vmovdqu ymm8, YMMWORD PTR [r8+1120]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2144]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3168]
+ vmovdqu ymm8, YMMWORD PTR [r8+2144]
+ vmovdqu ymm10, YMMWORD PTR [r8+3168]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ ; 32..39
+ vmovdqu ymm0, YMMWORD PTR [rdx+128]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1152]
+ vmovdqu ymm6, YMMWORD PTR [r8+128]
+ vmovdqu ymm8, YMMWORD PTR [r8+1152]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2176]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3200]
+ vmovdqu ymm8, YMMWORD PTR [r8+2176]
+ vmovdqu ymm10, YMMWORD PTR [r8+3200]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ ; 40..47
+ vmovdqu ymm0, YMMWORD PTR [rdx+160]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1184]
+ vmovdqu ymm6, YMMWORD PTR [r8+160]
+ vmovdqu ymm8, YMMWORD PTR [r8+1184]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2208]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3232]
+ vmovdqu ymm8, YMMWORD PTR [r8+2208]
+ vmovdqu ymm10, YMMWORD PTR [r8+3232]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+160], ymm0
+ ; 48..55
+ vmovdqu ymm0, YMMWORD PTR [rdx+192]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1216]
+ vmovdqu ymm6, YMMWORD PTR [r8+192]
+ vmovdqu ymm8, YMMWORD PTR [r8+1216]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2240]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3264]
+ vmovdqu ymm8, YMMWORD PTR [r8+2240]
+ vmovdqu ymm10, YMMWORD PTR [r8+3264]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+192], ymm0
+ ; 56..63
+ vmovdqu ymm0, YMMWORD PTR [rdx+224]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1248]
+ vmovdqu ymm6, YMMWORD PTR [r8+224]
+ vmovdqu ymm8, YMMWORD PTR [r8+1248]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2272]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3296]
+ vmovdqu ymm8, YMMWORD PTR [r8+2272]
+ vmovdqu ymm10, YMMWORD PTR [r8+3296]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+224], ymm0
+ ; 64..71
+ vmovdqu ymm0, YMMWORD PTR [rdx+256]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1280]
+ vmovdqu ymm6, YMMWORD PTR [r8+256]
+ vmovdqu ymm8, YMMWORD PTR [r8+1280]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2304]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3328]
+ vmovdqu ymm8, YMMWORD PTR [r8+2304]
+ vmovdqu ymm10, YMMWORD PTR [r8+3328]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ ; 72..79
+ vmovdqu ymm0, YMMWORD PTR [rdx+288]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1312]
+ vmovdqu ymm6, YMMWORD PTR [r8+288]
+ vmovdqu ymm8, YMMWORD PTR [r8+1312]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2336]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3360]
+ vmovdqu ymm8, YMMWORD PTR [r8+2336]
+ vmovdqu ymm10, YMMWORD PTR [r8+3360]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+288], ymm0
+ ; 80..87
+ vmovdqu ymm0, YMMWORD PTR [rdx+320]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1344]
+ vmovdqu ymm6, YMMWORD PTR [r8+320]
+ vmovdqu ymm8, YMMWORD PTR [r8+1344]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2368]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3392]
+ vmovdqu ymm8, YMMWORD PTR [r8+2368]
+ vmovdqu ymm10, YMMWORD PTR [r8+3392]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm0
+ ; 88..95
+ vmovdqu ymm0, YMMWORD PTR [rdx+352]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1376]
+ vmovdqu ymm6, YMMWORD PTR [r8+352]
+ vmovdqu ymm8, YMMWORD PTR [r8+1376]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2400]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3424]
+ vmovdqu ymm8, YMMWORD PTR [r8+2400]
+ vmovdqu ymm10, YMMWORD PTR [r8+3424]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+352], ymm0
+ ; 96..103
+ vmovdqu ymm0, YMMWORD PTR [rdx+384]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1408]
+ vmovdqu ymm6, YMMWORD PTR [r8+384]
+ vmovdqu ymm8, YMMWORD PTR [r8+1408]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2432]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3456]
+ vmovdqu ymm8, YMMWORD PTR [r8+2432]
+ vmovdqu ymm10, YMMWORD PTR [r8+3456]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+384], ymm0
+ ; 104..111
+ vmovdqu ymm0, YMMWORD PTR [rdx+416]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1440]
+ vmovdqu ymm6, YMMWORD PTR [r8+416]
+ vmovdqu ymm8, YMMWORD PTR [r8+1440]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2464]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3488]
+ vmovdqu ymm8, YMMWORD PTR [r8+2464]
+ vmovdqu ymm10, YMMWORD PTR [r8+3488]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+416], ymm0
+ ; 112..119
+ vmovdqu ymm0, YMMWORD PTR [rdx+448]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1472]
+ vmovdqu ymm6, YMMWORD PTR [r8+448]
+ vmovdqu ymm8, YMMWORD PTR [r8+1472]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2496]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3520]
+ vmovdqu ymm8, YMMWORD PTR [r8+2496]
+ vmovdqu ymm10, YMMWORD PTR [r8+3520]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+448], ymm0
+ ; 120..127
+ vmovdqu ymm0, YMMWORD PTR [rdx+480]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1504]
+ vmovdqu ymm6, YMMWORD PTR [r8+480]
+ vmovdqu ymm8, YMMWORD PTR [r8+1504]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2528]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3552]
+ vmovdqu ymm8, YMMWORD PTR [r8+2528]
+ vmovdqu ymm10, YMMWORD PTR [r8+3552]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+480], ymm0
+ ; 128..135
+ vmovdqu ymm0, YMMWORD PTR [rdx+512]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1536]
+ vmovdqu ymm6, YMMWORD PTR [r8+512]
+ vmovdqu ymm8, YMMWORD PTR [r8+1536]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2560]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3584]
+ vmovdqu ymm8, YMMWORD PTR [r8+2560]
+ vmovdqu ymm10, YMMWORD PTR [r8+3584]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+512], ymm0
+ ; 136..143
+ vmovdqu ymm0, YMMWORD PTR [rdx+544]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1568]
+ vmovdqu ymm6, YMMWORD PTR [r8+544]
+ vmovdqu ymm8, YMMWORD PTR [r8+1568]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2592]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3616]
+ vmovdqu ymm8, YMMWORD PTR [r8+2592]
+ vmovdqu ymm10, YMMWORD PTR [r8+3616]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+544], ymm0
+ ; 144..151
+ vmovdqu ymm0, YMMWORD PTR [rdx+576]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1600]
+ vmovdqu ymm6, YMMWORD PTR [r8+576]
+ vmovdqu ymm8, YMMWORD PTR [r8+1600]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2624]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3648]
+ vmovdqu ymm8, YMMWORD PTR [r8+2624]
+ vmovdqu ymm10, YMMWORD PTR [r8+3648]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+576], ymm0
+ ; 152..159
+ vmovdqu ymm0, YMMWORD PTR [rdx+608]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1632]
+ vmovdqu ymm6, YMMWORD PTR [r8+608]
+ vmovdqu ymm8, YMMWORD PTR [r8+1632]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2656]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3680]
+ vmovdqu ymm8, YMMWORD PTR [r8+2656]
+ vmovdqu ymm10, YMMWORD PTR [r8+3680]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+608], ymm0
+ ; 160..167
+ vmovdqu ymm0, YMMWORD PTR [rdx+640]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1664]
+ vmovdqu ymm6, YMMWORD PTR [r8+640]
+ vmovdqu ymm8, YMMWORD PTR [r8+1664]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2688]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3712]
+ vmovdqu ymm8, YMMWORD PTR [r8+2688]
+ vmovdqu ymm10, YMMWORD PTR [r8+3712]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+640], ymm0
+ ; 168..175
+ vmovdqu ymm0, YMMWORD PTR [rdx+672]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1696]
+ vmovdqu ymm6, YMMWORD PTR [r8+672]
+ vmovdqu ymm8, YMMWORD PTR [r8+1696]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2720]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3744]
+ vmovdqu ymm8, YMMWORD PTR [r8+2720]
+ vmovdqu ymm10, YMMWORD PTR [r8+3744]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+672], ymm0
+ ; 176..183
+ vmovdqu ymm0, YMMWORD PTR [rdx+704]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1728]
+ vmovdqu ymm6, YMMWORD PTR [r8+704]
+ vmovdqu ymm8, YMMWORD PTR [r8+1728]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2752]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3776]
+ vmovdqu ymm8, YMMWORD PTR [r8+2752]
+ vmovdqu ymm10, YMMWORD PTR [r8+3776]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+704], ymm0
+ ; 184..191
+ vmovdqu ymm0, YMMWORD PTR [rdx+736]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1760]
+ vmovdqu ymm6, YMMWORD PTR [r8+736]
+ vmovdqu ymm8, YMMWORD PTR [r8+1760]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2784]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3808]
+ vmovdqu ymm8, YMMWORD PTR [r8+2784]
+ vmovdqu ymm10, YMMWORD PTR [r8+3808]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+736], ymm0
+ ; 192..199
+ vmovdqu ymm0, YMMWORD PTR [rdx+768]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1792]
+ vmovdqu ymm6, YMMWORD PTR [r8+768]
+ vmovdqu ymm8, YMMWORD PTR [r8+1792]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2816]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3840]
+ vmovdqu ymm8, YMMWORD PTR [r8+2816]
+ vmovdqu ymm10, YMMWORD PTR [r8+3840]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+768], ymm0
+ ; 200..207
+ vmovdqu ymm0, YMMWORD PTR [rdx+800]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1824]
+ vmovdqu ymm6, YMMWORD PTR [r8+800]
+ vmovdqu ymm8, YMMWORD PTR [r8+1824]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2848]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3872]
+ vmovdqu ymm8, YMMWORD PTR [r8+2848]
+ vmovdqu ymm10, YMMWORD PTR [r8+3872]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+800], ymm0
+ ; 208..215
+ vmovdqu ymm0, YMMWORD PTR [rdx+832]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1856]
+ vmovdqu ymm6, YMMWORD PTR [r8+832]
+ vmovdqu ymm8, YMMWORD PTR [r8+1856]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2880]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3904]
+ vmovdqu ymm8, YMMWORD PTR [r8+2880]
+ vmovdqu ymm10, YMMWORD PTR [r8+3904]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+832], ymm0
+ ; 216..223
+ vmovdqu ymm0, YMMWORD PTR [rdx+864]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1888]
+ vmovdqu ymm6, YMMWORD PTR [r8+864]
+ vmovdqu ymm8, YMMWORD PTR [r8+1888]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2912]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3936]
+ vmovdqu ymm8, YMMWORD PTR [r8+2912]
+ vmovdqu ymm10, YMMWORD PTR [r8+3936]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+864], ymm0
+ ; 224..231
+ vmovdqu ymm0, YMMWORD PTR [rdx+896]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1920]
+ vmovdqu ymm6, YMMWORD PTR [r8+896]
+ vmovdqu ymm8, YMMWORD PTR [r8+1920]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2944]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3968]
+ vmovdqu ymm8, YMMWORD PTR [r8+2944]
+ vmovdqu ymm10, YMMWORD PTR [r8+3968]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+896], ymm0
+ ; 232..239
+ vmovdqu ymm0, YMMWORD PTR [rdx+928]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1952]
+ vmovdqu ymm6, YMMWORD PTR [r8+928]
+ vmovdqu ymm8, YMMWORD PTR [r8+1952]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+2976]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4000]
+ vmovdqu ymm8, YMMWORD PTR [r8+2976]
+ vmovdqu ymm10, YMMWORD PTR [r8+4000]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+928], ymm0
+ ; 240..247
+ vmovdqu ymm0, YMMWORD PTR [rdx+960]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1984]
+ vmovdqu ymm6, YMMWORD PTR [r8+960]
+ vmovdqu ymm8, YMMWORD PTR [r8+1984]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+3008]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4032]
+ vmovdqu ymm8, YMMWORD PTR [r8+3008]
+ vmovdqu ymm10, YMMWORD PTR [r8+4032]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+960], ymm0
+ ; 248..255
+ vmovdqu ymm0, YMMWORD PTR [rdx+992]
+ vmovdqu ymm2, YMMWORD PTR [rdx+2016]
+ vmovdqu ymm6, YMMWORD PTR [r8+992]
+ vmovdqu ymm8, YMMWORD PTR [r8+2016]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vmovdqu ymm2, YMMWORD PTR [rdx+3040]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4064]
+ vmovdqu ymm8, YMMWORD PTR [r8+3040]
+ vmovdqu ymm10, YMMWORD PTR [r8+4064]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+992], ymm0
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ add rsp, 128
+ ret
+wc_mldsa_mul_vec_4_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_mul_vec_5_avx2 PROC
+ sub rsp, 128
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vpxor ymm12, ymm12, ymm12
+ vmovdqu ymm12, YMMWORD PTR mldsa_q
+ vmovdqu ymm13, YMMWORD PTR mldsa_qinv
+ ; 0..7
+ vmovdqu ymm0, YMMWORD PTR [rdx]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1024]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2048]
+ vmovdqu ymm6, YMMWORD PTR [r8]
+ vmovdqu ymm8, YMMWORD PTR [r8+1024]
+ vmovdqu ymm10, YMMWORD PTR [r8+2048]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3072]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4096]
+ vmovdqu ymm8, YMMWORD PTR [r8+3072]
+ vmovdqu ymm10, YMMWORD PTR [r8+4096]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ ; 8..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+32]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1056]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2080]
+ vmovdqu ymm6, YMMWORD PTR [r8+32]
+ vmovdqu ymm8, YMMWORD PTR [r8+1056]
+ vmovdqu ymm10, YMMWORD PTR [r8+2080]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3104]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4128]
+ vmovdqu ymm8, YMMWORD PTR [r8+3104]
+ vmovdqu ymm10, YMMWORD PTR [r8+4128]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ ; 16..23
+ vmovdqu ymm0, YMMWORD PTR [rdx+64]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1088]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2112]
+ vmovdqu ymm6, YMMWORD PTR [r8+64]
+ vmovdqu ymm8, YMMWORD PTR [r8+1088]
+ vmovdqu ymm10, YMMWORD PTR [r8+2112]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3136]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4160]
+ vmovdqu ymm8, YMMWORD PTR [r8+3136]
+ vmovdqu ymm10, YMMWORD PTR [r8+4160]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ ; 24..31
+ vmovdqu ymm0, YMMWORD PTR [rdx+96]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1120]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2144]
+ vmovdqu ymm6, YMMWORD PTR [r8+96]
+ vmovdqu ymm8, YMMWORD PTR [r8+1120]
+ vmovdqu ymm10, YMMWORD PTR [r8+2144]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3168]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4192]
+ vmovdqu ymm8, YMMWORD PTR [r8+3168]
+ vmovdqu ymm10, YMMWORD PTR [r8+4192]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ ; 32..39
+ vmovdqu ymm0, YMMWORD PTR [rdx+128]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1152]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2176]
+ vmovdqu ymm6, YMMWORD PTR [r8+128]
+ vmovdqu ymm8, YMMWORD PTR [r8+1152]
+ vmovdqu ymm10, YMMWORD PTR [r8+2176]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3200]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4224]
+ vmovdqu ymm8, YMMWORD PTR [r8+3200]
+ vmovdqu ymm10, YMMWORD PTR [r8+4224]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ ; 40..47
+ vmovdqu ymm0, YMMWORD PTR [rdx+160]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1184]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2208]
+ vmovdqu ymm6, YMMWORD PTR [r8+160]
+ vmovdqu ymm8, YMMWORD PTR [r8+1184]
+ vmovdqu ymm10, YMMWORD PTR [r8+2208]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3232]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4256]
+ vmovdqu ymm8, YMMWORD PTR [r8+3232]
+ vmovdqu ymm10, YMMWORD PTR [r8+4256]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+160], ymm0
+ ; 48..55
+ vmovdqu ymm0, YMMWORD PTR [rdx+192]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1216]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2240]
+ vmovdqu ymm6, YMMWORD PTR [r8+192]
+ vmovdqu ymm8, YMMWORD PTR [r8+1216]
+ vmovdqu ymm10, YMMWORD PTR [r8+2240]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3264]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4288]
+ vmovdqu ymm8, YMMWORD PTR [r8+3264]
+ vmovdqu ymm10, YMMWORD PTR [r8+4288]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+192], ymm0
+ ; 56..63
+ vmovdqu ymm0, YMMWORD PTR [rdx+224]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1248]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2272]
+ vmovdqu ymm6, YMMWORD PTR [r8+224]
+ vmovdqu ymm8, YMMWORD PTR [r8+1248]
+ vmovdqu ymm10, YMMWORD PTR [r8+2272]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3296]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4320]
+ vmovdqu ymm8, YMMWORD PTR [r8+3296]
+ vmovdqu ymm10, YMMWORD PTR [r8+4320]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+224], ymm0
+ ; 64..71
+ vmovdqu ymm0, YMMWORD PTR [rdx+256]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1280]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2304]
+ vmovdqu ymm6, YMMWORD PTR [r8+256]
+ vmovdqu ymm8, YMMWORD PTR [r8+1280]
+ vmovdqu ymm10, YMMWORD PTR [r8+2304]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3328]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4352]
+ vmovdqu ymm8, YMMWORD PTR [r8+3328]
+ vmovdqu ymm10, YMMWORD PTR [r8+4352]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ ; 72..79
+ vmovdqu ymm0, YMMWORD PTR [rdx+288]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1312]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2336]
+ vmovdqu ymm6, YMMWORD PTR [r8+288]
+ vmovdqu ymm8, YMMWORD PTR [r8+1312]
+ vmovdqu ymm10, YMMWORD PTR [r8+2336]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3360]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4384]
+ vmovdqu ymm8, YMMWORD PTR [r8+3360]
+ vmovdqu ymm10, YMMWORD PTR [r8+4384]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+288], ymm0
+ ; 80..87
+ vmovdqu ymm0, YMMWORD PTR [rdx+320]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1344]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2368]
+ vmovdqu ymm6, YMMWORD PTR [r8+320]
+ vmovdqu ymm8, YMMWORD PTR [r8+1344]
+ vmovdqu ymm10, YMMWORD PTR [r8+2368]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3392]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4416]
+ vmovdqu ymm8, YMMWORD PTR [r8+3392]
+ vmovdqu ymm10, YMMWORD PTR [r8+4416]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm0
+ ; 88..95
+ vmovdqu ymm0, YMMWORD PTR [rdx+352]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1376]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2400]
+ vmovdqu ymm6, YMMWORD PTR [r8+352]
+ vmovdqu ymm8, YMMWORD PTR [r8+1376]
+ vmovdqu ymm10, YMMWORD PTR [r8+2400]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3424]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4448]
+ vmovdqu ymm8, YMMWORD PTR [r8+3424]
+ vmovdqu ymm10, YMMWORD PTR [r8+4448]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+352], ymm0
+ ; 96..103
+ vmovdqu ymm0, YMMWORD PTR [rdx+384]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1408]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2432]
+ vmovdqu ymm6, YMMWORD PTR [r8+384]
+ vmovdqu ymm8, YMMWORD PTR [r8+1408]
+ vmovdqu ymm10, YMMWORD PTR [r8+2432]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3456]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4480]
+ vmovdqu ymm8, YMMWORD PTR [r8+3456]
+ vmovdqu ymm10, YMMWORD PTR [r8+4480]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+384], ymm0
+ ; 104..111
+ vmovdqu ymm0, YMMWORD PTR [rdx+416]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1440]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2464]
+ vmovdqu ymm6, YMMWORD PTR [r8+416]
+ vmovdqu ymm8, YMMWORD PTR [r8+1440]
+ vmovdqu ymm10, YMMWORD PTR [r8+2464]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3488]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4512]
+ vmovdqu ymm8, YMMWORD PTR [r8+3488]
+ vmovdqu ymm10, YMMWORD PTR [r8+4512]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+416], ymm0
+ ; 112..119
+ vmovdqu ymm0, YMMWORD PTR [rdx+448]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1472]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2496]
+ vmovdqu ymm6, YMMWORD PTR [r8+448]
+ vmovdqu ymm8, YMMWORD PTR [r8+1472]
+ vmovdqu ymm10, YMMWORD PTR [r8+2496]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3520]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4544]
+ vmovdqu ymm8, YMMWORD PTR [r8+3520]
+ vmovdqu ymm10, YMMWORD PTR [r8+4544]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+448], ymm0
+ ; 120..127
+ vmovdqu ymm0, YMMWORD PTR [rdx+480]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1504]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2528]
+ vmovdqu ymm6, YMMWORD PTR [r8+480]
+ vmovdqu ymm8, YMMWORD PTR [r8+1504]
+ vmovdqu ymm10, YMMWORD PTR [r8+2528]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3552]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4576]
+ vmovdqu ymm8, YMMWORD PTR [r8+3552]
+ vmovdqu ymm10, YMMWORD PTR [r8+4576]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+480], ymm0
+ ; 128..135
+ vmovdqu ymm0, YMMWORD PTR [rdx+512]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1536]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2560]
+ vmovdqu ymm6, YMMWORD PTR [r8+512]
+ vmovdqu ymm8, YMMWORD PTR [r8+1536]
+ vmovdqu ymm10, YMMWORD PTR [r8+2560]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3584]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4608]
+ vmovdqu ymm8, YMMWORD PTR [r8+3584]
+ vmovdqu ymm10, YMMWORD PTR [r8+4608]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+512], ymm0
+ ; 136..143
+ vmovdqu ymm0, YMMWORD PTR [rdx+544]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1568]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2592]
+ vmovdqu ymm6, YMMWORD PTR [r8+544]
+ vmovdqu ymm8, YMMWORD PTR [r8+1568]
+ vmovdqu ymm10, YMMWORD PTR [r8+2592]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3616]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4640]
+ vmovdqu ymm8, YMMWORD PTR [r8+3616]
+ vmovdqu ymm10, YMMWORD PTR [r8+4640]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+544], ymm0
+ ; 144..151
+ vmovdqu ymm0, YMMWORD PTR [rdx+576]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1600]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2624]
+ vmovdqu ymm6, YMMWORD PTR [r8+576]
+ vmovdqu ymm8, YMMWORD PTR [r8+1600]
+ vmovdqu ymm10, YMMWORD PTR [r8+2624]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3648]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4672]
+ vmovdqu ymm8, YMMWORD PTR [r8+3648]
+ vmovdqu ymm10, YMMWORD PTR [r8+4672]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+576], ymm0
+ ; 152..159
+ vmovdqu ymm0, YMMWORD PTR [rdx+608]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1632]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2656]
+ vmovdqu ymm6, YMMWORD PTR [r8+608]
+ vmovdqu ymm8, YMMWORD PTR [r8+1632]
+ vmovdqu ymm10, YMMWORD PTR [r8+2656]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3680]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4704]
+ vmovdqu ymm8, YMMWORD PTR [r8+3680]
+ vmovdqu ymm10, YMMWORD PTR [r8+4704]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+608], ymm0
+ ; 160..167
+ vmovdqu ymm0, YMMWORD PTR [rdx+640]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1664]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2688]
+ vmovdqu ymm6, YMMWORD PTR [r8+640]
+ vmovdqu ymm8, YMMWORD PTR [r8+1664]
+ vmovdqu ymm10, YMMWORD PTR [r8+2688]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3712]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4736]
+ vmovdqu ymm8, YMMWORD PTR [r8+3712]
+ vmovdqu ymm10, YMMWORD PTR [r8+4736]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+640], ymm0
+ ; 168..175
+ vmovdqu ymm0, YMMWORD PTR [rdx+672]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1696]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2720]
+ vmovdqu ymm6, YMMWORD PTR [r8+672]
+ vmovdqu ymm8, YMMWORD PTR [r8+1696]
+ vmovdqu ymm10, YMMWORD PTR [r8+2720]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3744]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4768]
+ vmovdqu ymm8, YMMWORD PTR [r8+3744]
+ vmovdqu ymm10, YMMWORD PTR [r8+4768]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+672], ymm0
+ ; 176..183
+ vmovdqu ymm0, YMMWORD PTR [rdx+704]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1728]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2752]
+ vmovdqu ymm6, YMMWORD PTR [r8+704]
+ vmovdqu ymm8, YMMWORD PTR [r8+1728]
+ vmovdqu ymm10, YMMWORD PTR [r8+2752]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3776]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4800]
+ vmovdqu ymm8, YMMWORD PTR [r8+3776]
+ vmovdqu ymm10, YMMWORD PTR [r8+4800]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+704], ymm0
+ ; 184..191
+ vmovdqu ymm0, YMMWORD PTR [rdx+736]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1760]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2784]
+ vmovdqu ymm6, YMMWORD PTR [r8+736]
+ vmovdqu ymm8, YMMWORD PTR [r8+1760]
+ vmovdqu ymm10, YMMWORD PTR [r8+2784]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3808]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4832]
+ vmovdqu ymm8, YMMWORD PTR [r8+3808]
+ vmovdqu ymm10, YMMWORD PTR [r8+4832]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+736], ymm0
+ ; 192..199
+ vmovdqu ymm0, YMMWORD PTR [rdx+768]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1792]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2816]
+ vmovdqu ymm6, YMMWORD PTR [r8+768]
+ vmovdqu ymm8, YMMWORD PTR [r8+1792]
+ vmovdqu ymm10, YMMWORD PTR [r8+2816]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3840]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm8, YMMWORD PTR [r8+3840]
+ vmovdqu ymm10, YMMWORD PTR [r8+4864]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+768], ymm0
+ ; 200..207
+ vmovdqu ymm0, YMMWORD PTR [rdx+800]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1824]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2848]
+ vmovdqu ymm6, YMMWORD PTR [r8+800]
+ vmovdqu ymm8, YMMWORD PTR [r8+1824]
+ vmovdqu ymm10, YMMWORD PTR [r8+2848]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3872]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4896]
+ vmovdqu ymm8, YMMWORD PTR [r8+3872]
+ vmovdqu ymm10, YMMWORD PTR [r8+4896]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+800], ymm0
+ ; 208..215
+ vmovdqu ymm0, YMMWORD PTR [rdx+832]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1856]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2880]
+ vmovdqu ymm6, YMMWORD PTR [r8+832]
+ vmovdqu ymm8, YMMWORD PTR [r8+1856]
+ vmovdqu ymm10, YMMWORD PTR [r8+2880]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3904]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm8, YMMWORD PTR [r8+3904]
+ vmovdqu ymm10, YMMWORD PTR [r8+4928]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+832], ymm0
+ ; 216..223
+ vmovdqu ymm0, YMMWORD PTR [rdx+864]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1888]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2912]
+ vmovdqu ymm6, YMMWORD PTR [r8+864]
+ vmovdqu ymm8, YMMWORD PTR [r8+1888]
+ vmovdqu ymm10, YMMWORD PTR [r8+2912]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3936]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm8, YMMWORD PTR [r8+3936]
+ vmovdqu ymm10, YMMWORD PTR [r8+4960]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+864], ymm0
+ ; 224..231
+ vmovdqu ymm0, YMMWORD PTR [rdx+896]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1920]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2944]
+ vmovdqu ymm6, YMMWORD PTR [r8+896]
+ vmovdqu ymm8, YMMWORD PTR [r8+1920]
+ vmovdqu ymm10, YMMWORD PTR [r8+2944]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3968]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm8, YMMWORD PTR [r8+3968]
+ vmovdqu ymm10, YMMWORD PTR [r8+4992]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+896], ymm0
+ ; 232..239
+ vmovdqu ymm0, YMMWORD PTR [rdx+928]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1952]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2976]
+ vmovdqu ymm6, YMMWORD PTR [r8+928]
+ vmovdqu ymm8, YMMWORD PTR [r8+1952]
+ vmovdqu ymm10, YMMWORD PTR [r8+2976]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+4000]
+ vmovdqu ymm4, YMMWORD PTR [rdx+5024]
+ vmovdqu ymm8, YMMWORD PTR [r8+4000]
+ vmovdqu ymm10, YMMWORD PTR [r8+5024]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+928], ymm0
+ ; 240..247
+ vmovdqu ymm0, YMMWORD PTR [rdx+960]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1984]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3008]
+ vmovdqu ymm6, YMMWORD PTR [r8+960]
+ vmovdqu ymm8, YMMWORD PTR [r8+1984]
+ vmovdqu ymm10, YMMWORD PTR [r8+3008]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+4032]
+ vmovdqu ymm4, YMMWORD PTR [rdx+5056]
+ vmovdqu ymm8, YMMWORD PTR [r8+4032]
+ vmovdqu ymm10, YMMWORD PTR [r8+5056]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+960], ymm0
+ ; 248..255
+ vmovdqu ymm0, YMMWORD PTR [rdx+992]
+ vmovdqu ymm2, YMMWORD PTR [rdx+2016]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3040]
+ vmovdqu ymm6, YMMWORD PTR [r8+992]
+ vmovdqu ymm8, YMMWORD PTR [r8+2016]
+ vmovdqu ymm10, YMMWORD PTR [r8+3040]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+4064]
+ vmovdqu ymm4, YMMWORD PTR [rdx+5088]
+ vmovdqu ymm8, YMMWORD PTR [r8+4064]
+ vmovdqu ymm10, YMMWORD PTR [r8+5088]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+992], ymm0
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ add rsp, 128
+ ret
+wc_mldsa_mul_vec_5_avx2 ENDP
+_TEXT ENDS ; close the code segment that held wc_mldsa_mul_vec_5_avx2
+_TEXT SEGMENT READONLY PARA ; reopen _TEXT (read-only, paragraph = 16-byte aligned) for the next PROC
+wc_mldsa_mul_vec_7_avx2 PROC
+ sub rsp, 128
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vpxor ymm12, ymm12, ymm12
+ vmovdqu ymm12, YMMWORD PTR mldsa_q
+ vmovdqu ymm13, YMMWORD PTR mldsa_qinv
+ ; 0..7
+ vmovdqu ymm0, YMMWORD PTR [rdx]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1024]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2048]
+ vmovdqu ymm6, YMMWORD PTR [r8]
+ vmovdqu ymm8, YMMWORD PTR [r8+1024]
+ vmovdqu ymm10, YMMWORD PTR [r8+2048]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3072]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4096]
+ vmovdqu ymm8, YMMWORD PTR [r8+3072]
+ vmovdqu ymm10, YMMWORD PTR [r8+4096]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5120]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6144]
+ vmovdqu ymm8, YMMWORD PTR [r8+5120]
+ vmovdqu ymm10, YMMWORD PTR [r8+6144]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ ; 8..15
+ vmovdqu ymm0, YMMWORD PTR [rdx+32]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1056]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2080]
+ vmovdqu ymm6, YMMWORD PTR [r8+32]
+ vmovdqu ymm8, YMMWORD PTR [r8+1056]
+ vmovdqu ymm10, YMMWORD PTR [r8+2080]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3104]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4128]
+ vmovdqu ymm8, YMMWORD PTR [r8+3104]
+ vmovdqu ymm10, YMMWORD PTR [r8+4128]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5152]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6176]
+ vmovdqu ymm8, YMMWORD PTR [r8+5152]
+ vmovdqu ymm10, YMMWORD PTR [r8+6176]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ ; 16..23
+ vmovdqu ymm0, YMMWORD PTR [rdx+64]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1088]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2112]
+ vmovdqu ymm6, YMMWORD PTR [r8+64]
+ vmovdqu ymm8, YMMWORD PTR [r8+1088]
+ vmovdqu ymm10, YMMWORD PTR [r8+2112]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3136]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4160]
+ vmovdqu ymm8, YMMWORD PTR [r8+3136]
+ vmovdqu ymm10, YMMWORD PTR [r8+4160]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5184]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6208]
+ vmovdqu ymm8, YMMWORD PTR [r8+5184]
+ vmovdqu ymm10, YMMWORD PTR [r8+6208]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ ; 24..31
+ vmovdqu ymm0, YMMWORD PTR [rdx+96]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1120]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2144]
+ vmovdqu ymm6, YMMWORD PTR [r8+96]
+ vmovdqu ymm8, YMMWORD PTR [r8+1120]
+ vmovdqu ymm10, YMMWORD PTR [r8+2144]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3168]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4192]
+ vmovdqu ymm8, YMMWORD PTR [r8+3168]
+ vmovdqu ymm10, YMMWORD PTR [r8+4192]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5216]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6240]
+ vmovdqu ymm8, YMMWORD PTR [r8+5216]
+ vmovdqu ymm10, YMMWORD PTR [r8+6240]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ ; 32..39
+ vmovdqu ymm0, YMMWORD PTR [rdx+128]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1152]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2176]
+ vmovdqu ymm6, YMMWORD PTR [r8+128]
+ vmovdqu ymm8, YMMWORD PTR [r8+1152]
+ vmovdqu ymm10, YMMWORD PTR [r8+2176]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3200]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4224]
+ vmovdqu ymm8, YMMWORD PTR [r8+3200]
+ vmovdqu ymm10, YMMWORD PTR [r8+4224]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5248]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6272]
+ vmovdqu ymm8, YMMWORD PTR [r8+5248]
+ vmovdqu ymm10, YMMWORD PTR [r8+6272]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ ; 40..47
+ vmovdqu ymm0, YMMWORD PTR [rdx+160]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1184]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2208]
+ vmovdqu ymm6, YMMWORD PTR [r8+160]
+ vmovdqu ymm8, YMMWORD PTR [r8+1184]
+ vmovdqu ymm10, YMMWORD PTR [r8+2208]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3232]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4256]
+ vmovdqu ymm8, YMMWORD PTR [r8+3232]
+ vmovdqu ymm10, YMMWORD PTR [r8+4256]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5280]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6304]
+ vmovdqu ymm8, YMMWORD PTR [r8+5280]
+ vmovdqu ymm10, YMMWORD PTR [r8+6304]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+160], ymm0
+ ; 48..55
+ vmovdqu ymm0, YMMWORD PTR [rdx+192]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1216]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2240]
+ vmovdqu ymm6, YMMWORD PTR [r8+192]
+ vmovdqu ymm8, YMMWORD PTR [r8+1216]
+ vmovdqu ymm10, YMMWORD PTR [r8+2240]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3264]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4288]
+ vmovdqu ymm8, YMMWORD PTR [r8+3264]
+ vmovdqu ymm10, YMMWORD PTR [r8+4288]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5312]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6336]
+ vmovdqu ymm8, YMMWORD PTR [r8+5312]
+ vmovdqu ymm10, YMMWORD PTR [r8+6336]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+192], ymm0
+ ; 56..63
+ vmovdqu ymm0, YMMWORD PTR [rdx+224]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1248]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2272]
+ vmovdqu ymm6, YMMWORD PTR [r8+224]
+ vmovdqu ymm8, YMMWORD PTR [r8+1248]
+ vmovdqu ymm10, YMMWORD PTR [r8+2272]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3296]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4320]
+ vmovdqu ymm8, YMMWORD PTR [r8+3296]
+ vmovdqu ymm10, YMMWORD PTR [r8+4320]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5344]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6368]
+ vmovdqu ymm8, YMMWORD PTR [r8+5344]
+ vmovdqu ymm10, YMMWORD PTR [r8+6368]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+224], ymm0
+ ; 64..71
+ vmovdqu ymm0, YMMWORD PTR [rdx+256]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1280]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2304]
+ vmovdqu ymm6, YMMWORD PTR [r8+256]
+ vmovdqu ymm8, YMMWORD PTR [r8+1280]
+ vmovdqu ymm10, YMMWORD PTR [r8+2304]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3328]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4352]
+ vmovdqu ymm8, YMMWORD PTR [r8+3328]
+ vmovdqu ymm10, YMMWORD PTR [r8+4352]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5376]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6400]
+ vmovdqu ymm8, YMMWORD PTR [r8+5376]
+ vmovdqu ymm10, YMMWORD PTR [r8+6400]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ ; 72..79
+ vmovdqu ymm0, YMMWORD PTR [rdx+288]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1312]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2336]
+ vmovdqu ymm6, YMMWORD PTR [r8+288]
+ vmovdqu ymm8, YMMWORD PTR [r8+1312]
+ vmovdqu ymm10, YMMWORD PTR [r8+2336]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3360]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4384]
+ vmovdqu ymm8, YMMWORD PTR [r8+3360]
+ vmovdqu ymm10, YMMWORD PTR [r8+4384]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5408]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6432]
+ vmovdqu ymm8, YMMWORD PTR [r8+5408]
+ vmovdqu ymm10, YMMWORD PTR [r8+6432]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+288], ymm0
+ ; 80..87
+ vmovdqu ymm0, YMMWORD PTR [rdx+320]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1344]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2368]
+ vmovdqu ymm6, YMMWORD PTR [r8+320]
+ vmovdqu ymm8, YMMWORD PTR [r8+1344]
+ vmovdqu ymm10, YMMWORD PTR [r8+2368]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3392]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4416]
+ vmovdqu ymm8, YMMWORD PTR [r8+3392]
+ vmovdqu ymm10, YMMWORD PTR [r8+4416]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5440]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6464]
+ vmovdqu ymm8, YMMWORD PTR [r8+5440]
+ vmovdqu ymm10, YMMWORD PTR [r8+6464]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm0
+ ; 88..95
+ vmovdqu ymm0, YMMWORD PTR [rdx+352]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1376]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2400]
+ vmovdqu ymm6, YMMWORD PTR [r8+352]
+ vmovdqu ymm8, YMMWORD PTR [r8+1376]
+ vmovdqu ymm10, YMMWORD PTR [r8+2400]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3424]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4448]
+ vmovdqu ymm8, YMMWORD PTR [r8+3424]
+ vmovdqu ymm10, YMMWORD PTR [r8+4448]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5472]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6496]
+ vmovdqu ymm8, YMMWORD PTR [r8+5472]
+ vmovdqu ymm10, YMMWORD PTR [r8+6496]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+352], ymm0
+ ; 96..103
+ vmovdqu ymm0, YMMWORD PTR [rdx+384]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1408]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2432]
+ vmovdqu ymm6, YMMWORD PTR [r8+384]
+ vmovdqu ymm8, YMMWORD PTR [r8+1408]
+ vmovdqu ymm10, YMMWORD PTR [r8+2432]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3456]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4480]
+ vmovdqu ymm8, YMMWORD PTR [r8+3456]
+ vmovdqu ymm10, YMMWORD PTR [r8+4480]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5504]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6528]
+ vmovdqu ymm8, YMMWORD PTR [r8+5504]
+ vmovdqu ymm10, YMMWORD PTR [r8+6528]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+384], ymm0
+ ; 104..111
+ vmovdqu ymm0, YMMWORD PTR [rdx+416]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1440]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2464]
+ vmovdqu ymm6, YMMWORD PTR [r8+416]
+ vmovdqu ymm8, YMMWORD PTR [r8+1440]
+ vmovdqu ymm10, YMMWORD PTR [r8+2464]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3488]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4512]
+ vmovdqu ymm8, YMMWORD PTR [r8+3488]
+ vmovdqu ymm10, YMMWORD PTR [r8+4512]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5536]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6560]
+ vmovdqu ymm8, YMMWORD PTR [r8+5536]
+ vmovdqu ymm10, YMMWORD PTR [r8+6560]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+416], ymm0
+ ; 112..119
+ vmovdqu ymm0, YMMWORD PTR [rdx+448]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1472]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2496]
+ vmovdqu ymm6, YMMWORD PTR [r8+448]
+ vmovdqu ymm8, YMMWORD PTR [r8+1472]
+ vmovdqu ymm10, YMMWORD PTR [r8+2496]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3520]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4544]
+ vmovdqu ymm8, YMMWORD PTR [r8+3520]
+ vmovdqu ymm10, YMMWORD PTR [r8+4544]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5568]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6592]
+ vmovdqu ymm8, YMMWORD PTR [r8+5568]
+ vmovdqu ymm10, YMMWORD PTR [r8+6592]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+448], ymm0
+ ; 120..127
+ vmovdqu ymm0, YMMWORD PTR [rdx+480]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1504]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2528]
+ vmovdqu ymm6, YMMWORD PTR [r8+480]
+ vmovdqu ymm8, YMMWORD PTR [r8+1504]
+ vmovdqu ymm10, YMMWORD PTR [r8+2528]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3552]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4576]
+ vmovdqu ymm8, YMMWORD PTR [r8+3552]
+ vmovdqu ymm10, YMMWORD PTR [r8+4576]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5600]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6624]
+ vmovdqu ymm8, YMMWORD PTR [r8+5600]
+ vmovdqu ymm10, YMMWORD PTR [r8+6624]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+480], ymm0
+ ; 128..135
+ vmovdqu ymm0, YMMWORD PTR [rdx+512]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1536]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2560]
+ vmovdqu ymm6, YMMWORD PTR [r8+512]
+ vmovdqu ymm8, YMMWORD PTR [r8+1536]
+ vmovdqu ymm10, YMMWORD PTR [r8+2560]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3584]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4608]
+ vmovdqu ymm8, YMMWORD PTR [r8+3584]
+ vmovdqu ymm10, YMMWORD PTR [r8+4608]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5632]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6656]
+ vmovdqu ymm8, YMMWORD PTR [r8+5632]
+ vmovdqu ymm10, YMMWORD PTR [r8+6656]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+512], ymm0
+ ; 136..143
+ vmovdqu ymm0, YMMWORD PTR [rdx+544]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1568]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2592]
+ vmovdqu ymm6, YMMWORD PTR [r8+544]
+ vmovdqu ymm8, YMMWORD PTR [r8+1568]
+ vmovdqu ymm10, YMMWORD PTR [r8+2592]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3616]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4640]
+ vmovdqu ymm8, YMMWORD PTR [r8+3616]
+ vmovdqu ymm10, YMMWORD PTR [r8+4640]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5664]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6688]
+ vmovdqu ymm8, YMMWORD PTR [r8+5664]
+ vmovdqu ymm10, YMMWORD PTR [r8+6688]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+544], ymm0
+ ; 144..151
+ vmovdqu ymm0, YMMWORD PTR [rdx+576]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1600]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2624]
+ vmovdqu ymm6, YMMWORD PTR [r8+576]
+ vmovdqu ymm8, YMMWORD PTR [r8+1600]
+ vmovdqu ymm10, YMMWORD PTR [r8+2624]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3648]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4672]
+ vmovdqu ymm8, YMMWORD PTR [r8+3648]
+ vmovdqu ymm10, YMMWORD PTR [r8+4672]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5696]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6720]
+ vmovdqu ymm8, YMMWORD PTR [r8+5696]
+ vmovdqu ymm10, YMMWORD PTR [r8+6720]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+576], ymm0
+ ; 152..159
+ vmovdqu ymm0, YMMWORD PTR [rdx+608]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1632]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2656]
+ vmovdqu ymm6, YMMWORD PTR [r8+608]
+ vmovdqu ymm8, YMMWORD PTR [r8+1632]
+ vmovdqu ymm10, YMMWORD PTR [r8+2656]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3680]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4704]
+ vmovdqu ymm8, YMMWORD PTR [r8+3680]
+ vmovdqu ymm10, YMMWORD PTR [r8+4704]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5728]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6752]
+ vmovdqu ymm8, YMMWORD PTR [r8+5728]
+ vmovdqu ymm10, YMMWORD PTR [r8+6752]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+608], ymm0
+ ; 160..167
+ vmovdqu ymm0, YMMWORD PTR [rdx+640]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1664]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2688]
+ vmovdqu ymm6, YMMWORD PTR [r8+640]
+ vmovdqu ymm8, YMMWORD PTR [r8+1664]
+ vmovdqu ymm10, YMMWORD PTR [r8+2688]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3712]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4736]
+ vmovdqu ymm8, YMMWORD PTR [r8+3712]
+ vmovdqu ymm10, YMMWORD PTR [r8+4736]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5760]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6784]
+ vmovdqu ymm8, YMMWORD PTR [r8+5760]
+ vmovdqu ymm10, YMMWORD PTR [r8+6784]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+640], ymm0
+ ; 168..175
+ vmovdqu ymm0, YMMWORD PTR [rdx+672]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1696]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2720]
+ vmovdqu ymm6, YMMWORD PTR [r8+672]
+ vmovdqu ymm8, YMMWORD PTR [r8+1696]
+ vmovdqu ymm10, YMMWORD PTR [r8+2720]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3744]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4768]
+ vmovdqu ymm8, YMMWORD PTR [r8+3744]
+ vmovdqu ymm10, YMMWORD PTR [r8+4768]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5792]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6816]
+ vmovdqu ymm8, YMMWORD PTR [r8+5792]
+ vmovdqu ymm10, YMMWORD PTR [r8+6816]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+672], ymm0
+ ; 176..183
+ vmovdqu ymm0, YMMWORD PTR [rdx+704]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1728]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2752]
+ vmovdqu ymm6, YMMWORD PTR [r8+704]
+ vmovdqu ymm8, YMMWORD PTR [r8+1728]
+ vmovdqu ymm10, YMMWORD PTR [r8+2752]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3776]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4800]
+ vmovdqu ymm8, YMMWORD PTR [r8+3776]
+ vmovdqu ymm10, YMMWORD PTR [r8+4800]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5824]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6848]
+ vmovdqu ymm8, YMMWORD PTR [r8+5824]
+ vmovdqu ymm10, YMMWORD PTR [r8+6848]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+704], ymm0
+ ; 184..191
+ vmovdqu ymm0, YMMWORD PTR [rdx+736]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1760]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2784]
+ vmovdqu ymm6, YMMWORD PTR [r8+736]
+ vmovdqu ymm8, YMMWORD PTR [r8+1760]
+ vmovdqu ymm10, YMMWORD PTR [r8+2784]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3808]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4832]
+ vmovdqu ymm8, YMMWORD PTR [r8+3808]
+ vmovdqu ymm10, YMMWORD PTR [r8+4832]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5856]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6880]
+ vmovdqu ymm8, YMMWORD PTR [r8+5856]
+ vmovdqu ymm10, YMMWORD PTR [r8+6880]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+736], ymm0
+ ; 192..199
+ vmovdqu ymm0, YMMWORD PTR [rdx+768]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1792]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2816]
+ vmovdqu ymm6, YMMWORD PTR [r8+768]
+ vmovdqu ymm8, YMMWORD PTR [r8+1792]
+ vmovdqu ymm10, YMMWORD PTR [r8+2816]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3840]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4864]
+ vmovdqu ymm8, YMMWORD PTR [r8+3840]
+ vmovdqu ymm10, YMMWORD PTR [r8+4864]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5888]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6912]
+ vmovdqu ymm8, YMMWORD PTR [r8+5888]
+ vmovdqu ymm10, YMMWORD PTR [r8+6912]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+768], ymm0
+ ; 200..207
+ vmovdqu ymm0, YMMWORD PTR [rdx+800]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1824]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2848]
+ vmovdqu ymm6, YMMWORD PTR [r8+800]
+ vmovdqu ymm8, YMMWORD PTR [r8+1824]
+ vmovdqu ymm10, YMMWORD PTR [r8+2848]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3872]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4896]
+ vmovdqu ymm8, YMMWORD PTR [r8+3872]
+ vmovdqu ymm10, YMMWORD PTR [r8+4896]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5920]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6944]
+ vmovdqu ymm8, YMMWORD PTR [r8+5920]
+ vmovdqu ymm10, YMMWORD PTR [r8+6944]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+800], ymm0
+ ; 208..215
+ vmovdqu ymm0, YMMWORD PTR [rdx+832]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1856]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2880]
+ vmovdqu ymm6, YMMWORD PTR [r8+832]
+ vmovdqu ymm8, YMMWORD PTR [r8+1856]
+ vmovdqu ymm10, YMMWORD PTR [r8+2880]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3904]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4928]
+ vmovdqu ymm8, YMMWORD PTR [r8+3904]
+ vmovdqu ymm10, YMMWORD PTR [r8+4928]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5952]
+ vmovdqu ymm4, YMMWORD PTR [rdx+6976]
+ vmovdqu ymm8, YMMWORD PTR [r8+5952]
+ vmovdqu ymm10, YMMWORD PTR [r8+6976]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+832], ymm0
+ ; 216..223
+ vmovdqu ymm0, YMMWORD PTR [rdx+864]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1888]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2912]
+ vmovdqu ymm6, YMMWORD PTR [r8+864]
+ vmovdqu ymm8, YMMWORD PTR [r8+1888]
+ vmovdqu ymm10, YMMWORD PTR [r8+2912]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3936]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4960]
+ vmovdqu ymm8, YMMWORD PTR [r8+3936]
+ vmovdqu ymm10, YMMWORD PTR [r8+4960]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+5984]
+ vmovdqu ymm4, YMMWORD PTR [rdx+7008]
+ vmovdqu ymm8, YMMWORD PTR [r8+5984]
+ vmovdqu ymm10, YMMWORD PTR [r8+7008]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+864], ymm0
+ ; 224..231
+ vmovdqu ymm0, YMMWORD PTR [rdx+896]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1920]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2944]
+ vmovdqu ymm6, YMMWORD PTR [r8+896]
+ vmovdqu ymm8, YMMWORD PTR [r8+1920]
+ vmovdqu ymm10, YMMWORD PTR [r8+2944]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+3968]
+ vmovdqu ymm4, YMMWORD PTR [rdx+4992]
+ vmovdqu ymm8, YMMWORD PTR [r8+3968]
+ vmovdqu ymm10, YMMWORD PTR [r8+4992]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+6016]
+ vmovdqu ymm4, YMMWORD PTR [rdx+7040]
+ vmovdqu ymm8, YMMWORD PTR [r8+6016]
+ vmovdqu ymm10, YMMWORD PTR [r8+7040]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+896], ymm0
+ ; 232..239
+ vmovdqu ymm0, YMMWORD PTR [rdx+928]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1952]
+ vmovdqu ymm4, YMMWORD PTR [rdx+2976]
+ vmovdqu ymm6, YMMWORD PTR [r8+928]
+ vmovdqu ymm8, YMMWORD PTR [r8+1952]
+ vmovdqu ymm10, YMMWORD PTR [r8+2976]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+4000]
+ vmovdqu ymm4, YMMWORD PTR [rdx+5024]
+ vmovdqu ymm8, YMMWORD PTR [r8+4000]
+ vmovdqu ymm10, YMMWORD PTR [r8+5024]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+6048]
+ vmovdqu ymm4, YMMWORD PTR [rdx+7072]
+ vmovdqu ymm8, YMMWORD PTR [r8+6048]
+ vmovdqu ymm10, YMMWORD PTR [r8+7072]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+928], ymm0
+ ; 240..247
+ vmovdqu ymm0, YMMWORD PTR [rdx+960]
+ vmovdqu ymm2, YMMWORD PTR [rdx+1984]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3008]
+ vmovdqu ymm6, YMMWORD PTR [r8+960]
+ vmovdqu ymm8, YMMWORD PTR [r8+1984]
+ vmovdqu ymm10, YMMWORD PTR [r8+3008]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+4032]
+ vmovdqu ymm4, YMMWORD PTR [rdx+5056]
+ vmovdqu ymm8, YMMWORD PTR [r8+4032]
+ vmovdqu ymm10, YMMWORD PTR [r8+5056]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+6080]
+ vmovdqu ymm4, YMMWORD PTR [rdx+7104]
+ vmovdqu ymm8, YMMWORD PTR [r8+6080]
+ vmovdqu ymm10, YMMWORD PTR [r8+7104]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+960], ymm0
+ ; 248..255
+ vmovdqu ymm0, YMMWORD PTR [rdx+992]
+ vmovdqu ymm2, YMMWORD PTR [rdx+2016]
+ vmovdqu ymm4, YMMWORD PTR [rdx+3040]
+ vmovdqu ymm6, YMMWORD PTR [r8+992]
+ vmovdqu ymm8, YMMWORD PTR [r8+2016]
+ vmovdqu ymm10, YMMWORD PTR [r8+3040]
+ vpshufd ymm1, ymm0, 245
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm7, ymm6, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm0, ymm0, ymm6
+ vpmuldq ymm1, ymm1, ymm7
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+4064]
+ vmovdqu ymm4, YMMWORD PTR [rdx+5088]
+ vmovdqu ymm8, YMMWORD PTR [r8+4064]
+ vmovdqu ymm10, YMMWORD PTR [r8+5088]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ vmovdqu ymm2, YMMWORD PTR [rdx+6112]
+ vmovdqu ymm4, YMMWORD PTR [rdx+7136]
+ vmovdqu ymm8, YMMWORD PTR [r8+6112]
+ vmovdqu ymm10, YMMWORD PTR [r8+7136]
+ vpshufd ymm3, ymm2, 245
+ vpshufd ymm5, ymm4, 245
+ vpshufd ymm9, ymm8, 245
+ vpshufd ymm11, ymm10, 245
+ vpmuldq ymm2, ymm2, ymm8
+ vpmuldq ymm3, ymm3, ymm9
+ vpmuldq ymm4, ymm4, ymm10
+ vpmuldq ymm5, ymm5, ymm11
+ vpaddq ymm0, ymm0, ymm2
+ vpaddq ymm1, ymm1, ymm3
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm5
+ ; Mont Reduce 2
+ vpmulld ymm6, ymm0, ymm13
+ vpmulld ymm7, ymm1, ymm13
+ vpmuldq ymm6, ymm6, ymm12
+ vpmuldq ymm7, ymm7, ymm12
+ vpsubd ymm0, ymm0, ymm6
+ vpsubd ymm1, ymm1, ymm7
+ vpsrlq ymm0, ymm0, 32
+ vpor ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+992], ymm0
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ add rsp, 128
+ ret
+wc_mldsa_mul_vec_7_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_rej_idx QWORD 0000000000000000h, 0000000000000000h ; 256 entries x 8 DWORDs (32 bytes each), one entry per 8-bit mask value; this is entry 00h
+ QWORD 0000000000000000h, 0000000000000000h ; entry m holds the bit positions of the set bits of m, ascending, remaining DWORDs are 0
+ QWORD 0000000000000001h, 0000000000000000h ; entry 01h: position 0
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000000000000h ; entry 02h: position 1
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000000000002h, 0000000000000000h ; entry 03h: positions 0,1
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000000000000h ; entry 04h: position 2
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000000000002h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000000000003h, 0000000000000000h ; entry 08h: position 3
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000000000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000000000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000000000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000000000004h, 0000000000000000h ; entry 10h: position 4
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000000h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000001h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000000000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000002h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000000000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000000000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000002h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000003h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000000000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000000000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000000000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000400000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000400000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000000000004h, 0000000000000000h
+ QWORD 0000000000000005h, 0000000000000000h ; entry 20h: position 5
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000000h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000001h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000000000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000002h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000000000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000000000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000002h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000003h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000000000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000000000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000000000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000500000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000500000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000000000005h, 0000000000000000h
+ QWORD 0000000500000004h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000000h, 0000000000000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000001h, 0000000000000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000002h, 0000000000000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000500000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000500000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000002h
+ QWORD 0000000000000005h, 0000000000000000h
+ QWORD 0000000400000003h, 0000000000000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000500000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000500000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000003h
+ QWORD 0000000000000005h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000500000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000400000003h
+ QWORD 0000000000000005h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000400000003h
+ QWORD 0000000000000005h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000500000004h, 0000000000000000h
+ QWORD 0000000000000006h, 0000000000000000h ; entry 40h: position 6
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000600000000h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000600000001h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000600000002h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000600000002h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000600000003h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000600000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000600000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000600000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000600000004h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000000h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000001h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000600000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000002h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000600000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000600000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000002h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000400000003h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000600000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000600000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000003h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000600000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000400000003h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000400000003h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000600000004h, 0000000000000000h
+ QWORD 0000000600000005h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000000h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000001h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000600000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000002h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000600000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000600000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000002h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000500000003h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000600000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000600000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000003h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000600000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000500000003h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000500000003h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000600000005h, 0000000000000000h
+ QWORD 0000000500000004h, 0000000000000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000000h, 0000000600000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000001h, 0000000600000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000004h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000400000002h, 0000000600000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000500000004h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000500000004h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000002h
+ QWORD 0000000600000005h, 0000000000000000h
+ QWORD 0000000400000003h, 0000000600000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000500000004h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000500000004h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000003h
+ QWORD 0000000600000005h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000500000004h
+ QWORD 0000000000000006h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000400000003h
+ QWORD 0000000600000005h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000400000003h
+ QWORD 0000000600000005h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000500000004h, 0000000000000006h
+ QWORD 0000000000000007h, 0000000000000000h ; entry 80h: position 7
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000700000000h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000700000001h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000700000002h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000700000002h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000700000003h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000700000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000700000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000700000003h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000700000004h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000000h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000001h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000700000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000002h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000700000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000700000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000002h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000400000003h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000700000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000700000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000003h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000700000004h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000400000003h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000400000003h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000700000004h, 0000000000000000h
+ QWORD 0000000700000005h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000000h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000001h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000700000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000002h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000700000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000700000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000002h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000500000003h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000700000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000700000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000003h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000700000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000500000003h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000500000003h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000700000005h, 0000000000000000h
+ QWORD 0000000500000004h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000000h, 0000000700000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000001h, 0000000700000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000400000002h, 0000000700000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000500000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000500000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000002h
+ QWORD 0000000700000005h, 0000000000000000h
+ QWORD 0000000400000003h, 0000000700000005h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000500000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000500000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000003h
+ QWORD 0000000700000005h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000500000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000400000003h
+ QWORD 0000000700000005h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000400000003h
+ QWORD 0000000700000005h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000500000004h, 0000000000000007h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000600000000h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000600000001h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000600000002h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000600000002h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000600000003h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000600000003h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000600000003h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000600000003h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000600000004h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000000h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000001h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000600000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000400000002h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000600000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000600000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000002h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000400000003h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000600000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000600000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000003h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000600000004h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000400000003h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000400000003h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000600000004h, 0000000000000007h
+ QWORD 0000000600000005h, 0000000000000007h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000000h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000500000001h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000600000005h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000500000002h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000600000005h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000600000005h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000002h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000500000003h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000600000005h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000600000005h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000003h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000300000002h, 0000000600000005h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000500000003h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000500000003h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000300000002h
+ QWORD 0000000600000005h, 0000000000000007h
+ QWORD 0000000500000004h, 0000000700000006h
+ QWORD 0000000000000000h, 0000000000000000h
+ QWORD 0000000400000000h, 0000000600000005h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000400000001h, 0000000600000005h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000500000004h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000400000002h, 0000000600000005h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000500000004h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000200000001h, 0000000500000004h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000002h
+ QWORD 0000000600000005h, 0000000000000007h
+ QWORD 0000000400000003h, 0000000600000005h
+ QWORD 0000000000000007h, 0000000000000000h
+ QWORD 0000000300000000h, 0000000500000004h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000300000001h, 0000000500000004h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000100000000h, 0000000400000003h
+ QWORD 0000000600000005h, 0000000000000007h
+ QWORD 0000000300000002h, 0000000500000004h
+ QWORD 0000000700000006h, 0000000000000000h
+ QWORD 0000000200000000h, 0000000400000003h
+ QWORD 0000000600000005h, 0000000000000007h
+ QWORD 0000000200000001h, 0000000400000003h
+ QWORD 0000000600000005h, 0000000000000007h
+ QWORD 0000000100000000h, 0000000300000002h ; entry ffh: positions 0..7 in order (identity permutation)
+ QWORD 0000000500000004h, 0000000700000006h
+ptr_L_mldsa_rej_idx QWORD L_mldsa_rej_idx ; table address; wc_mldsa_rej_uniform_n_avx2 loads it into r11, adds (mask byte shl 5) and feeds the entry to vpermd
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_rej_q DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h ; 8 x 007fe001h (8380417, the ML-DSA modulus q per FIPS 204)
+ DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h ; wc_mldsa_rej_uniform_n_avx2 loads this into ymm6 as the first vpcmpgtd source (lane set when q > candidate)
+ptr_L_mldsa_rej_q QWORD L_mldsa_rej_q ; QWORD holding the address of L_mldsa_rej_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_rej_mask QWORD 007fffff007fffffh, 007fffff007fffffh ; 8 x 007fffffh: AND mask keeping the low 23 bits of each DWORD
+ QWORD 007fffff007fffffh, 007fffff007fffffh ; wc_mldsa_rej_uniform_n_avx2 loads this into ymm7 and applies it with vpand after the vpshufb step
+ptr_L_mldsa_rej_mask QWORD L_mldsa_rej_mask ; QWORD holding the address of L_mldsa_rej_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_rej_shuffle QWORD 0005040300020100h, 000b0a0900080706h ; vpshufb control, low 128-bit lane: DWORD k takes source bytes 3k, 3k+1, 3k+2, then byte 0
+ QWORD 0009080700060504h, 000f0e0d000c0b0ah ; high lane: same pattern starting at source byte 4; the 4th byte of each DWORD is cleared afterwards by the 007fffffh vpand
+ptr_L_mldsa_rej_shuffle QWORD L_mldsa_rej_shuffle ; QWORD holding the address of L_mldsa_rej_shuffle
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_rej_ones QWORD 0101010101010101h, 0101010101010101h ; 32 bytes, each 01h
+ QWORD 0101010101010101h, 0101010101010101h ; NOTE(review): loaded into ymm9 at the top of wc_mldsa_rej_uniform_n_avx2; where ymm9 is consumed is not confirmed
+ptr_L_mldsa_rej_ones QWORD L_mldsa_rej_ones ; QWORD holding the address of L_mldsa_rej_ones
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_rej_uniform_n_avx2 PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ mov r10, r9
+ mov r9, r8
+ mov r8, rdx
+ mov rdx, rcx
+ sub rsp, 64
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ mov eax, r8d
+ vmovdqu ymm6, YMMWORD PTR L_mldsa_rej_q
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_rej_mask
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_rej_shuffle
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_rej_ones
+ mov r11, QWORD PTR [ptr_L_mldsa_rej_idx]
+ vpermq ymm0, [r9], 148
+ vpermq ymm1, [r9+24], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+48], 148
+ vpermq ymm1, [r9+72], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+96], 148
+ vpermq ymm1, [r9+120], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+144], 148
+ vpermq ymm1, [r9+168], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+192], 148
+ vpermq ymm1, [r9+216], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+240], 148
+ vpermq ymm1, [r9+264], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+288], 148
+ vpermq ymm1, [r9+312], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+336], 148
+ vpermq ymm1, [r9+360], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+384], 148
+ vpermq ymm1, [r9+408], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+432], 148
+ vpermq ymm1, [r9+456], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+480], 148
+ vpermq ymm1, [r9+504], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+528], 148
+ vpermq ymm1, [r9+552], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+576], 148
+ vpermq ymm1, [r9+600], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+624], 148
+ vpermq ymm1, [r9+648], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ vpermq ymm0, [r9+672], 148
+ vpermq ymm1, [r9+696], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ add r9, 720
+ sub r10d, 720
+L_mldsa_rej_uniform_n_avx2_start_256:
+ vpermq ymm0, [r9], 148
+ vpermq ymm1, [r9+24], 148
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpand ymm0, ymm0, ymm7
+ vpand ymm1, ymm1, ymm7
+ vpcmpgtd ymm2, ymm6, ymm0
+ vpcmpgtd ymm3, ymm6, ymm1
+ vpackssdw ymm2, ymm2, ymm3
+ vpermq ymm2, ymm2, 216
+ vpacksswb ymm2, ymm2, ymm2
+ vpermq ymm2, ymm2, 216
+ vpmovmskb rbx, ymm2
+ movzx r12d, bl
+ movzx ecx, bh
+ shl r12d, 5
+ shl ecx, 5
+ vmovdqu ymm2, YMMWORD PTR [r11+r12]
+ vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+ vpermd ymm0, ymm2, ymm0
+ vpermd ymm1, ymm3, ymm1
+ popcnt r12d, r12d
+ popcnt ecx, ecx
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ lea rdx, QWORD PTR [rdx+4*r12]
+ sub r8d, r12d
+ vmovdqu YMMWORD PTR [rdx], ymm1
+ lea rdx, QWORD PTR [rdx+4*rcx]
+ sub r8d, ecx
+ add r9, 48
+ sub r10d, 48
+ cmp r10d, 48
+ jl L_mldsa_rej_uniform_n_avx2_done_256
+ cmp r8d, 16
+ jge L_mldsa_rej_uniform_n_avx2_start_256
+L_mldsa_rej_uniform_n_avx2_done_256:
+ cmp r8d, 0
+ je L_mldsa_rej_uniform_n_avx2_done_64
+ mov rdi, 72057589759737855
+ mov r15, 36028792732385279
+L_mldsa_rej_uniform_n_avx2_start_64:
+ mov rcx, QWORD PTR [r9]
+ pdep rcx, rcx, rdi
+ and rcx, r15
+ cmp ecx, 8380417
+ jge L_mldsa_rej_uniform_0_avx2_rej_large_0
+ mov DWORD PTR [rdx], ecx
+ add rdx, 4
+ sub r8d, 1
+ je L_mldsa_rej_uniform_n_avx2_done_64
+L_mldsa_rej_uniform_0_avx2_rej_large_0:
+ shr rcx, 32
+ cmp ecx, 8380417
+ jge L_mldsa_rej_uniform_0_avx2_rej_large_1
+ mov DWORD PTR [rdx], ecx
+ add rdx, 4
+ sub r8d, 1
+ je L_mldsa_rej_uniform_n_avx2_done_64
+L_mldsa_rej_uniform_0_avx2_rej_large_1:
+ add r9, 6
+ sub r10d, 6
+ jle L_mldsa_rej_uniform_n_avx2_done_64
+ cmp r8d, 0
+ jg L_mldsa_rej_uniform_n_avx2_start_64
+L_mldsa_rej_uniform_n_avx2_done_64:
+ vzeroupper
+ sub eax, r8d
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ add rsp, 64
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+wc_mldsa_rej_uniform_n_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; wc_mldsa_rej_uniform_avx2
+;
+; Scalar rejection sampler: unpacks 3-byte groups of the input into 23-bit
+; candidates and stores, as 32-bit values, those that are below 8380417.
+; Six input bytes (two candidates) are consumed per loop iteration.
+;
+; Arguments as read by the code (Microsoft x64 register order):
+;   rcx  - destination for the accepted 32-bit values
+;   edx  - number of values wanted
+;   r8   - source bytes
+;   r9d  - number of source bytes
+; Returns in eax the number of values written (requested minus still missing).
+;
+; NOTE(review): each iteration loads 8 bytes although only 6 are consumed, so
+; the source presumably needs 2 readable bytes past its end - confirm callers.
+; NOTE(review): a requested count of 0 is not checked on entry - confirm that
+; callers never pass it.
+wc_mldsa_rej_uniform_avx2 PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ ; Shuffle the arguments: rdx = destination, r8d = values still wanted,
+ ; r9 = source, r10d = source bytes left. rcx becomes a scratch register.
+ mov r10, r9
+ mov r9, r8
+ mov r8, rdx
+ mov rdx, rcx
+ ; Remember the requested count so the result can be computed at the end.
+ mov eax, r8d
+ ; rdi = 0x00FFFFFF00FFFFFF: pdep mask that spreads two 24-bit groups into the
+ ; low 24 bits of each 32-bit half.
+ mov rdi, 72057589759737855
+ ; r15 = 0x007FFFFF007FFFFF: keeps 23 bits of each half.
+ mov r15, 36028792732385279
+L_mldsa_rej_uniform_avx2_start_64:
+ mov rcx, QWORD PTR [r9]
+ pdep rcx, rcx, rdi
+ and rcx, r15
+ ; First candidate (low half): reject when >= 8380417.
+ cmp ecx, 8380417
+ jge L_mldsa_rej_uniform_avx2_rej_large_0
+ mov DWORD PTR [rdx], ecx
+ add rdx, 4
+ sub r8d, 1
+ je L_mldsa_rej_uniform_avx2_done_64
+L_mldsa_rej_uniform_avx2_rej_large_0:
+ ; Second candidate (high half).
+ shr rcx, 32
+ cmp ecx, 8380417
+ jge L_mldsa_rej_uniform_avx2_rej_large_1
+ mov DWORD PTR [rdx], ecx
+ add rdx, 4
+ sub r8d, 1
+ je L_mldsa_rej_uniform_avx2_done_64
+L_mldsa_rej_uniform_avx2_rej_large_1:
+ ; Advance past the 6 consumed bytes; stop when the source is exhausted.
+ add r9, 6
+ sub r10d, 6
+ jle L_mldsa_rej_uniform_avx2_done_64
+ ; Keep sampling while values are still wanted. Without this branch the
+ ; routine would stop after the first 6 bytes regardless of the arguments.
+ cmp r8d, 0
+ jg L_mldsa_rej_uniform_avx2_start_64
+L_mldsa_rej_uniform_avx2_done_64:
+ ; eax = requested - still missing = number of values written.
+ sub eax, r8d
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+wc_mldsa_rej_uniform_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+; L_mldsa_shufb_rej_idx: 256 entries of 16 bytes, one per 8-bit mask value.
+; Entry i lists, for each set bit k of i from low to high, the byte pair
+; 2k, 2k+1; the rest of the entry is filled with 0ffh. Used as a byte-shuffle
+; control, an entry packs the 16-bit words selected by the mask to the front
+; (a control byte with its top bit set yields a zero byte in vpshufb).
+; The code in this file addresses it as [table + mask*16].
+; ptr_L_mldsa_shufb_rej_idx holds the table's address for register loads.
+ALIGN 16
+L_mldsa_shufb_rej_idx QWORD 0ffffffffffffffffh, 0ffffffffffffffffh
+ QWORD 0ffffffffffff0100h, 0ffffffffffffffffh
+ QWORD 0ffffffffffff0302h, 0ffffffffffffffffh
+ QWORD 0ffffffff03020100h, 0ffffffffffffffffh
+ QWORD 0ffffffffffff0504h, 0ffffffffffffffffh
+ QWORD 0ffffffff05040100h, 0ffffffffffffffffh
+ QWORD 0ffffffff05040302h, 0ffffffffffffffffh
+ QWORD 0ffff050403020100h, 0ffffffffffffffffh
+ QWORD 0ffffffffffff0706h, 0ffffffffffffffffh
+ QWORD 0ffffffff07060100h, 0ffffffffffffffffh
+ QWORD 0ffffffff07060302h, 0ffffffffffffffffh
+ QWORD 0ffff070603020100h, 0ffffffffffffffffh
+ QWORD 0ffffffff07060504h, 0ffffffffffffffffh
+ QWORD 0ffff070605040100h, 0ffffffffffffffffh
+ QWORD 0ffff070605040302h, 0ffffffffffffffffh
+ QWORD 0706050403020100h, 0ffffffffffffffffh
+ QWORD 0ffffffffffff0908h, 0ffffffffffffffffh
+ QWORD 0ffffffff09080100h, 0ffffffffffffffffh
+ QWORD 0ffffffff09080302h, 0ffffffffffffffffh
+ QWORD 0ffff090803020100h, 0ffffffffffffffffh
+ QWORD 0ffffffff09080504h, 0ffffffffffffffffh
+ QWORD 0ffff090805040100h, 0ffffffffffffffffh
+ QWORD 0ffff090805040302h, 0ffffffffffffffffh
+ QWORD 0908050403020100h, 0ffffffffffffffffh
+ QWORD 0ffffffff09080706h, 0ffffffffffffffffh
+ QWORD 0ffff090807060100h, 0ffffffffffffffffh
+ QWORD 0ffff090807060302h, 0ffffffffffffffffh
+ QWORD 0908070603020100h, 0ffffffffffffffffh
+ QWORD 0ffff090807060504h, 0ffffffffffffffffh
+ QWORD 0908070605040100h, 0ffffffffffffffffh
+ QWORD 0908070605040302h, 0ffffffffffffffffh
+ QWORD 0706050403020100h, 0ffffffffffff0908h
+ QWORD 0ffffffffffff0b0ah, 0ffffffffffffffffh
+ QWORD 0ffffffff0b0a0100h, 0ffffffffffffffffh
+ QWORD 0ffffffff0b0a0302h, 0ffffffffffffffffh
+ QWORD 0ffff0b0a03020100h, 0ffffffffffffffffh
+ QWORD 0ffffffff0b0a0504h, 0ffffffffffffffffh
+ QWORD 0ffff0b0a05040100h, 0ffffffffffffffffh
+ QWORD 0ffff0b0a05040302h, 0ffffffffffffffffh
+ QWORD 0b0a050403020100h, 0ffffffffffffffffh
+ QWORD 0ffffffff0b0a0706h, 0ffffffffffffffffh
+ QWORD 0ffff0b0a07060100h, 0ffffffffffffffffh
+ QWORD 0ffff0b0a07060302h, 0ffffffffffffffffh
+ QWORD 0b0a070603020100h, 0ffffffffffffffffh
+ QWORD 0ffff0b0a07060504h, 0ffffffffffffffffh
+ QWORD 0b0a070605040100h, 0ffffffffffffffffh
+ QWORD 0b0a070605040302h, 0ffffffffffffffffh
+ QWORD 0706050403020100h, 0ffffffffffff0b0ah
+ QWORD 0ffffffff0b0a0908h, 0ffffffffffffffffh
+ QWORD 0ffff0b0a09080100h, 0ffffffffffffffffh
+ QWORD 0ffff0b0a09080302h, 0ffffffffffffffffh
+ QWORD 0b0a090803020100h, 0ffffffffffffffffh
+ QWORD 0ffff0b0a09080504h, 0ffffffffffffffffh
+ QWORD 0b0a090805040100h, 0ffffffffffffffffh
+ QWORD 0b0a090805040302h, 0ffffffffffffffffh
+ QWORD 0908050403020100h, 0ffffffffffff0b0ah
+ QWORD 0ffff0b0a09080706h, 0ffffffffffffffffh
+ QWORD 0b0a090807060100h, 0ffffffffffffffffh
+ QWORD 0b0a090807060302h, 0ffffffffffffffffh
+ QWORD 0908070603020100h, 0ffffffffffff0b0ah
+ QWORD 0b0a090807060504h, 0ffffffffffffffffh
+ QWORD 0908070605040100h, 0ffffffffffff0b0ah
+ QWORD 0908070605040302h, 0ffffffffffff0b0ah
+ QWORD 0706050403020100h, 0ffffffff0b0a0908h
+ QWORD 0ffffffffffff0d0ch, 0ffffffffffffffffh
+ QWORD 0ffffffff0d0c0100h, 0ffffffffffffffffh
+ QWORD 0ffffffff0d0c0302h, 0ffffffffffffffffh
+ QWORD 0ffff0d0c03020100h, 0ffffffffffffffffh
+ QWORD 0ffffffff0d0c0504h, 0ffffffffffffffffh
+ QWORD 0ffff0d0c05040100h, 0ffffffffffffffffh
+ QWORD 0ffff0d0c05040302h, 0ffffffffffffffffh
+ QWORD 0d0c050403020100h, 0ffffffffffffffffh
+ QWORD 0ffffffff0d0c0706h, 0ffffffffffffffffh
+ QWORD 0ffff0d0c07060100h, 0ffffffffffffffffh
+ QWORD 0ffff0d0c07060302h, 0ffffffffffffffffh
+ QWORD 0d0c070603020100h, 0ffffffffffffffffh
+ QWORD 0ffff0d0c07060504h, 0ffffffffffffffffh
+ QWORD 0d0c070605040100h, 0ffffffffffffffffh
+ QWORD 0d0c070605040302h, 0ffffffffffffffffh
+ QWORD 0706050403020100h, 0ffffffffffff0d0ch
+ QWORD 0ffffffff0d0c0908h, 0ffffffffffffffffh
+ QWORD 0ffff0d0c09080100h, 0ffffffffffffffffh
+ QWORD 0ffff0d0c09080302h, 0ffffffffffffffffh
+ QWORD 0d0c090803020100h, 0ffffffffffffffffh
+ QWORD 0ffff0d0c09080504h, 0ffffffffffffffffh
+ QWORD 0d0c090805040100h, 0ffffffffffffffffh
+ QWORD 0d0c090805040302h, 0ffffffffffffffffh
+ QWORD 0908050403020100h, 0ffffffffffff0d0ch
+ QWORD 0ffff0d0c09080706h, 0ffffffffffffffffh
+ QWORD 0d0c090807060100h, 0ffffffffffffffffh
+ QWORD 0d0c090807060302h, 0ffffffffffffffffh
+ QWORD 0908070603020100h, 0ffffffffffff0d0ch
+ QWORD 0d0c090807060504h, 0ffffffffffffffffh
+ QWORD 0908070605040100h, 0ffffffffffff0d0ch
+ QWORD 0908070605040302h, 0ffffffffffff0d0ch
+ QWORD 0706050403020100h, 0ffffffff0d0c0908h
+ QWORD 0ffffffff0d0c0b0ah, 0ffffffffffffffffh
+ QWORD 0ffff0d0c0b0a0100h, 0ffffffffffffffffh
+ QWORD 0ffff0d0c0b0a0302h, 0ffffffffffffffffh
+ QWORD 0d0c0b0a03020100h, 0ffffffffffffffffh
+ QWORD 0ffff0d0c0b0a0504h, 0ffffffffffffffffh
+ QWORD 0d0c0b0a05040100h, 0ffffffffffffffffh
+ QWORD 0d0c0b0a05040302h, 0ffffffffffffffffh
+ QWORD 0b0a050403020100h, 0ffffffffffff0d0ch
+ QWORD 0ffff0d0c0b0a0706h, 0ffffffffffffffffh
+ QWORD 0d0c0b0a07060100h, 0ffffffffffffffffh
+ QWORD 0d0c0b0a07060302h, 0ffffffffffffffffh
+ QWORD 0b0a070603020100h, 0ffffffffffff0d0ch
+ QWORD 0d0c0b0a07060504h, 0ffffffffffffffffh
+ QWORD 0b0a070605040100h, 0ffffffffffff0d0ch
+ QWORD 0b0a070605040302h, 0ffffffffffff0d0ch
+ QWORD 0706050403020100h, 0ffffffff0d0c0b0ah
+ QWORD 0ffff0d0c0b0a0908h, 0ffffffffffffffffh
+ QWORD 0d0c0b0a09080100h, 0ffffffffffffffffh
+ QWORD 0d0c0b0a09080302h, 0ffffffffffffffffh
+ QWORD 0b0a090803020100h, 0ffffffffffff0d0ch
+ QWORD 0d0c0b0a09080504h, 0ffffffffffffffffh
+ QWORD 0b0a090805040100h, 0ffffffffffff0d0ch
+ QWORD 0b0a090805040302h, 0ffffffffffff0d0ch
+ QWORD 0908050403020100h, 0ffffffff0d0c0b0ah
+ QWORD 0d0c0b0a09080706h, 0ffffffffffffffffh
+ QWORD 0b0a090807060100h, 0ffffffffffff0d0ch
+ QWORD 0b0a090807060302h, 0ffffffffffff0d0ch
+ QWORD 0908070603020100h, 0ffffffff0d0c0b0ah
+ QWORD 0b0a090807060504h, 0ffffffffffff0d0ch
+ QWORD 0908070605040100h, 0ffffffff0d0c0b0ah
+ QWORD 0908070605040302h, 0ffffffff0d0c0b0ah
+ QWORD 0706050403020100h, 0ffff0d0c0b0a0908h
+ QWORD 0ffffffffffff0f0eh, 0ffffffffffffffffh
+ QWORD 0ffffffff0f0e0100h, 0ffffffffffffffffh
+ QWORD 0ffffffff0f0e0302h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e03020100h, 0ffffffffffffffffh
+ QWORD 0ffffffff0f0e0504h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e05040100h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e05040302h, 0ffffffffffffffffh
+ QWORD 0f0e050403020100h, 0ffffffffffffffffh
+ QWORD 0ffffffff0f0e0706h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e07060100h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e07060302h, 0ffffffffffffffffh
+ QWORD 0f0e070603020100h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e07060504h, 0ffffffffffffffffh
+ QWORD 0f0e070605040100h, 0ffffffffffffffffh
+ QWORD 0f0e070605040302h, 0ffffffffffffffffh
+ QWORD 0706050403020100h, 0ffffffffffff0f0eh
+ QWORD 0ffffffff0f0e0908h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e09080100h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e09080302h, 0ffffffffffffffffh
+ QWORD 0f0e090803020100h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e09080504h, 0ffffffffffffffffh
+ QWORD 0f0e090805040100h, 0ffffffffffffffffh
+ QWORD 0f0e090805040302h, 0ffffffffffffffffh
+ QWORD 0908050403020100h, 0ffffffffffff0f0eh
+ QWORD 0ffff0f0e09080706h, 0ffffffffffffffffh
+ QWORD 0f0e090807060100h, 0ffffffffffffffffh
+ QWORD 0f0e090807060302h, 0ffffffffffffffffh
+ QWORD 0908070603020100h, 0ffffffffffff0f0eh
+ QWORD 0f0e090807060504h, 0ffffffffffffffffh
+ QWORD 0908070605040100h, 0ffffffffffff0f0eh
+ QWORD 0908070605040302h, 0ffffffffffff0f0eh
+ QWORD 0706050403020100h, 0ffffffff0f0e0908h
+ QWORD 0ffffffff0f0e0b0ah, 0ffffffffffffffffh
+ QWORD 0ffff0f0e0b0a0100h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e0b0a0302h, 0ffffffffffffffffh
+ QWORD 0f0e0b0a03020100h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e0b0a0504h, 0ffffffffffffffffh
+ QWORD 0f0e0b0a05040100h, 0ffffffffffffffffh
+ QWORD 0f0e0b0a05040302h, 0ffffffffffffffffh
+ QWORD 0b0a050403020100h, 0ffffffffffff0f0eh
+ QWORD 0ffff0f0e0b0a0706h, 0ffffffffffffffffh
+ QWORD 0f0e0b0a07060100h, 0ffffffffffffffffh
+ QWORD 0f0e0b0a07060302h, 0ffffffffffffffffh
+ QWORD 0b0a070603020100h, 0ffffffffffff0f0eh
+ QWORD 0f0e0b0a07060504h, 0ffffffffffffffffh
+ QWORD 0b0a070605040100h, 0ffffffffffff0f0eh
+ QWORD 0b0a070605040302h, 0ffffffffffff0f0eh
+ QWORD 0706050403020100h, 0ffffffff0f0e0b0ah
+ QWORD 0ffff0f0e0b0a0908h, 0ffffffffffffffffh
+ QWORD 0f0e0b0a09080100h, 0ffffffffffffffffh
+ QWORD 0f0e0b0a09080302h, 0ffffffffffffffffh
+ QWORD 0b0a090803020100h, 0ffffffffffff0f0eh
+ QWORD 0f0e0b0a09080504h, 0ffffffffffffffffh
+ QWORD 0b0a090805040100h, 0ffffffffffff0f0eh
+ QWORD 0b0a090805040302h, 0ffffffffffff0f0eh
+ QWORD 0908050403020100h, 0ffffffff0f0e0b0ah
+ QWORD 0f0e0b0a09080706h, 0ffffffffffffffffh
+ QWORD 0b0a090807060100h, 0ffffffffffff0f0eh
+ QWORD 0b0a090807060302h, 0ffffffffffff0f0eh
+ QWORD 0908070603020100h, 0ffffffff0f0e0b0ah
+ QWORD 0b0a090807060504h, 0ffffffffffff0f0eh
+ QWORD 0908070605040100h, 0ffffffff0f0e0b0ah
+ QWORD 0908070605040302h, 0ffffffff0f0e0b0ah
+ QWORD 0706050403020100h, 0ffff0f0e0b0a0908h
+ QWORD 0ffffffff0f0e0d0ch, 0ffffffffffffffffh
+ QWORD 0ffff0f0e0d0c0100h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e0d0c0302h, 0ffffffffffffffffh
+ QWORD 0f0e0d0c03020100h, 0ffffffffffffffffh
+ QWORD 0ffff0f0e0d0c0504h, 0ffffffffffffffffh
+ QWORD 0f0e0d0c05040100h, 0ffffffffffffffffh
+ QWORD 0f0e0d0c05040302h, 0ffffffffffffffffh
+ QWORD 0d0c050403020100h, 0ffffffffffff0f0eh
+ QWORD 0ffff0f0e0d0c0706h, 0ffffffffffffffffh
+ QWORD 0f0e0d0c07060100h, 0ffffffffffffffffh
+ QWORD 0f0e0d0c07060302h, 0ffffffffffffffffh
+ QWORD 0d0c070603020100h, 0ffffffffffff0f0eh
+ QWORD 0f0e0d0c07060504h, 0ffffffffffffffffh
+ QWORD 0d0c070605040100h, 0ffffffffffff0f0eh
+ QWORD 0d0c070605040302h, 0ffffffffffff0f0eh
+ QWORD 0706050403020100h, 0ffffffff0f0e0d0ch
+ QWORD 0ffff0f0e0d0c0908h, 0ffffffffffffffffh
+ QWORD 0f0e0d0c09080100h, 0ffffffffffffffffh
+ QWORD 0f0e0d0c09080302h, 0ffffffffffffffffh
+ QWORD 0d0c090803020100h, 0ffffffffffff0f0eh
+ QWORD 0f0e0d0c09080504h, 0ffffffffffffffffh
+ QWORD 0d0c090805040100h, 0ffffffffffff0f0eh
+ QWORD 0d0c090805040302h, 0ffffffffffff0f0eh
+ QWORD 0908050403020100h, 0ffffffff0f0e0d0ch
+ QWORD 0f0e0d0c09080706h, 0ffffffffffffffffh
+ QWORD 0d0c090807060100h, 0ffffffffffff0f0eh
+ QWORD 0d0c090807060302h, 0ffffffffffff0f0eh
+ QWORD 0908070603020100h, 0ffffffff0f0e0d0ch
+ QWORD 0d0c090807060504h, 0ffffffffffff0f0eh
+ QWORD 0908070605040100h, 0ffffffff0f0e0d0ch
+ QWORD 0908070605040302h, 0ffffffff0f0e0d0ch
+ QWORD 0706050403020100h, 0ffff0f0e0d0c0908h
+ QWORD 0ffff0f0e0d0c0b0ah, 0ffffffffffffffffh
+ QWORD 0f0e0d0c0b0a0100h, 0ffffffffffffffffh
+ QWORD 0f0e0d0c0b0a0302h, 0ffffffffffffffffh
+ QWORD 0d0c0b0a03020100h, 0ffffffffffff0f0eh
+ QWORD 0f0e0d0c0b0a0504h, 0ffffffffffffffffh
+ QWORD 0d0c0b0a05040100h, 0ffffffffffff0f0eh
+ QWORD 0d0c0b0a05040302h, 0ffffffffffff0f0eh
+ QWORD 0b0a050403020100h, 0ffffffff0f0e0d0ch
+ QWORD 0f0e0d0c0b0a0706h, 0ffffffffffffffffh
+ QWORD 0d0c0b0a07060100h, 0ffffffffffff0f0eh
+ QWORD 0d0c0b0a07060302h, 0ffffffffffff0f0eh
+ QWORD 0b0a070603020100h, 0ffffffff0f0e0d0ch
+ QWORD 0d0c0b0a07060504h, 0ffffffffffff0f0eh
+ QWORD 0b0a070605040100h, 0ffffffff0f0e0d0ch
+ QWORD 0b0a070605040302h, 0ffffffff0f0e0d0ch
+ QWORD 0706050403020100h, 0ffff0f0e0d0c0b0ah
+ QWORD 0f0e0d0c0b0a0908h, 0ffffffffffffffffh
+ QWORD 0d0c0b0a09080100h, 0ffffffffffff0f0eh
+ QWORD 0d0c0b0a09080302h, 0ffffffffffff0f0eh
+ QWORD 0b0a090803020100h, 0ffffffff0f0e0d0ch
+ QWORD 0d0c0b0a09080504h, 0ffffffffffff0f0eh
+ QWORD 0b0a090805040100h, 0ffffffff0f0e0d0ch
+ QWORD 0b0a090805040302h, 0ffffffff0f0e0d0ch
+ QWORD 0908050403020100h, 0ffff0f0e0d0c0b0ah
+ QWORD 0d0c0b0a09080706h, 0ffffffffffff0f0eh
+ QWORD 0b0a090807060100h, 0ffffffff0f0e0d0ch
+ QWORD 0b0a090807060302h, 0ffffffff0f0e0d0ch
+ QWORD 0908070603020100h, 0ffff0f0e0d0c0b0ah
+ QWORD 0b0a090807060504h, 0ffffffff0f0e0d0ch
+ QWORD 0908070605040100h, 0ffff0f0e0d0c0b0ah
+ QWORD 0908070605040302h, 0ffff0f0e0d0c0b0ah
+ QWORD 0706050403020100h, 0f0e0d0c0b0a0908h
+ptr_L_mldsa_shufb_rej_idx QWORD L_mldsa_shufb_rej_idx
+_DATA ENDS
+_DATA SEGMENT
+; Sixteen 16-bit lanes of 000fh. wc_mldsa_extract_coeffs_eta2_avx2 loads this
+; into ymm6 and uses it both as the AND mask that keeps one nibble per lane
+; and as the compare operand that selects lanes whose nibble is below 15.
+ALIGN 16
+L_mldsa_extract_coeffs_eta2_mask_nibbles WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh
+ WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh
+ptr_L_mldsa_extract_coeffs_eta2_mask_nibbles QWORD L_mldsa_extract_coeffs_eta2_mask_nibbles
+_DATA ENDS
+_DATA SEGMENT
+; Sixteen 16-bit lanes of 3340h (13120). wc_mldsa_extract_coeffs_eta2_avx2
+; loads this into ymm7 as the vpmulhw multiplier; for lane values 0..14 the
+; high half of the product, (x * 13120) >> 16, equals x / 5 rounded down.
+ALIGN 16
+L_mldsa_extract_coeffs_eta2_mul WORD 3340h, 3340h, 3340h, 3340h, 3340h, 3340h, 3340h, 3340h
+ WORD 3340h, 3340h, 3340h, 3340h, 3340h, 3340h, 3340h, 3340h
+ptr_L_mldsa_extract_coeffs_eta2_mul QWORD L_mldsa_extract_coeffs_eta2_mul
+_DATA ENDS
+_DATA SEGMENT
+; Sixteen 16-bit lanes of 5. wc_mldsa_extract_coeffs_eta2_avx2 loads this into
+; ymm8 and multiplies the vpmulhw quotient by it (vpmullw), giving
+; 5 * (x / 5) per lane.
+ALIGN 16
+L_mldsa_extract_coeffs_eta2_five WORD 0005h, 0005h, 0005h, 0005h, 0005h, 0005h, 0005h, 0005h
+ WORD 0005h, 0005h, 0005h, 0005h, 0005h, 0005h, 0005h, 0005h
+ptr_L_mldsa_extract_coeffs_eta2_five QWORD L_mldsa_extract_coeffs_eta2_five
+_DATA ENDS
+_DATA SEGMENT
+; Sixteen 16-bit lanes of 2. wc_mldsa_extract_coeffs_eta2_avx2 loads this into
+; ymm9 and subtracts each lane value from it (vpsubw), giving 2 - x before
+; the 5 * (x / 5) term is added back.
+ALIGN 16
+L_mldsa_extract_coeffs_eta2_two WORD 0002h, 0002h, 0002h, 0002h, 0002h, 0002h, 0002h, 0002h
+ WORD 0002h, 0002h, 0002h, 0002h, 0002h, 0002h, 0002h, 0002h
+ptr_L_mldsa_extract_coeffs_eta2_two QWORD L_mldsa_extract_coeffs_eta2_two
+_DATA ENDS
+_DATA SEGMENT
+; Sixteen signed 32-bit entries indexed 0..15. Entries 0..14 hold
+; 2 - (index mod 5), i.e. 2, 1, 0, -1, -2 repeated three times; entry 15 is 0.
+; NOTE(review): presumably a per-nibble lookup for the scalar tail of the
+; eta = 2 extraction - confirm against the code that reads it.
+ALIGN 16
+L_mldsa_extract_coeffs_eta2_nibble_table DWORD 00000002h, 00000001h, 00000000h, 0ffffffffh
+ DWORD 0fffffffeh, 00000002h, 00000001h, 00000000h
+ DWORD 0ffffffffh, 0fffffffeh, 00000002h, 00000001h
+ DWORD 00000000h, 0ffffffffh, 0fffffffeh, 00000000h
+ptr_L_mldsa_extract_coeffs_eta2_nibble_table QWORD L_mldsa_extract_coeffs_eta2_nibble_table
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_extract_coeffs_eta2_avx2 PROC
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ sub rsp, 64
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu ymm6, YMMWORD PTR L_mldsa_extract_coeffs_eta2_mask_nibbles
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_extract_coeffs_eta2_mul
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_extract_coeffs_eta2_five
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_extract_coeffs_eta2_two
+ mov r15, QWORD PTR [ptr_L_mldsa_shufb_rej_idx]
+ mov r10d, DWORD PTR [r9]
+ cmp r10d, 0
+ jne L_mldsa_extract_coeffs_eta2_less_than_256
+ vpmovzxbd ymm0, QWORD PTR [rcx]
+ vpmovzxbd ymm1, QWORD PTR [rcx+8]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu ymm2, YMMWORD PTR [r15+r12]
+ vmovdqu ymm3, YMMWORD PTR [r15+rbx]
+ vmovdqu ymm4, YMMWORD PTR [r15+r13]
+ vmovdqu ymm5, YMMWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpmulhw ymm2, ymm0, ymm7
+ vpmulhw ymm3, ymm1, ymm7
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsubw ymm0, ymm9, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+16]
+ vpmovzxbd ymm1, QWORD PTR [rcx+24]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu ymm2, YMMWORD PTR [r15+r12]
+ vmovdqu ymm3, YMMWORD PTR [r15+rbx]
+ vmovdqu ymm4, YMMWORD PTR [r15+r13]
+ vmovdqu ymm5, YMMWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpmulhw ymm2, ymm0, ymm7
+ vpmulhw ymm3, ymm1, ymm7
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsubw ymm0, ymm9, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+32]
+ vpmovzxbd ymm1, QWORD PTR [rcx+40]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu ymm2, YMMWORD PTR [r15+r12]
+ vmovdqu ymm3, YMMWORD PTR [r15+rbx]
+ vmovdqu ymm4, YMMWORD PTR [r15+r13]
+ vmovdqu ymm5, YMMWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpmulhw ymm2, ymm0, ymm7
+ vpmulhw ymm3, ymm1, ymm7
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsubw ymm0, ymm9, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+48]
+ vpmovzxbd ymm1, QWORD PTR [rcx+56]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu ymm2, YMMWORD PTR [r15+r12]
+ vmovdqu ymm3, YMMWORD PTR [r15+rbx]
+ vmovdqu ymm4, YMMWORD PTR [r15+r13]
+ vmovdqu ymm5, YMMWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpmulhw ymm2, ymm0, ymm7
+ vpmulhw ymm3, ymm1, ymm7
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsubw ymm0, ymm9, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+64]
+ vpmovzxbd ymm1, QWORD PTR [rcx+72]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu ymm2, YMMWORD PTR [r15+r12]
+ vmovdqu ymm3, YMMWORD PTR [r15+rbx]
+ vmovdqu ymm4, YMMWORD PTR [r15+r13]
+ vmovdqu ymm5, YMMWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpmulhw ymm2, ymm0, ymm7
+ vpmulhw ymm3, ymm1, ymm7
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsubw ymm0, ymm9, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+80]
+ vpmovzxbd ymm1, QWORD PTR [rcx+88]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu ymm2, YMMWORD PTR [r15+r12]
+ vmovdqu ymm3, YMMWORD PTR [r15+rbx]
+ vmovdqu ymm4, YMMWORD PTR [r15+r13]
+ vmovdqu ymm5, YMMWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpmulhw ymm2, ymm0, ymm7
+ vpmulhw ymm3, ymm1, ymm7
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsubw ymm0, ymm9, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+96]
+ vpmovzxbd ymm1, QWORD PTR [rcx+104]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu ymm2, YMMWORD PTR [r15+r12]
+ vmovdqu ymm3, YMMWORD PTR [r15+rbx]
+ vmovdqu ymm4, YMMWORD PTR [r15+r13]
+ vmovdqu ymm5, YMMWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpmulhw ymm2, ymm0, ymm7
+ vpmulhw ymm3, ymm1, ymm7
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsubw ymm0, ymm9, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+112]
+ vpmovzxbd ymm1, QWORD PTR [rcx+120]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu ymm2, YMMWORD PTR [r15+r12]
+ vmovdqu ymm3, YMMWORD PTR [r15+rbx]
+ vmovdqu ymm4, YMMWORD PTR [r15+r13]
+ vmovdqu ymm5, YMMWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpmulhw ymm2, ymm0, ymm7
+ vpmulhw ymm3, ymm1, ymm7
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsubw ymm0, ymm9, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ sub edx, 128
+ add rcx, 128
+L_mldsa_extract_coeffs_eta2_less_than_256:
+ cmp r10d, 240
+ jg L_mldsa_extract_coeffs_eta2_less_than_ymm
+L_mldsa_extract_coeffs_eta2_start_one_ymm:
+ vpmovzxbd ymm0, QWORD PTR [rcx]
+ vpslld ymm2, ymm0, 12
+ vpor ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm6
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpacksswb ymm2, ymm2, ymm2
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ movzx r12d, al
+ shr r13, 16
+ and r13d, 255
+ shl r12d, 4
+ shl r13d, 4
+ vmovdqu ymm2, YMMWORD PTR [r15+r12]
+ vmovdqu ymm4, YMMWORD PTR [r15+r13]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vpshufb ymm0, ymm0, ymm2
+ popcnt r12d, r12d
+ popcnt r13d, r13d
+ vpmulhw ymm2, ymm0, ymm7
+ vpmullw ymm2, ymm2, ymm8
+ vpsubw ymm0, ymm9, ymm0
+ vpaddw ymm0, ymm0, ymm2
+ vpmovsxwd ymm2, xmm0
+ vextracti128 xmm0, ymm0, 1
+ vpmovsxwd ymm0, xmm0
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ sub edx, 8
+ add rcx, 8
+ cmp edx, 8
+ jl L_mldsa_extract_coeffs_eta2_less_than_ymm
+ cmp r10d, 240
+ jle L_mldsa_extract_coeffs_eta2_start_one_ymm
+L_mldsa_extract_coeffs_eta2_less_than_ymm:
+ cmp r10d, 256
+ je L_mldsa_extract_coeffs_eta2_done
+L_mldsa_extract_coeffs_eta2_start_byte:
+ mov r15, QWORD PTR [ptr_L_mldsa_extract_coeffs_eta2_nibble_table]
+ cmp edx, 0
+ je L_mldsa_extract_coeffs_eta2_done
+ movzx ebx, BYTE PTR [rcx]
+ add rcx, 1
+ sub edx, 1
+ mov eax, ebx
+ shr eax, 4
+ and bl, 15
+ xor r13, r13
+ cmp bl, 15
+ adc r13d, 0
+ mov r14d, DWORD PTR [r15+4*rbx]
+ mov DWORD PTR [r8], r14d
+ add r10d, r13d
+ shl r13d, 2
+ add r8, r13
+ cmp r10d, 256
+ je L_mldsa_extract_coeffs_eta2_done
+ xor r13, r13
+ cmp al, 15
+ adc r13d, 0
+ mov r14d, DWORD PTR [r15+4*rax]
+ mov DWORD PTR [r8], r14d
+ add r10d, r13d
+ shl r13d, 2
+ add r8, r13
+ cmp r10d, 256
+ je L_mldsa_extract_coeffs_eta2_done
+ jmp L_mldsa_extract_coeffs_eta2_start_byte
+L_mldsa_extract_coeffs_eta2_done:
+ mov DWORD PTR [r9], r10d
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ add rsp, 64
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+wc_mldsa_extract_coeffs_eta2_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_extract_coeffs_eta4_mask_nibbles WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh ; 16 words of 0x000f (32 bytes); loaded into ymm6 by wc_mldsa_extract_coeffs_eta4_avx2 and ANDed with the data to keep 4 bits per word
+ WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh
+ptr_L_mldsa_extract_coeffs_eta4_mask_nibbles QWORD L_mldsa_extract_coeffs_eta4_mask_nibbles ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_extract_coeffs_eta4_nine WORD 0009h, 0009h, 0009h, 0009h, 0009h, 0009h, 0009h, 0009h ; 16 words of 9; loaded into ymm7 and compared as 9 > nibble, so nibbles 0..8 are accepted
+ WORD 0009h, 0009h, 0009h, 0009h, 0009h, 0009h, 0009h, 0009h
+ptr_L_mldsa_extract_coeffs_eta4_nine QWORD L_mldsa_extract_coeffs_eta4_nine ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_extract_coeffs_eta4_four WORD 0004h, 0004h, 0004h, 0004h, 0004h, 0004h, 0004h, 0004h ; 16 words of 4; loaded into ymm8 and used as the minuend of 4 - nibble
+ WORD 0004h, 0004h, 0004h, 0004h, 0004h, 0004h, 0004h, 0004h
+ptr_L_mldsa_extract_coeffs_eta4_four QWORD L_mldsa_extract_coeffs_eta4_four ; holds the address of the table above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_extract_coeffs_eta4_avx2 PROC ; Rejection-samples 4-bit nibbles: every nibble n < 9 is emitted as the 32-bit value 4 - n
+ push rbx ; Inputs: rcx = input bytes, edx = number of input bytes available,
+ push r12 ;   r8 = output array of 32-bit values, r9 = pointer to a 32-bit running count
+ push r13 ;   (read on entry, written back on exit).
+ push r14 ; rbx, r12-r15 and xmm6-xmm8 are saved here and restored before ret.
+ push r15
+ sub rsp, 48 ; room for xmm6, xmm7, xmm8 (16 bytes each)
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu ymm6, YMMWORD PTR L_mldsa_extract_coeffs_eta4_mask_nibbles ; ymm6 = 0x000f in every word
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_extract_coeffs_eta4_nine ; ymm7 = 9 in every word
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_extract_coeffs_eta4_four ; ymm8 = 4 in every word
+ mov r15, QWORD PTR [ptr_L_mldsa_shufb_rej_idx] ; r15 = base of a shuffle table defined outside this view; indexed below in 16-byte entries
+ mov r10d, DWORD PTR [r9] ; r10d = running count of values already produced
+ cmp r10d, 0
+ jne L_mldsa_extract_coeffs_eta4_less_than_256 ; the unrolled 128-byte pass only runs when starting from count 0
+ vpmovzxbd ymm0, QWORD PTR [rcx] ; unrolled step 1 of 8 (input bytes 0..15): zero-extend 8 bytes to 8 dwords
+ vpmovzxbd ymm1, QWORD PTR [rcx+8]
+ vpslld ymm2, ymm0, 12 ; copy of each byte shifted so its high nibble lands in bits 16..19
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2 ; dword = b | (b << 12)
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6 ; low word = low nibble, high word = high nibble
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm7, ymm0 ; word = 0xffff where 9 > nibble (accepted)
+ vpcmpgtw ymm3, ymm7, ymm1
+ vpacksswb ymm2, ymm2, ymm3 ; narrow the word masks to byte masks
+ vpmovmskb eax, ymm2 ; eax = 32 accept bits
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al ; bits 0..7: low lane of ymm0
+ movzx ebx, ah ; bits 8..15: low lane of ymm1
+ shr r13, 16 ; bits 16..23: high lane of ymm0 (after the and below)
+ shr r14, 24 ; bits 24..31: high lane of ymm1
+ and r13d, 255
+ shl r12d, 4 ; mask * 16 = byte offset of a 16-byte table entry
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu xmm2, OWORD PTR [r15+r12] ; fetch the shuffle control for each 8-bit accept mask
+ vmovdqu xmm3, OWORD PTR [r15+rbx]
+ vmovdqu xmm4, OWORD PTR [r15+r13]
+ vmovdqu xmm5, OWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1 ; ymm2 = controls for ymm0 (low lane, high lane)
+ vinserti128 ymm3, ymm3, xmm5, 1 ; ymm3 = controls for ymm1 (low lane, high lane)
+ vpshufb ymm0, ymm0, ymm2 ; presumably packs the accepted words to the front of each lane - table contents are outside this view, confirm
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d ; number of accepted nibbles per group (the shift by 4 does not change the bit count)
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpsubw ymm0, ymm8, ymm0 ; value = 4 - nibble
+ vpsubw ymm1, ymm8, ymm1
+ vpmovsxwd ymm2, xmm0 ; sign-extend 8 words to 8 dwords: low lane of ymm0
+ vpmovsxwd ymm3, xmm1 ; low lane of ymm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0 ; high lane of ymm0
+ vpmovsxwd ymm1, xmm1 ; high lane of ymm1
+ vmovdqu YMMWORD PTR [r8], ymm2 ; always writes 8 dwords ...
+ lea r8, QWORD PTR [r8+4*r12] ; ... but advances only past the accepted ones, so the next store overwrites the rest
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+16] ; unrolled step 2 of 8 (input bytes 16..31): same sequence as step 1
+ vpmovzxbd ymm1, QWORD PTR [rcx+24]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm7, ymm0
+ vpcmpgtw ymm3, ymm7, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu xmm2, OWORD PTR [r15+r12]
+ vmovdqu xmm3, OWORD PTR [r15+rbx]
+ vmovdqu xmm4, OWORD PTR [r15+r13]
+ vmovdqu xmm5, OWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm8, ymm1
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+32] ; unrolled step 3 of 8 (input bytes 32..47)
+ vpmovzxbd ymm1, QWORD PTR [rcx+40]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm7, ymm0
+ vpcmpgtw ymm3, ymm7, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu xmm2, OWORD PTR [r15+r12]
+ vmovdqu xmm3, OWORD PTR [r15+rbx]
+ vmovdqu xmm4, OWORD PTR [r15+r13]
+ vmovdqu xmm5, OWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm8, ymm1
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+48] ; unrolled step 4 of 8 (input bytes 48..63)
+ vpmovzxbd ymm1, QWORD PTR [rcx+56]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm7, ymm0
+ vpcmpgtw ymm3, ymm7, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu xmm2, OWORD PTR [r15+r12]
+ vmovdqu xmm3, OWORD PTR [r15+rbx]
+ vmovdqu xmm4, OWORD PTR [r15+r13]
+ vmovdqu xmm5, OWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm8, ymm1
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+64] ; unrolled step 5 of 8 (input bytes 64..79)
+ vpmovzxbd ymm1, QWORD PTR [rcx+72]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm7, ymm0
+ vpcmpgtw ymm3, ymm7, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu xmm2, OWORD PTR [r15+r12]
+ vmovdqu xmm3, OWORD PTR [r15+rbx]
+ vmovdqu xmm4, OWORD PTR [r15+r13]
+ vmovdqu xmm5, OWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm8, ymm1
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+80] ; unrolled step 6 of 8 (input bytes 80..95)
+ vpmovzxbd ymm1, QWORD PTR [rcx+88]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm7, ymm0
+ vpcmpgtw ymm3, ymm7, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu xmm2, OWORD PTR [r15+r12]
+ vmovdqu xmm3, OWORD PTR [r15+rbx]
+ vmovdqu xmm4, OWORD PTR [r15+r13]
+ vmovdqu xmm5, OWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm8, ymm1
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+96] ; unrolled step 7 of 8 (input bytes 96..111)
+ vpmovzxbd ymm1, QWORD PTR [rcx+104]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm7, ymm0
+ vpcmpgtw ymm3, ymm7, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu xmm2, OWORD PTR [r15+r12]
+ vmovdqu xmm3, OWORD PTR [r15+rbx]
+ vmovdqu xmm4, OWORD PTR [r15+r13]
+ vmovdqu xmm5, OWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm8, ymm1
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ vpmovzxbd ymm0, QWORD PTR [rcx+112] ; unrolled step 8 of 8 (input bytes 112..127)
+ vpmovzxbd ymm1, QWORD PTR [rcx+120]
+ vpslld ymm2, ymm0, 12
+ vpslld ymm3, ymm1, 12
+ vpor ymm0, ymm0, ymm2
+ vpor ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm6
+ vpand ymm1, ymm1, ymm6
+ vpcmpgtw ymm2, ymm7, ymm0
+ vpcmpgtw ymm3, ymm7, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ mov r14d, eax
+ movzx r12d, al
+ movzx ebx, ah
+ shr r13, 16
+ shr r14, 24
+ and r13d, 255
+ shl r12d, 4
+ shl ebx, 4
+ shl r13d, 4
+ shl r14d, 4
+ vmovdqu xmm2, OWORD PTR [r15+r12]
+ vmovdqu xmm3, OWORD PTR [r15+rbx]
+ vmovdqu xmm4, OWORD PTR [r15+r13]
+ vmovdqu xmm5, OWORD PTR [r15+r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ popcnt r12d, r12d
+ popcnt ebx, ebx
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm8, ymm1
+ vpmovsxwd ymm2, xmm0
+ vpmovsxwd ymm3, xmm1
+ vextracti128 xmm0, ymm0, 1
+ vextracti128 xmm1, ymm1, 1
+ vpmovsxwd ymm0, xmm0
+ vpmovsxwd ymm1, xmm1
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ vmovdqu YMMWORD PTR [r8], ymm3
+ lea r8, QWORD PTR [r8+4*rbx]
+ add r10d, ebx
+ vmovdqu YMMWORD PTR [r8], ymm1
+ lea r8, QWORD PTR [r8+4*r14]
+ add r10d, r14d
+ sub edx, 128 ; 128 input bytes consumed by the unrolled pass
+ add rcx, 128
+L_mldsa_extract_coeffs_eta4_less_than_256: ; NOTE(review): no check that edx >= 8 before the first 8-byte read below - presumably guaranteed by the caller, confirm
+ cmp r10d, 240 ; an 8-byte step can accept up to 16 nibbles; 240 + 16 = 256, the count at which this routine stops
+ jg L_mldsa_extract_coeffs_eta4_less_than_ymm
+L_mldsa_extract_coeffs_eta4_start_one_ymm: ; 8 input bytes (16 nibbles) per iteration
+ vpmovzxbd ymm0, QWORD PTR [rcx]
+ vpslld ymm2, ymm0, 12
+ vpor ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm6 ; one nibble per word
+ vpcmpgtw ymm2, ymm7, ymm0 ; 0xffff where 9 > nibble
+ vpacksswb ymm2, ymm2, ymm2 ; same source twice: low-lane mask in bits 0..7, high-lane mask in bits 16..23
+ vpmovmskb eax, ymm2
+ mov r13d, eax
+ movzx r12d, al
+ shr r13, 16
+ and r13d, 255
+ shl r12d, 4 ; mask * 16 = table byte offset
+ shl r13d, 4
+ vmovdqu xmm2, OWORD PTR [r15+r12]
+ vmovdqu xmm4, OWORD PTR [r15+r13]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vpshufb ymm0, ymm0, ymm2
+ popcnt r12d, r12d ; accepted count, low lane
+ popcnt r13d, r13d ; accepted count, high lane
+ vpsubw ymm0, ymm8, ymm0 ; 4 - nibble
+ vpmovsxwd ymm2, xmm0
+ vextracti128 xmm0, ymm0, 1
+ vpmovsxwd ymm0, xmm0
+ vmovdqu YMMWORD PTR [r8], ymm2
+ lea r8, QWORD PTR [r8+4*r12]
+ add r10d, r12d
+ vmovdqu YMMWORD PTR [r8], ymm0
+ lea r8, QWORD PTR [r8+4*r13]
+ add r10d, r13d
+ sub edx, 8
+ add rcx, 8
+ cmp edx, 8 ; signed compare: fewer than 8 bytes left -> finish byte by byte
+ jl L_mldsa_extract_coeffs_eta4_less_than_ymm
+ cmp r10d, 240
+ jle L_mldsa_extract_coeffs_eta4_start_one_ymm
+L_mldsa_extract_coeffs_eta4_less_than_ymm:
+ cmp r10d, 256 ; 256 values produced: done
+ je L_mldsa_extract_coeffs_eta4_done
+L_mldsa_extract_coeffs_eta4_start_byte: ; scalar tail: one input byte (two nibbles) per iteration
+ cmp edx, 0 ; NOTE(review): exits only when edx is exactly 0 - assumes edx never goes negative above, confirm
+ je L_mldsa_extract_coeffs_eta4_done
+ movzx ebx, BYTE PTR [rcx]
+ add rcx, 1
+ sub edx, 1
+ mov eax, ebx
+ shr eax, 4 ; eax = high nibble
+ and bl, 15 ; ebx = low nibble (upper bits already zero from movzx)
+ xor r13, r13
+ mov r14, 4
+ cmp bl, 9 ; carry set when low nibble < 9
+ adc r13d, 0 ; r13d = 1 if accepted, else 0
+ sub r14d, ebx ; 4 - nibble
+ mov DWORD PTR [r8], r14d ; written unconditionally; kept only if r8 advances
+ add r10d, r13d
+ shl r13d, 2 ; 0 or 4 bytes
+ add r8, r13
+ cmp r10d, 256
+ je L_mldsa_extract_coeffs_eta4_done
+ xor r13, r13 ; same steps for the high nibble
+ mov r14, 4
+ cmp al, 9
+ adc r13d, 0
+ sub r14d, eax
+ mov DWORD PTR [r8], r14d
+ add r10d, r13d
+ shl r13d, 2
+ add r8, r13
+ cmp r10d, 256
+ je L_mldsa_extract_coeffs_eta4_done
+ jmp L_mldsa_extract_coeffs_eta4_start_byte
+L_mldsa_extract_coeffs_eta4_done:
+ mov DWORD PTR [r9], r10d ; write the updated count back through r9
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore saved xmm6-xmm8
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ add rsp, 48
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+wc_mldsa_extract_coeffs_eta4_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_redistribute_21_rand_avx2 PROC ; De-interleaves 84 source qwords 4 ways: source qword i goes to output (i mod 4), slot i / 4 (21 qwords = 168 bytes per output)
+ push r12 ; Inputs: rcx = source (672 bytes read); rdx, r8, r9 and the stack argument = the four outputs
+ push r13
+ mov rax, QWORD PTR [rsp+56] ; fourth output pointer: stack argument, located past the two pushes, the return address and 32 further bytes
+ sub rsp, 160 ; room for xmm6..xmm15 (16 bytes each)
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; first pass: source bytes 0..383 (qwords 0..47)
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ vmovdqu ymm8, YMMWORD PTR [rcx+256]
+ vmovdqu ymm9, YMMWORD PTR [rcx+288]
+ vmovdqu ymm10, YMMWORD PTR [rcx+320]
+ vmovdqu ymm11, YMMWORD PTR [rcx+352]
+ vpunpcklqdq ymm12, ymm0, ymm1 ; 4x4 qword transpose of ymm0..ymm3: ymm12 = q0,q4 | q2,q6
+ vpunpckhqdq ymm13, ymm0, ymm1 ; ymm13 = q1,q5 | q3,q7
+ vpunpcklqdq ymm14, ymm2, ymm3 ; ymm14 = q8,q12 | q10,q14
+ vpunpckhqdq ymm15, ymm2, ymm3 ; ymm15 = q9,q13 | q11,q15
+ vperm2i128 ymm0, ymm12, ymm14, 32 ; ymm0 = q0,q4,q8,q12 (low halves)
+ vperm2i128 ymm1, ymm13, ymm15, 32 ; ymm1 = q1,q5,q9,q13
+ vperm2i128 ymm2, ymm12, ymm14, 49 ; ymm2 = q2,q6,q10,q14 (high halves)
+ vperm2i128 ymm3, ymm13, ymm15, 49 ; ymm3 = q3,q7,q11,q15
+ vpunpcklqdq ymm12, ymm4, ymm5 ; same transpose for ymm4..ymm7
+ vpunpckhqdq ymm13, ymm4, ymm5
+ vpunpcklqdq ymm14, ymm6, ymm7
+ vpunpckhqdq ymm15, ymm6, ymm7
+ vperm2i128 ymm4, ymm12, ymm14, 32
+ vperm2i128 ymm5, ymm13, ymm15, 32
+ vperm2i128 ymm6, ymm12, ymm14, 49
+ vperm2i128 ymm7, ymm13, ymm15, 49
+ vpunpcklqdq ymm12, ymm8, ymm9 ; same transpose for ymm8..ymm11
+ vpunpckhqdq ymm13, ymm8, ymm9
+ vpunpcklqdq ymm14, ymm10, ymm11
+ vpunpckhqdq ymm15, ymm10, ymm11
+ vperm2i128 ymm8, ymm12, ymm14, 32
+ vperm2i128 ymm9, ymm13, ymm15, 32
+ vperm2i128 ymm10, ymm12, ymm14, 49
+ vperm2i128 ymm11, ymm13, ymm15, 49
+ vmovdqu YMMWORD PTR [rdx], ymm0 ; output 0 (rdx): bytes 0..95
+ vmovdqu YMMWORD PTR [rdx+32], ymm4
+ vmovdqu YMMWORD PTR [rdx+64], ymm8
+ vmovdqu YMMWORD PTR [r8], ymm1 ; output 1 (r8): bytes 0..95
+ vmovdqu YMMWORD PTR [r8+32], ymm5
+ vmovdqu YMMWORD PTR [r8+64], ymm9
+ vmovdqu YMMWORD PTR [r9], ymm2 ; output 2 (r9): bytes 0..95
+ vmovdqu YMMWORD PTR [r9+32], ymm6
+ vmovdqu YMMWORD PTR [r9+64], ymm10
+ vmovdqu YMMWORD PTR [rax], ymm3 ; output 3 (rax): bytes 0..95
+ vmovdqu YMMWORD PTR [rax+32], ymm7
+ vmovdqu YMMWORD PTR [rax+64], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+384] ; second pass: source bytes 384..639 (qwords 48..79)
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vmovdqu ymm4, YMMWORD PTR [rcx+512]
+ vmovdqu ymm5, YMMWORD PTR [rcx+544]
+ vmovdqu ymm6, YMMWORD PTR [rcx+576]
+ vmovdqu ymm7, YMMWORD PTR [rcx+608]
+ mov r10, QWORD PTR [rcx+640] ; last four source qwords (80..83), one per output, moved through general registers
+ mov r11, QWORD PTR [rcx+648]
+ mov r12, QWORD PTR [rcx+656]
+ mov r13, QWORD PTR [rcx+664]
+ vpunpcklqdq ymm12, ymm0, ymm1
+ vpunpckhqdq ymm13, ymm0, ymm1
+ vpunpcklqdq ymm14, ymm2, ymm3
+ vpunpckhqdq ymm15, ymm2, ymm3
+ vperm2i128 ymm0, ymm12, ymm14, 32
+ vperm2i128 ymm1, ymm13, ymm15, 32
+ vperm2i128 ymm2, ymm12, ymm14, 49
+ vperm2i128 ymm3, ymm13, ymm15, 49
+ vpunpcklqdq ymm12, ymm4, ymm5
+ vpunpckhqdq ymm13, ymm4, ymm5
+ vpunpcklqdq ymm14, ymm6, ymm7
+ vpunpckhqdq ymm15, ymm6, ymm7
+ vperm2i128 ymm4, ymm12, ymm14, 32
+ vperm2i128 ymm5, ymm13, ymm15, 32
+ vperm2i128 ymm6, ymm12, ymm14, 49
+ vperm2i128 ymm7, ymm13, ymm15, 49
+ vmovdqu YMMWORD PTR [rdx+96], ymm0 ; output 0: bytes 96..167
+ vmovdqu YMMWORD PTR [rdx+128], ymm4
+ mov QWORD PTR [rdx+160], r10
+ vmovdqu YMMWORD PTR [r8+96], ymm1 ; output 1: bytes 96..167
+ vmovdqu YMMWORD PTR [r8+128], ymm5
+ mov QWORD PTR [r8+160], r11
+ vmovdqu YMMWORD PTR [r9+96], ymm2 ; output 2: bytes 96..167
+ vmovdqu YMMWORD PTR [r9+128], ymm6
+ mov QWORD PTR [r9+160], r12
+ vmovdqu YMMWORD PTR [rax+96], ymm3 ; output 3: bytes 96..167
+ vmovdqu YMMWORD PTR [rax+128], ymm7
+ mov QWORD PTR [rax+160], r13
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore saved xmm6..xmm15
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ pop r13
+ pop r12
+ ret
+wc_mldsa_redistribute_21_rand_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_redistribute_17_rand_avx2 PROC ; De-interleaves 68 source qwords 4 ways: source qword i goes to output (i mod 4), slot i / 4 (17 qwords = 136 bytes per output)
+ push r12 ; Inputs: rcx = source (544 bytes read); rdx, r8, r9 and the stack argument = the four outputs
+ push r13
+ mov rax, QWORD PTR [rsp+56] ; fourth output pointer: stack argument, located past the two pushes, the return address and 32 further bytes
+ sub rsp, 96 ; room for xmm6..xmm11 (16 bytes each)
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; first pass: source bytes 0..255 (qwords 0..31)
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ vpunpcklqdq ymm8, ymm0, ymm1 ; 4x4 qword transpose of ymm0..ymm3: ymm8 = q0,q4 | q2,q6
+ vpunpckhqdq ymm9, ymm0, ymm1 ; ymm9 = q1,q5 | q3,q7
+ vpunpcklqdq ymm10, ymm2, ymm3 ; ymm10 = q8,q12 | q10,q14
+ vpunpckhqdq ymm11, ymm2, ymm3 ; ymm11 = q9,q13 | q11,q15
+ vperm2i128 ymm0, ymm8, ymm10, 32 ; ymm0 = q0,q4,q8,q12 (low halves)
+ vperm2i128 ymm1, ymm9, ymm11, 32 ; ymm1 = q1,q5,q9,q13
+ vperm2i128 ymm2, ymm8, ymm10, 49 ; ymm2 = q2,q6,q10,q14 (high halves)
+ vperm2i128 ymm3, ymm9, ymm11, 49 ; ymm3 = q3,q7,q11,q15
+ vpunpcklqdq ymm8, ymm4, ymm5 ; same transpose for ymm4..ymm7
+ vpunpckhqdq ymm9, ymm4, ymm5
+ vpunpcklqdq ymm10, ymm6, ymm7
+ vpunpckhqdq ymm11, ymm6, ymm7
+ vperm2i128 ymm4, ymm8, ymm10, 32
+ vperm2i128 ymm5, ymm9, ymm11, 32
+ vperm2i128 ymm6, ymm8, ymm10, 49
+ vperm2i128 ymm7, ymm9, ymm11, 49
+ vmovdqu YMMWORD PTR [rdx], ymm0 ; output 0 (rdx): bytes 0..63
+ vmovdqu YMMWORD PTR [rdx+32], ymm4
+ vmovdqu YMMWORD PTR [r8], ymm1 ; output 1 (r8): bytes 0..63
+ vmovdqu YMMWORD PTR [r8+32], ymm5
+ vmovdqu YMMWORD PTR [r9], ymm2 ; output 2 (r9): bytes 0..63
+ vmovdqu YMMWORD PTR [r9+32], ymm6
+ vmovdqu YMMWORD PTR [rax], ymm3 ; output 3 (rax): bytes 0..63
+ vmovdqu YMMWORD PTR [rax+32], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+256] ; second pass: source bytes 256..511 (qwords 32..63)
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ mov r10, QWORD PTR [rcx+512] ; last four source qwords (64..67), one per output, moved through general registers
+ mov r11, QWORD PTR [rcx+520]
+ mov r12, QWORD PTR [rcx+528]
+ mov r13, QWORD PTR [rcx+536]
+ vpunpcklqdq ymm8, ymm0, ymm1
+ vpunpckhqdq ymm9, ymm0, ymm1
+ vpunpcklqdq ymm10, ymm2, ymm3
+ vpunpckhqdq ymm11, ymm2, ymm3
+ vperm2i128 ymm0, ymm8, ymm10, 32
+ vperm2i128 ymm1, ymm9, ymm11, 32
+ vperm2i128 ymm2, ymm8, ymm10, 49
+ vperm2i128 ymm3, ymm9, ymm11, 49
+ vpunpcklqdq ymm8, ymm4, ymm5
+ vpunpckhqdq ymm9, ymm4, ymm5
+ vpunpcklqdq ymm10, ymm6, ymm7
+ vpunpckhqdq ymm11, ymm6, ymm7
+ vperm2i128 ymm4, ymm8, ymm10, 32
+ vperm2i128 ymm5, ymm9, ymm11, 32
+ vperm2i128 ymm6, ymm8, ymm10, 49
+ vperm2i128 ymm7, ymm9, ymm11, 49
+ vmovdqu YMMWORD PTR [rdx+64], ymm0 ; output 0: bytes 64..135
+ vmovdqu YMMWORD PTR [rdx+96], ymm4
+ mov QWORD PTR [rdx+128], r10
+ vmovdqu YMMWORD PTR [r8+64], ymm1 ; output 1: bytes 64..135
+ vmovdqu YMMWORD PTR [r8+96], ymm5
+ mov QWORD PTR [r8+128], r11
+ vmovdqu YMMWORD PTR [r9+64], ymm2 ; output 2: bytes 64..135
+ vmovdqu YMMWORD PTR [r9+96], ymm6
+ mov QWORD PTR [r9+128], r12
+ vmovdqu YMMWORD PTR [rax+64], ymm3 ; output 3: bytes 64..135
+ vmovdqu YMMWORD PTR [rax+96], ymm7
+ mov QWORD PTR [rax+128], r13
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore saved xmm6..xmm11
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96
+ pop r13
+ pop r12
+ ret
+wc_mldsa_redistribute_17_rand_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_eta_2_avx2_two DWORD 00000002h, 00000002h, 00000002h, 00000002h ; eight dwords of 2; loaded into ymm6 by wc_mldsa_vec_encode_eta_2_avx2 and used as the minuend of vpsubd (2 - x)
+ DWORD 00000002h, 00000002h, 00000002h, 00000002h
+ptr_L_mldsa_encode_eta_2_avx2_two QWORD L_mldsa_encode_eta_2_avx2_two ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_eta_2_avx2_vs_3 DWORD 00000000h, 00000003h, 00000006h, 00000009h ; per-dword left-shift counts (0,3,6,9 | 4,7,10,13); loaded into ymm7 and applied with vpsllvd
+ DWORD 00000004h, 00000007h, 0000000ah, 0000000dh
+ptr_L_mldsa_encode_eta_2_avx2_vs_3 QWORD L_mldsa_encode_eta_2_avx2_vs_3 ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_eta_2_avx2_shuff_3_even BYTE 00h, 0ffh, 04h, 05h, 08h, 0ffh, 0ch, 0dh ; vpshufb control loaded into ymm8; an index of 0ffh yields a zero byte
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 00h, 0ffh, 04h, 05h, 08h, 0ffh, 0ch, 0dh ; second 16 bytes: same pattern for the upper 128-bit lane
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_eta_2_avx2_shuff_3_even QWORD L_mldsa_encode_eta_2_avx2_shuff_3_even ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_eta_2_avx2_shuff_3_odd BYTE 02h, 0ffh, 0ffh, 07h, 0ah, 0bh, 0ffh, 0fh ; vpshufb control loaded into ymm9; its result is ORed with the shuff_3_even result
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 02h, 0ffh, 0ffh, 07h, 0ah, 0bh, 0ffh, 0fh ; second 16 bytes: same pattern for the upper 128-bit lane
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_eta_2_avx2_shuff_3_odd QWORD L_mldsa_encode_eta_2_avx2_shuff_3_odd ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_eta_2_avx2_shuff_6_even BYTE 00h, 04h, 05h, 08h, 0ch, 0dh, 0ffh, 0ffh ; vpshufb control loaded into ymm10; lower lane fills bytes 0..5
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 00h, 04h ; upper lane fills bytes 6..11, so ORing the two lanes gives 12 contiguous bytes
+ BYTE 05h, 08h, 0ch, 0dh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_eta_2_avx2_shuff_6_even QWORD L_mldsa_encode_eta_2_avx2_shuff_6_even ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_eta_2_avx2_shuff_6_odd BYTE 02h, 03h, 07h, 0ah, 0bh, 0fh, 0ffh, 0ffh ; vpshufb control loaded into ymm11; its result is ORed with the shuff_6_even result
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 02h, 03h ; upper lane fills bytes 6..11, mirroring shuff_6_even
+ BYTE 07h, 0ah, 0bh, 0fh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_eta_2_avx2_shuff_6_odd QWORD L_mldsa_encode_eta_2_avx2_shuff_6_odd ; holds the address of the table above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_vec_encode_eta_2_avx2 PROC
+ sub rsp, 96
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu ymm6, YMMWORD PTR L_mldsa_encode_eta_2_avx2_two
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_encode_eta_2_avx2_vs_3
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_eta_2_avx2_shuff_3_even
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_encode_eta_2_avx2_shuff_3_odd
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_encode_eta_2_avx2_shuff_6_even
+ vmovdqu ymm11, YMMWORD PTR L_mldsa_encode_eta_2_avx2_shuff_6_odd
+L_mldsa_encode_eta_2_avx2_loop:
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpsubd ymm0, ymm6, ymm0
+ vpsubd ymm1, ymm6, ymm1
+ vpsubd ymm2, ymm6, ymm2
+ vpsubd ymm3, ymm6, ymm3
+ vpsllvd ymm0, ymm0, ymm7
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpackusdw ymm0, ymm0, ymm2
+ vpackusdw ymm1, ymm1, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpshufb ymm4, ymm0, ymm9
+ vpshufb ymm5, ymm1, ymm9
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpermq ymm1, ymm1, 177
+ vpor ymm0, ymm0, ymm1
+ vpshufb ymm4, ymm0, ymm11
+ vpshufb ymm0, ymm0, ymm10
+ vpor ymm0, ymm0, ymm4
+ vextracti128 xmm4, ymm0, 1
+ vpor ymm0, ymm0, ymm4
+ vmovdqu OWORD PTR [r8], xmm0
+ add r8, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+128]
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpsubd ymm0, ymm6, ymm0
+ vpsubd ymm1, ymm6, ymm1
+ vpsubd ymm2, ymm6, ymm2
+ vpsubd ymm3, ymm6, ymm3
+ vpsllvd ymm0, ymm0, ymm7
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpackusdw ymm0, ymm0, ymm2
+ vpackusdw ymm1, ymm1, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpshufb ymm4, ymm0, ymm9
+ vpshufb ymm5, ymm1, ymm9
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpermq ymm1, ymm1, 177
+ vpor ymm0, ymm0, ymm1
+ vpshufb ymm4, ymm0, ymm11
+ vpshufb ymm0, ymm0, ymm10
+ vpor ymm0, ymm0, ymm4
+ vextracti128 xmm4, ymm0, 1
+ vpor ymm0, ymm0, ymm4
+ vmovdqu OWORD PTR [r8], xmm0
+ add r8, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpsubd ymm0, ymm6, ymm0
+ vpsubd ymm1, ymm6, ymm1
+ vpsubd ymm2, ymm6, ymm2
+ vpsubd ymm3, ymm6, ymm3
+ vpsllvd ymm0, ymm0, ymm7
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpackusdw ymm0, ymm0, ymm2
+ vpackusdw ymm1, ymm1, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpshufb ymm4, ymm0, ymm9
+ vpshufb ymm5, ymm1, ymm9
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpermq ymm1, ymm1, 177
+ vpor ymm0, ymm0, ymm1
+ vpshufb ymm4, ymm0, ymm11
+ vpshufb ymm0, ymm0, ymm10
+ vpor ymm0, ymm0, ymm4
+ vextracti128 xmm4, ymm0, 1
+ vpor ymm0, ymm0, ymm4
+ vmovdqu OWORD PTR [r8], xmm0
+ add r8, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+384]
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpsubd ymm0, ymm6, ymm0
+ vpsubd ymm1, ymm6, ymm1
+ vpsubd ymm2, ymm6, ymm2
+ vpsubd ymm3, ymm6, ymm3
+ vpsllvd ymm0, ymm0, ymm7
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpackusdw ymm0, ymm0, ymm2
+ vpackusdw ymm1, ymm1, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpshufb ymm4, ymm0, ymm9
+ vpshufb ymm5, ymm1, ymm9
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpermq ymm1, ymm1, 177
+ vpor ymm0, ymm0, ymm1
+ vpshufb ymm4, ymm0, ymm11
+ vpshufb ymm0, ymm0, ymm10
+ vpor ymm0, ymm0, ymm4
+ vextracti128 xmm4, ymm0, 1
+ vpor ymm0, ymm0, ymm4
+ vmovdqu OWORD PTR [r8], xmm0
+ add r8, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vpsubd ymm0, ymm6, ymm0
+ vpsubd ymm1, ymm6, ymm1
+ vpsubd ymm2, ymm6, ymm2
+ vpsubd ymm3, ymm6, ymm3
+ vpsllvd ymm0, ymm0, ymm7
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpackusdw ymm0, ymm0, ymm2
+ vpackusdw ymm1, ymm1, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpshufb ymm4, ymm0, ymm9
+ vpshufb ymm5, ymm1, ymm9
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpermq ymm1, ymm1, 177
+ vpor ymm0, ymm0, ymm1
+ vpshufb ymm4, ymm0, ymm11
+ vpshufb ymm0, ymm0, ymm10
+ vpor ymm0, ymm0, ymm4
+ vextracti128 xmm4, ymm0, 1
+ vpor ymm0, ymm0, ymm4
+ vmovdqu OWORD PTR [r8], xmm0
+ add r8, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+640]
+ vmovdqu ymm1, YMMWORD PTR [rcx+672]
+ vmovdqu ymm2, YMMWORD PTR [rcx+704]
+ vmovdqu ymm3, YMMWORD PTR [rcx+736]
+ vpsubd ymm0, ymm6, ymm0
+ vpsubd ymm1, ymm6, ymm1
+ vpsubd ymm2, ymm6, ymm2
+ vpsubd ymm3, ymm6, ymm3
+ vpsllvd ymm0, ymm0, ymm7
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpackusdw ymm0, ymm0, ymm2
+ vpackusdw ymm1, ymm1, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpshufb ymm4, ymm0, ymm9
+ vpshufb ymm5, ymm1, ymm9
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpermq ymm1, ymm1, 177
+ vpor ymm0, ymm0, ymm1
+ vpshufb ymm4, ymm0, ymm11
+ vpshufb ymm0, ymm0, ymm10
+ vpor ymm0, ymm0, ymm4
+ vextracti128 xmm4, ymm0, 1
+ vpor ymm0, ymm0, ymm4
+ vmovdqu OWORD PTR [r8], xmm0
+ add r8, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vpsubd ymm0, ymm6, ymm0
+ vpsubd ymm1, ymm6, ymm1
+ vpsubd ymm2, ymm6, ymm2
+ vpsubd ymm3, ymm6, ymm3
+ vpsllvd ymm0, ymm0, ymm7
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpackusdw ymm0, ymm0, ymm2
+ vpackusdw ymm1, ymm1, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpshufb ymm4, ymm0, ymm9
+ vpshufb ymm5, ymm1, ymm9
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpermq ymm1, ymm1, 177
+ vpor ymm0, ymm0, ymm1
+ vpshufb ymm4, ymm0, ymm11
+ vpshufb ymm0, ymm0, ymm10
+ vpor ymm0, ymm0, ymm4
+ vextracti128 xmm4, ymm0, 1
+ vpor ymm0, ymm0, ymm4
+ vmovdqu OWORD PTR [r8], xmm0
+ add r8, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+896]
+ vmovdqu ymm1, YMMWORD PTR [rcx+928]
+ vmovdqu ymm2, YMMWORD PTR [rcx+960]
+ vmovdqu ymm3, YMMWORD PTR [rcx+992]
+ vpsubd ymm0, ymm6, ymm0
+ vpsubd ymm1, ymm6, ymm1
+ vpsubd ymm2, ymm6, ymm2
+ vpsubd ymm3, ymm6, ymm3
+ vpsllvd ymm0, ymm0, ymm7
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpackusdw ymm0, ymm0, ymm2
+ vpackusdw ymm1, ymm1, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpshufb ymm4, ymm0, ymm9
+ vpshufb ymm5, ymm1, ymm9
+ vpshufb ymm0, ymm0, ymm8
+ vpshufb ymm1, ymm1, ymm8
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpermq ymm1, ymm1, 177
+ vpor ymm0, ymm0, ymm1
+ vpshufb ymm4, ymm0, ymm11
+ vpshufb ymm0, ymm0, ymm10
+ vpor ymm0, ymm0, ymm4
+ vextracti128 xmm4, ymm0, 1
+ vpor ymm0, ymm0, ymm4
+ vmovdqu OWORD PTR [r8], xmm0
+ add r8, 12
+ add rcx, 1024
+ dec dl
+ jnz L_mldsa_encode_eta_2_avx2_loop
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96
+ ret
+wc_mldsa_vec_encode_eta_2_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+; Eight dwords, each equal to 4 (32 bytes, one YMM register's worth).
+; wc_mldsa_vec_encode_eta_4_avx2 loads this into ymm8 and computes
+; (4 - coefficient) for every input dword with vpsubd.
+; Only 16-byte alignment is requested; the routine reads the table with
+; vmovdqu, which does not require 32-byte alignment.
+L_mldsa_encode_eta_4_avx2_four DWORD 00000004h, 00000004h, 00000004h, 00000004h
+ DWORD 00000004h, 00000004h, 00000004h, 00000004h
+; Holds the address of the table above.
+ptr_L_mldsa_encode_eta_4_avx2_four QWORD L_mldsa_encode_eta_4_avx2_four
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+; Per-dword shift counts 0,4,0,4,... (32 bytes, one YMM register's worth).
+; wc_mldsa_vec_encode_eta_4_avx2 loads this into ymm9 and uses it as the
+; count operand of vpsllvd: even-indexed dwords are left unchanged and
+; odd-indexed dwords are shifted left by 4 bits.
+; Read with vmovdqu, so the 16-byte alignment is sufficient.
+L_mldsa_encode_eta_4_avx2_vs_4 DWORD 00000000h, 00000004h, 00000000h, 00000004h
+ DWORD 00000000h, 00000004h, 00000000h, 00000004h
+; Holds the address of the table above.
+ptr_L_mldsa_encode_eta_4_avx2_vs_4 QWORD L_mldsa_encode_eta_4_avx2_vs_4
+_DATA ENDS
+; MLDSA_ENCODE_ETA_4_AVX2_BLOCK src_off, dst_off
+;
+; Encodes one group of 64 dword coefficients into 32 bytes (4 bits each).
+;   src_off - byte offset from rcx of the first of eight 32-byte loads
+;   dst_off - byte offset from rdx of the 32-byte result
+; Expects ymm8 = eight dwords of 4 and ymm9 = dword shift counts 0,4,0,4,...
+; Clobbers ymm0-ymm7.
+; NOTE(review): the packing below only yields one nibble per coefficient if
+; every input dword is in [-4, 4] - confirm against the callers.
+MLDSA_ENCODE_ETA_4_AVX2_BLOCK MACRO src_off, dst_off
+ ; Load 64 coefficients (8 registers x 8 dwords).
+ vmovdqu ymm0, YMMWORD PTR [rcx+src_off]
+ vmovdqu ymm1, YMMWORD PTR [rcx+src_off+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+src_off+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+src_off+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+src_off+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+src_off+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+src_off+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+src_off+224]
+ ; v = 4 - coefficient.
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsubd ymm4, ymm8, ymm4
+ vpsubd ymm5, ymm8, ymm5
+ vpsubd ymm6, ymm8, ymm6
+ vpsubd ymm7, ymm8, ymm7
+ ; Move every odd-indexed value into the high-nibble position (<< 4).
+ vpsllvd ymm0, ymm0, ymm9
+ vpsllvd ymm1, ymm1, ymm9
+ vpsllvd ymm2, ymm2, ymm9
+ vpsllvd ymm3, ymm3, ymm9
+ vpsllvd ymm4, ymm4, ymm9
+ vpsllvd ymm5, ymm5, ymm9
+ vpsllvd ymm6, ymm6, ymm9
+ vpsllvd ymm7, ymm7, ymm9
+ ; Narrow dwords to words. vpackusdw interleaves per 128-bit lane, so
+ ; vpermq 216 (qword order 0,2,1,3) restores coefficient order.
+ vpackusdw ymm0, ymm0, ymm1
+ vpackusdw ymm1, ymm2, ymm3
+ vpackusdw ymm2, ymm4, ymm5
+ vpackusdw ymm3, ymm6, ymm7
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpermq ymm2, ymm2, 216
+ vpermq ymm3, ymm3, 216
+ ; Add each adjacent word pair: even value + (odd value << 4).
+ vphaddw ymm0, ymm0, ymm1
+ vphaddw ymm1, ymm2, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ ; Narrow words to bytes, fix lane order, store 32 output bytes.
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 216
+ vmovdqu YMMWORD PTR [rdx+dst_off], ymm0
+ENDM
+_TEXT SEGMENT READONLY PARA
+; wc_mldsa_vec_encode_eta_4_avx2
+;
+; Packs 32-bit coefficients into 4-bit fields as (4 - coefficient), with the
+; even-indexed coefficient in the low nibble of each output byte.
+;   rcx - source: 6144 bytes (1536 dwords) are read
+;   rdx - destination: 768 bytes are written
+; Processing is fully unrolled as 24 groups of 64 coefficients.
+; xmm6-xmm9 are callee-saved in the Microsoft x64 calling convention, so they
+; are spilled to the stack on entry and restored before returning.
+wc_mldsa_vec_encode_eta_4_avx2 PROC
+ ; Spill area for xmm6-xmm9; vmovdqu needs no particular stack alignment.
+ sub rsp, 64
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ ; Loop-invariant constants: ymm8 = 4 per dword, ymm9 = shift counts 0,4,...
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_eta_4_avx2_four
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_encode_eta_4_avx2_vs_4
+ ; Each group consumes 256 source bytes and produces 32 output bytes.
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 0, 0
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 256, 32
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 512, 64
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 768, 96
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 1024, 128
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 1280, 160
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 1536, 192
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 1792, 224
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 2048, 256
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 2304, 288
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 2560, 320
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 2816, 352
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 3072, 384
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 3328, 416
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 3584, 448
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 3840, 480
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 4096, 512
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 4352, 544
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 4608, 576
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 4864, 608
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 5120, 640
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 5376, 672
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 5632, 704
+ MLDSA_ENCODE_ETA_4_AVX2_BLOCK 5888, 736
+ ; Clear upper YMM halves before returning to code that may use SSE.
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ add rsp, 64
+ ret
+wc_mldsa_vec_encode_eta_4_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_0 BYTE 00h, 0ffh, 0ffh, 0ffh, 00h, 0ffh, 0ffh, 0ffh ; vpshufb control: each dword receives lane bytes 0..2; 0ffh entries produce zero
+ BYTE 00h, 01h, 0ffh, 0ffh, 01h, 0ffh, 0ffh, 0ffh
+ BYTE 01h, 0ffh, 0ffh, 0ffh, 01h, 02h, 0ffh, 0ffh
+ BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_0 QWORD L_mldsa_decode_eta_2_avx2_shuff_0 ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_1 BYTE 01h, 0ffh, 0ffh, 0ffh, 01h, 0ffh, 0ffh, 0ffh ; same layout as shuff_0, selecting lane bytes 1..3
+ BYTE 01h, 02h, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh
+ BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 03h, 0ffh, 0ffh
+ BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_1 QWORD L_mldsa_decode_eta_2_avx2_shuff_1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_2 BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh ; selects lane bytes 2..4
+ BYTE 02h, 03h, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh
+ BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 04h, 0ffh, 0ffh
+ BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_2 QWORD L_mldsa_decode_eta_2_avx2_shuff_2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_3 BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh ; selects lane bytes 3..5
+ BYTE 03h, 04h, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh
+ BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 05h, 0ffh, 0ffh
+ BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_3 QWORD L_mldsa_decode_eta_2_avx2_shuff_3
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_4 BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh ; selects lane bytes 4..6
+ BYTE 04h, 05h, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh
+ BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 06h, 0ffh, 0ffh
+ BYTE 06h, 0ffh, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_4 QWORD L_mldsa_decode_eta_2_avx2_shuff_4
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_5 BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh ; selects lane bytes 5..7
+ BYTE 05h, 06h, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh
+ BYTE 06h, 0ffh, 0ffh, 0ffh, 06h, 07h, 0ffh, 0ffh
+ BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_5 QWORD L_mldsa_decode_eta_2_avx2_shuff_5
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_6 BYTE 06h, 0ffh, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh ; selects lane bytes 6..8
+ BYTE 06h, 07h, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh
+ BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 08h, 0ffh, 0ffh
+ BYTE 08h, 0ffh, 0ffh, 0ffh, 08h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_6 QWORD L_mldsa_decode_eta_2_avx2_shuff_6
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_7 BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh ; selects lane bytes 7..9
+ BYTE 07h, 08h, 0ffh, 0ffh, 08h, 0ffh, 0ffh, 0ffh
+ BYTE 08h, 0ffh, 0ffh, 0ffh, 08h, 09h, 0ffh, 0ffh
+ BYTE 09h, 0ffh, 0ffh, 0ffh, 09h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_7 QWORD L_mldsa_decode_eta_2_avx2_shuff_7
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_two DWORD 00000002h, 00000002h, 00000002h, 00000002h ; eight dwords of 2: minuend of the final vpsubd (2 - field)
+ DWORD 00000002h, 00000002h, 00000002h, 00000002h
+ptr_L_mldsa_decode_eta_2_avx2_two QWORD L_mldsa_decode_eta_2_avx2_two
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_vs DWORD 00000000h, 00000003h, 00000006h, 00000001h ; per-dword vpsrlvd counts that bring each 3-bit field down to bit 0
+ DWORD 00000004h, 00000007h, 00000002h, 00000005h
+ptr_L_mldsa_decode_eta_2_avx2_vs QWORD L_mldsa_decode_eta_2_avx2_vs
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_mask DWORD 00000007h, 00000007h, 00000007h, 00000007h ; keeps the low 3 bits of every dword
+ DWORD 00000007h, 00000007h, 00000007h, 00000007h
+ptr_L_mldsa_decode_eta_2_avx2_mask QWORD L_mldsa_decode_eta_2_avx2_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_decode_eta_2_avx2 PROC ; reads 96 bytes at [rcx] as 256 packed 3-bit fields; writes 256 dwords at [rdx], each 2 - field
+ sub rsp, 144 ; 9 x 16-byte slots for xmm6-xmm14
+ vmovdqu OWORD PTR [rsp], xmm6 ; save xmm6-xmm14 (reloaded before ret); presumably because they are callee-saved in the Microsoft x64 ABI
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vpxor ymm4, ymm4, ymm4 ; value is overwritten by the load on the next line
+ vmovdqu ymm4, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_0 ; ymm4..ymm11 = shuffle controls shuff_0..shuff_7
+ vmovdqu ymm5, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_1
+ vmovdqu ymm6, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_2
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_3
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_4
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_5
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_6
+ vmovdqu ymm11, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_7
+ vmovdqu ymm12, YMMWORD PTR L_mldsa_decode_eta_2_avx2_two ; ymm12 = 2 in every dword
+ vmovdqu ymm13, YMMWORD PTR L_mldsa_decode_eta_2_avx2_vs ; ymm13 = per-dword right-shift counts
+ vmovdqu ymm14, YMMWORD PTR L_mldsa_decode_eta_2_avx2_mask ; ymm14 = 7 in every dword
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; input bytes 0-31
+ vmovdqu ymm1, YMMWORD PTR [rcx+32] ; input bytes 32-63
+ vmovdqu ymm2, YMMWORD PTR [rcx+64] ; input bytes 64-95
+ vpermq ymm3, ymm0, 68 ; copy the low 128 bits into both lanes (vpshufb only selects within a lane)
+ vpshufb ymm3, ymm3, ymm4 ; put the byte(s) holding each of 8 fields into that field's dword
+ vpsrlvd ymm3, ymm3, ymm13 ; shift each field down to bit 0
+ vpand ymm3, ymm3, ymm14 ; isolate the 3-bit field
+ vpsubd ymm3, ymm12, ymm3 ; 2 - field
+ vmovdqu YMMWORD PTR [rdx], ymm3 ; output dwords 0-7 (input bytes 0-2)
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm7
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+32], ymm3 ; output dwords 8-15
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm10
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+64], ymm3 ; output dwords 16-23
+ vpermq ymm0, ymm0, 57 ; rotate qwords down by one: low lane now starts at input byte 8
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+96], ymm3 ; output dwords 24-31
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm8
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+128], ymm3 ; output dwords 32-39
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm11
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+160], ymm3 ; output dwords 40-47
+ vpermq ymm0, ymm0, 57 ; low lane now starts at input byte 16
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm6
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+192], ymm3 ; output dwords 48-55
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm9
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+224], ymm3 ; output dwords 56-63
+ vpermq ymm0, ymm0, 57 ; low lane now starts at input byte 24
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm4
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+256], ymm3 ; output dwords 64-71
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm7
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+288], ymm3 ; output dwords 72-79
+ vperm2i128 ymm0, ymm0, ymm1, 32 ; high lane = low 128 bits of ymm1 (input bytes 32-47)
+ vpermq ymm0, ymm0, 56 ; low lane = input bytes 24-39, so the field straddling byte 31/32 is reachable
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm10
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+320], ymm3 ; output dwords 80-87
+ vpermq ymm3, ymm1, 68 ; continue from the second 32-byte load
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+352], ymm3 ; output dwords 88-95
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm8
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+384], ymm3 ; output dwords 96-103
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm11
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+416], ymm3 ; output dwords 104-111
+ vpermq ymm1, ymm1, 57 ; low lane now starts at input byte 40
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm6
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+448], ymm3 ; output dwords 112-119
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm9
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+480], ymm3 ; output dwords 120-127
+ vpermq ymm1, ymm1, 57 ; low lane now starts at input byte 48
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm4
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+512], ymm3 ; output dwords 128-135
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm7
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+544], ymm3 ; output dwords 136-143
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm10
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+576], ymm3 ; output dwords 144-151
+ vpermq ymm1, ymm1, 57 ; low lane now starts at input byte 56
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+608], ymm3 ; output dwords 152-159
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm8
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+640], ymm3 ; output dwords 160-167
+ vperm2i128 ymm1, ymm1, ymm2, 32 ; high lane = low 128 bits of ymm2 (input bytes 64-79)
+ vpermq ymm1, ymm1, 56 ; low lane = input bytes 56-71, covering the field that crosses byte 63/64
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm11
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+672], ymm3 ; output dwords 168-175
+ vpermq ymm3, ymm2, 68 ; continue from the third 32-byte load
+ vpshufb ymm3, ymm3, ymm6
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+704], ymm3 ; output dwords 176-183
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm9
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+736], ymm3 ; output dwords 184-191
+ vpermq ymm2, ymm2, 57 ; low lane now starts at input byte 72
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm4
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+768], ymm3 ; output dwords 192-199
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm7
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+800], ymm3 ; output dwords 200-207
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm10
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+832], ymm3 ; output dwords 208-215
+ vpermq ymm2, ymm2, 57 ; low lane now starts at input byte 80
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+864], ymm3 ; output dwords 216-223
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm8
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+896], ymm3 ; output dwords 224-231
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm11
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+928], ymm3 ; output dwords 232-239
+ vpermq ymm2, ymm2, 57 ; low lane now starts at input byte 88
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm6
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+960], ymm3 ; output dwords 240-247
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm9
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+992], ymm3 ; output dwords 248-255 (input bytes 93-95)
+ vzeroupper ; clear upper ymm halves before returning to the caller
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore xmm6-xmm14 from the save area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ add rsp, 144 ; release the save area
+ ret
+wc_mldsa_decode_eta_2_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_0 BYTE 00h, 0ffh, 0ffh, 0ffh, 00h, 0ffh, 0ffh, 0ffh ; NOTE(review): wc_mldsa_decode_eta_4_avx2 does not load any of shuff_0..shuff_7 - confirm whether these tables are needed
+ BYTE 00h, 01h, 0ffh, 0ffh, 01h, 0ffh, 0ffh, 0ffh
+ BYTE 01h, 0ffh, 0ffh, 0ffh, 01h, 02h, 0ffh, 0ffh
+ BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_0 QWORD L_mldsa_decode_eta_4_avx2_shuff_0 ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_1 BYTE 01h, 0ffh, 0ffh, 0ffh, 01h, 0ffh, 0ffh, 0ffh ; byte-shuffle pattern over lane bytes 1..3
+ BYTE 01h, 02h, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh
+ BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 03h, 0ffh, 0ffh
+ BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_1 QWORD L_mldsa_decode_eta_4_avx2_shuff_1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_2 BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh ; byte-shuffle pattern over lane bytes 2..4
+ BYTE 02h, 03h, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh
+ BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 04h, 0ffh, 0ffh
+ BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_2 QWORD L_mldsa_decode_eta_4_avx2_shuff_2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_3 BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh ; byte-shuffle pattern over lane bytes 3..5
+ BYTE 03h, 04h, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh
+ BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 05h, 0ffh, 0ffh
+ BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_3 QWORD L_mldsa_decode_eta_4_avx2_shuff_3
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_4 BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh ; byte-shuffle pattern over lane bytes 4..6
+ BYTE 04h, 05h, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh
+ BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 06h, 0ffh, 0ffh
+ BYTE 06h, 0ffh, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_4 QWORD L_mldsa_decode_eta_4_avx2_shuff_4
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_5 BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh ; byte-shuffle pattern over lane bytes 5..7
+ BYTE 05h, 06h, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh
+ BYTE 06h, 0ffh, 0ffh, 0ffh, 06h, 07h, 0ffh, 0ffh
+ BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_5 QWORD L_mldsa_decode_eta_4_avx2_shuff_5
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_6 BYTE 06h, 0ffh, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh ; byte-shuffle pattern over lane bytes 6..8
+ BYTE 06h, 07h, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh
+ BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 08h, 0ffh, 0ffh
+ BYTE 08h, 0ffh, 0ffh, 0ffh, 08h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_6 QWORD L_mldsa_decode_eta_4_avx2_shuff_6
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_7 BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh ; byte-shuffle pattern over lane bytes 7..9
+ BYTE 07h, 08h, 0ffh, 0ffh, 08h, 0ffh, 0ffh, 0ffh
+ BYTE 08h, 0ffh, 0ffh, 0ffh, 08h, 09h, 0ffh, 0ffh
+ BYTE 09h, 0ffh, 0ffh, 0ffh, 09h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_7 QWORD L_mldsa_decode_eta_4_avx2_shuff_7
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_four DWORD 00000004h, 00000004h, 00000004h, 00000004h ; eight dwords of 4: minuend of the final vpsubd (4 - nibble)
+ DWORD 00000004h, 00000004h, 00000004h, 00000004h
+ptr_L_mldsa_decode_eta_4_avx2_four QWORD L_mldsa_decode_eta_4_avx2_four
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_vs DWORD 00000000h, 00000004h, 00000008h, 0000000ch ; vpsrlvd counts 0,4,...,28: dword i receives nibble i of a broadcast dword
+ DWORD 00000010h, 00000014h, 00000018h, 0000001ch
+ptr_L_mldsa_decode_eta_4_avx2_vs QWORD L_mldsa_decode_eta_4_avx2_vs
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_mask DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh ; keeps the low 4 bits of every dword
+ DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh
+ptr_L_mldsa_decode_eta_4_avx2_mask QWORD L_mldsa_decode_eta_4_avx2_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_decode_eta_4_avx2 PROC ; reads 128 bytes at [rcx] as 256 packed 4-bit fields; writes 256 dwords at [rdx], each 4 - field
+ sub rsp, 80 ; 5 x 16-byte slots for xmm6-xmm10
+ vmovdqu OWORD PTR [rsp], xmm6 ; save xmm6-xmm10 (reloaded before ret); presumably because they are callee-saved in the Microsoft x64 ABI
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vpxor ymm8, ymm8, ymm8 ; value is overwritten by the load on the next line
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_eta_4_avx2_four ; ymm8 = 4 in every dword
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_eta_4_avx2_vs ; ymm9 = right-shift counts 0,4,...,28
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_eta_4_avx2_mask ; ymm10 = 0fh in every dword
+ vpbroadcastd ymm0, DWORD PTR [rcx] ; round 1: replicate each of input dwords 0-7 across a whole register
+ vpbroadcastd ymm1, DWORD PTR [rcx+4]
+ vpbroadcastd ymm2, DWORD PTR [rcx+8]
+ vpbroadcastd ymm3, DWORD PTR [rcx+12]
+ vpbroadcastd ymm4, DWORD PTR [rcx+16]
+ vpbroadcastd ymm5, DWORD PTR [rcx+20]
+ vpbroadcastd ymm6, DWORD PTR [rcx+24]
+ vpbroadcastd ymm7, DWORD PTR [rcx+28]
+ vpsrlvd ymm0, ymm0, ymm9 ; dword i of each register now has nibble i at bit 0
+ vpsrlvd ymm1, ymm1, ymm9
+ vpsrlvd ymm2, ymm2, ymm9
+ vpsrlvd ymm3, ymm3, ymm9
+ vpsrlvd ymm4, ymm4, ymm9
+ vpsrlvd ymm5, ymm5, ymm9
+ vpsrlvd ymm6, ymm6, ymm9
+ vpsrlvd ymm7, ymm7, ymm9
+ vpand ymm0, ymm0, ymm10 ; isolate the 4-bit field
+ vpand ymm1, ymm1, ymm10
+ vpand ymm2, ymm2, ymm10
+ vpand ymm3, ymm3, ymm10
+ vpand ymm4, ymm4, ymm10
+ vpand ymm5, ymm5, ymm10
+ vpand ymm6, ymm6, ymm10
+ vpand ymm7, ymm7, ymm10
+ vpsubd ymm0, ymm8, ymm0 ; 4 - field
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsubd ymm4, ymm8, ymm4
+ vpsubd ymm5, ymm8, ymm5
+ vpsubd ymm6, ymm8, ymm6
+ vpsubd ymm7, ymm8, ymm7
+ vmovdqu YMMWORD PTR [rdx], ymm0 ; output dwords 0-63
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ vmovdqu YMMWORD PTR [rdx+128], ymm4
+ vmovdqu YMMWORD PTR [rdx+160], ymm5
+ vmovdqu YMMWORD PTR [rdx+192], ymm6
+ vmovdqu YMMWORD PTR [rdx+224], ymm7
+ vpbroadcastd ymm0, DWORD PTR [rcx+32] ; round 2: input dwords 8-15, same steps as round 1
+ vpbroadcastd ymm1, DWORD PTR [rcx+36]
+ vpbroadcastd ymm2, DWORD PTR [rcx+40]
+ vpbroadcastd ymm3, DWORD PTR [rcx+44]
+ vpbroadcastd ymm4, DWORD PTR [rcx+48]
+ vpbroadcastd ymm5, DWORD PTR [rcx+52]
+ vpbroadcastd ymm6, DWORD PTR [rcx+56]
+ vpbroadcastd ymm7, DWORD PTR [rcx+60]
+ vpsrlvd ymm0, ymm0, ymm9
+ vpsrlvd ymm1, ymm1, ymm9
+ vpsrlvd ymm2, ymm2, ymm9
+ vpsrlvd ymm3, ymm3, ymm9
+ vpsrlvd ymm4, ymm4, ymm9
+ vpsrlvd ymm5, ymm5, ymm9
+ vpsrlvd ymm6, ymm6, ymm9
+ vpsrlvd ymm7, ymm7, ymm9
+ vpand ymm0, ymm0, ymm10
+ vpand ymm1, ymm1, ymm10
+ vpand ymm2, ymm2, ymm10
+ vpand ymm3, ymm3, ymm10
+ vpand ymm4, ymm4, ymm10
+ vpand ymm5, ymm5, ymm10
+ vpand ymm6, ymm6, ymm10
+ vpand ymm7, ymm7, ymm10
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsubd ymm4, ymm8, ymm4
+ vpsubd ymm5, ymm8, ymm5
+ vpsubd ymm6, ymm8, ymm6
+ vpsubd ymm7, ymm8, ymm7
+ vmovdqu YMMWORD PTR [rdx+256], ymm0 ; output dwords 64-127
+ vmovdqu YMMWORD PTR [rdx+288], ymm1
+ vmovdqu YMMWORD PTR [rdx+320], ymm2
+ vmovdqu YMMWORD PTR [rdx+352], ymm3
+ vmovdqu YMMWORD PTR [rdx+384], ymm4
+ vmovdqu YMMWORD PTR [rdx+416], ymm5
+ vmovdqu YMMWORD PTR [rdx+448], ymm6
+ vmovdqu YMMWORD PTR [rdx+480], ymm7
+ vpbroadcastd ymm0, DWORD PTR [rcx+64] ; round 3: input dwords 16-23
+ vpbroadcastd ymm1, DWORD PTR [rcx+68]
+ vpbroadcastd ymm2, DWORD PTR [rcx+72]
+ vpbroadcastd ymm3, DWORD PTR [rcx+76]
+ vpbroadcastd ymm4, DWORD PTR [rcx+80]
+ vpbroadcastd ymm5, DWORD PTR [rcx+84]
+ vpbroadcastd ymm6, DWORD PTR [rcx+88]
+ vpbroadcastd ymm7, DWORD PTR [rcx+92]
+ vpsrlvd ymm0, ymm0, ymm9
+ vpsrlvd ymm1, ymm1, ymm9
+ vpsrlvd ymm2, ymm2, ymm9
+ vpsrlvd ymm3, ymm3, ymm9
+ vpsrlvd ymm4, ymm4, ymm9
+ vpsrlvd ymm5, ymm5, ymm9
+ vpsrlvd ymm6, ymm6, ymm9
+ vpsrlvd ymm7, ymm7, ymm9
+ vpand ymm0, ymm0, ymm10
+ vpand ymm1, ymm1, ymm10
+ vpand ymm2, ymm2, ymm10
+ vpand ymm3, ymm3, ymm10
+ vpand ymm4, ymm4, ymm10
+ vpand ymm5, ymm5, ymm10
+ vpand ymm6, ymm6, ymm10
+ vpand ymm7, ymm7, ymm10
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsubd ymm4, ymm8, ymm4
+ vpsubd ymm5, ymm8, ymm5
+ vpsubd ymm6, ymm8, ymm6
+ vpsubd ymm7, ymm8, ymm7
+ vmovdqu YMMWORD PTR [rdx+512], ymm0 ; output dwords 128-191
+ vmovdqu YMMWORD PTR [rdx+544], ymm1
+ vmovdqu YMMWORD PTR [rdx+576], ymm2
+ vmovdqu YMMWORD PTR [rdx+608], ymm3
+ vmovdqu YMMWORD PTR [rdx+640], ymm4
+ vmovdqu YMMWORD PTR [rdx+672], ymm5
+ vmovdqu YMMWORD PTR [rdx+704], ymm6
+ vmovdqu YMMWORD PTR [rdx+736], ymm7
+ vpbroadcastd ymm0, DWORD PTR [rcx+96] ; round 4: input dwords 24-31
+ vpbroadcastd ymm1, DWORD PTR [rcx+100]
+ vpbroadcastd ymm2, DWORD PTR [rcx+104]
+ vpbroadcastd ymm3, DWORD PTR [rcx+108]
+ vpbroadcastd ymm4, DWORD PTR [rcx+112]
+ vpbroadcastd ymm5, DWORD PTR [rcx+116]
+ vpbroadcastd ymm6, DWORD PTR [rcx+120]
+ vpbroadcastd ymm7, DWORD PTR [rcx+124]
+ vpsrlvd ymm0, ymm0, ymm9
+ vpsrlvd ymm1, ymm1, ymm9
+ vpsrlvd ymm2, ymm2, ymm9
+ vpsrlvd ymm3, ymm3, ymm9
+ vpsrlvd ymm4, ymm4, ymm9
+ vpsrlvd ymm5, ymm5, ymm9
+ vpsrlvd ymm6, ymm6, ymm9
+ vpsrlvd ymm7, ymm7, ymm9
+ vpand ymm0, ymm0, ymm10
+ vpand ymm1, ymm1, ymm10
+ vpand ymm2, ymm2, ymm10
+ vpand ymm3, ymm3, ymm10
+ vpand ymm4, ymm4, ymm10
+ vpand ymm5, ymm5, ymm10
+ vpand ymm6, ymm6, ymm10
+ vpand ymm7, ymm7, ymm10
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsubd ymm4, ymm8, ymm4
+ vpsubd ymm5, ymm8, ymm5
+ vpsubd ymm6, ymm8, ymm6
+ vpsubd ymm7, ymm8, ymm7
+ vmovdqu YMMWORD PTR [rdx+768], ymm0 ; output dwords 192-255
+ vmovdqu YMMWORD PTR [rdx+800], ymm1
+ vmovdqu YMMWORD PTR [rdx+832], ymm2
+ vmovdqu YMMWORD PTR [rdx+864], ymm3
+ vmovdqu YMMWORD PTR [rdx+896], ymm4
+ vmovdqu YMMWORD PTR [rdx+928], ymm5
+ vmovdqu YMMWORD PTR [rdx+960], ymm6
+ vmovdqu YMMWORD PTR [rdx+992], ymm7
+ vzeroupper ; clear upper ymm halves before returning to the caller
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore xmm6-xmm10 from the save area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ add rsp, 80 ; release the save area
+ ret
+wc_mldsa_decode_eta_4_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_w1_88_avx2_shuff_0_even BYTE 00h, 09h, 0ah, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh ; vpshufb control: lane bytes 0,9,10 -> output bytes 0-2 (low lane) and 3-5 (high lane); 0ffh entries produce zero
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 00h, 09h, 0ah, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_w1_88_avx2_shuff_0_even QWORD L_mldsa_encode_w1_88_avx2_shuff_0_even ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_w1_88_avx2_shuff_0_odd BYTE 04h, 05h, 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh ; same output positions as shuff_0_even, sourcing lane bytes 4,5,14
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 04h, 05h, 0eh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_w1_88_avx2_shuff_0_odd QWORD L_mldsa_encode_w1_88_avx2_shuff_0_odd
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_w1_88_avx2_shuff_1_even BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 00h, 09h ; lane bytes 0,9,10 -> output bytes 6-8 (low lane) and 9-11 (high lane)
+ BYTE 0ah, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 00h, 09h, 0ah, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_w1_88_avx2_shuff_1_even QWORD L_mldsa_encode_w1_88_avx2_shuff_1_even
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_w1_88_avx2_shuff_1_odd BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 04h, 05h ; same output positions as shuff_1_even, sourcing lane bytes 4,5,14
+ BYTE 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 04h, 05h, 0eh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_w1_88_avx2_shuff_1_odd QWORD L_mldsa_encode_w1_88_avx2_shuff_1_odd
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_w1_88_avx2_vs DWORD 00000000h, 00000006h, 0000000ch, 00000012h ; vpsllvd counts 0/6/12/18 per lane: positions four 6-bit values inside a 24-bit group
+ DWORD 00000000h, 00000006h, 0000000ch, 00000012h
+ptr_L_mldsa_encode_w1_88_avx2_vs QWORD L_mldsa_encode_w1_88_avx2_vs
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_encode_w1_88_avx2 PROC ; packs 256 dwords at [rcx] into 192 bytes at [rdx], 6 bits per dword; assumes every input dword is below 64 - TODO confirm against caller
+ sub rsp, 48 ; 3 x 16-byte slots for xmm6-xmm8
+ vmovdqu OWORD PTR [rsp], xmm6 ; save xmm6-xmm8 (reloaded before ret); presumably because they are callee-saved in the Microsoft x64 ABI
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vpxor ymm4, ymm4, ymm4 ; value is overwritten by the load on the next line
+ vmovdqu ymm4, YMMWORD PTR L_mldsa_encode_w1_88_avx2_shuff_0_even ; ymm4/ymm5 = even/odd shuffle for the first 8 dwords of a group
+ vmovdqu ymm5, YMMWORD PTR L_mldsa_encode_w1_88_avx2_shuff_0_odd
+ vmovdqu ymm6, YMMWORD PTR L_mldsa_encode_w1_88_avx2_shuff_1_even ; ymm6/ymm7 = even/odd shuffle for the second 8 dwords of a group
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_encode_w1_88_avx2_shuff_1_odd
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_w1_88_avx2_vs ; ymm8 = left-shift counts 0/6/12/18 per lane
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; group 1 of 16: sixteen input dwords -> 12 output bytes
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vpsllvd ymm0, ymm0, ymm8 ; move each value to its bit offset within a 24-bit group
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5 ; gather bytes of dwords 1 and 3 of each lane
+ vpshufb ymm0, ymm0, ymm4 ; gather bytes of dwords 0 and 2 of each lane
+ vpshufb ymm3, ymm1, ymm7 ; same for the second register, placed 6 bytes further along
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2 ; merge the four partial results
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1 ; xmm2 = high lane, upper half of ymm2 zeroed
+ vpor ymm0, ymm0, ymm2 ; fold high lane into low lane: xmm0 bytes 0-11 hold the packed group
+ vmovq QWORD PTR [rdx], xmm0 ; store packed bytes 0-7
+ vpextrd DWORD PTR [rdx+8], xmm0, 2 ; store packed bytes 8-11
+ add rdx, 12 ; advance the output pointer past this group
+ vmovdqu ymm0, YMMWORD PTR [rcx+64] ; group 2 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+96]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+128] ; group 3 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+192] ; group 4 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+224]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+256] ; group 5 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+320] ; group 6 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+352]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+384] ; group 7 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+448] ; group 8 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+480]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+512] ; group 9 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+576] ; group 10 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+608]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+640] ; group 11 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+672]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+704] ; group 12 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+736]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+768] ; group 13 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+832] ; group 14 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+864]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+896] ; group 15 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+928]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12
+ vmovdqu ymm0, YMMWORD PTR [rcx+960] ; group 16 of 16
+ vmovdqu ymm1, YMMWORD PTR [rcx+992]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpshufb ymm2, ymm0, ymm5
+ vpshufb ymm0, ymm0, ymm4
+ vpshufb ymm3, ymm1, ymm7
+ vpshufb ymm1, ymm1, ymm6
+ vpor ymm0, ymm0, ymm2
+ vpor ymm0, ymm0, ymm3
+ vpor ymm0, ymm0, ymm1
+ vextracti128 xmm2, ymm0, 1
+ vpor ymm0, ymm0, ymm2
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrd DWORD PTR [rdx+8], xmm0, 2
+ add rdx, 12 ; rdx is not read again after this point
+ vzeroupper ; clear upper ymm halves before returning to the caller
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore xmm6-xmm8 from the save area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ add rsp, 48 ; release the save area
+ ret
+wc_mldsa_encode_w1_88_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_w1_32_avx2_vs_4 DWORD 00000000h, 00000004h, 00000000h, 00000004h ; vpsllvd counts: even dwords stay in place, odd dwords are shifted left by 4
+ DWORD 00000000h, 00000004h, 00000000h, 00000004h
+ptr_L_mldsa_encode_w1_32_avx2_vs_4 QWORD L_mldsa_encode_w1_32_avx2_vs_4 ; holds the address of the table above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+ ; wc_mldsa_encode_w1_32_avx2: pack 256 dwords into 128 bytes, two per byte.
+ ;   rcx - source; 1024 bytes (256 dwords) are read.
+ ;   rdx - destination; 128 bytes are written.
+ ; For every adjacent dword pair (a, b) the output byte is a + (b << 4).
+ ; vpackusdw and vpackuswb saturate, so this is a plain nibble pack only
+ ; when every input dword is in 0..15 - assumed from the 4-bit shift,
+ ; TODO confirm against the caller.
+ ; xmm6-xmm8 are callee-saved in the Windows x64 calling convention and are
+ ; overwritten below, so they are spilled to a 48-byte stack area on entry
+ ; and reloaded before returning.
+wc_mldsa_encode_w1_32_avx2 PROC
+ sub rsp, 48
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ ; ymm8 = shift counts {0, 4, 0, 4, ...}; stays live for all four passes.
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_w1_32_avx2_vs_4
+ ; Pass 1/4: source dwords 0..63 -> destination bytes 0..31.
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ ; Move every odd-indexed dword up by 4 bits.
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpsllvd ymm2, ymm2, ymm8
+ vpsllvd ymm3, ymm3, ymm8
+ vpsllvd ymm4, ymm4, ymm8
+ vpsllvd ymm5, ymm5, ymm8
+ vpsllvd ymm6, ymm6, ymm8
+ vpsllvd ymm7, ymm7, ymm8
+ ; Narrow 64 dwords to 64 words in ymm0-ymm3. vpackusdw works per 128-bit
+ ; lane, so vpermq 216 (qword order 0,2,1,3) restores source order.
+ vpackusdw ymm0, ymm0, ymm1
+ vpackusdw ymm1, ymm2, ymm3
+ vpackusdw ymm2, ymm4, ymm5
+ vpackusdw ymm3, ymm6, ymm7
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpermq ymm2, ymm2, 216
+ vpermq ymm3, ymm3, 216
+ ; Add each adjacent word pair: a + (b << 4). 64 words -> 32 words.
+ vphaddw ymm0, ymm0, ymm1
+ vphaddw ymm1, ymm2, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ ; Narrow the 32 words to 32 bytes, fix the lane order and store.
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 216
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ ; Pass 2/4: source dwords 64..127 -> destination bytes 32..63.
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpsllvd ymm2, ymm2, ymm8
+ vpsllvd ymm3, ymm3, ymm8
+ vpsllvd ymm4, ymm4, ymm8
+ vpsllvd ymm5, ymm5, ymm8
+ vpsllvd ymm6, ymm6, ymm8
+ vpsllvd ymm7, ymm7, ymm8
+ vpackusdw ymm0, ymm0, ymm1
+ vpackusdw ymm1, ymm2, ymm3
+ vpackusdw ymm2, ymm4, ymm5
+ vpackusdw ymm3, ymm6, ymm7
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpermq ymm2, ymm2, 216
+ vpermq ymm3, ymm3, 216
+ vphaddw ymm0, ymm0, ymm1
+ vphaddw ymm1, ymm2, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 216
+ vmovdqu YMMWORD PTR [rdx+32], ymm0
+ ; Pass 3/4: source dwords 128..191 -> destination bytes 64..95.
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vmovdqu ymm4, YMMWORD PTR [rcx+640]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+704]
+ vmovdqu ymm7, YMMWORD PTR [rcx+736]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpsllvd ymm2, ymm2, ymm8
+ vpsllvd ymm3, ymm3, ymm8
+ vpsllvd ymm4, ymm4, ymm8
+ vpsllvd ymm5, ymm5, ymm8
+ vpsllvd ymm6, ymm6, ymm8
+ vpsllvd ymm7, ymm7, ymm8
+ vpackusdw ymm0, ymm0, ymm1
+ vpackusdw ymm1, ymm2, ymm3
+ vpackusdw ymm2, ymm4, ymm5
+ vpackusdw ymm3, ymm6, ymm7
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpermq ymm2, ymm2, 216
+ vpermq ymm3, ymm3, 216
+ vphaddw ymm0, ymm0, ymm1
+ vphaddw ymm1, ymm2, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 216
+ vmovdqu YMMWORD PTR [rdx+64], ymm0
+ ; Pass 4/4: source dwords 192..255 -> destination bytes 96..127.
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vmovdqu ymm4, YMMWORD PTR [rcx+896]
+ vmovdqu ymm5, YMMWORD PTR [rcx+928]
+ vmovdqu ymm6, YMMWORD PTR [rcx+960]
+ vmovdqu ymm7, YMMWORD PTR [rcx+992]
+ vpsllvd ymm0, ymm0, ymm8
+ vpsllvd ymm1, ymm1, ymm8
+ vpsllvd ymm2, ymm2, ymm8
+ vpsllvd ymm3, ymm3, ymm8
+ vpsllvd ymm4, ymm4, ymm8
+ vpsllvd ymm5, ymm5, ymm8
+ vpsllvd ymm6, ymm6, ymm8
+ vpsllvd ymm7, ymm7, ymm8
+ vpackusdw ymm0, ymm0, ymm1
+ vpackusdw ymm1, ymm2, ymm3
+ vpackusdw ymm2, ymm4, ymm5
+ vpackusdw ymm3, ymm6, ymm7
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpermq ymm2, ymm2, 216
+ vpermq ymm3, ymm3, 216
+ vphaddw ymm0, ymm0, ymm1
+ vphaddw ymm1, ymm2, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 216
+ vmovdqu YMMWORD PTR [rdx+96], ymm0
+ ; Clear the upper YMM halves before returning to the caller.
+ vzeroupper
+ ; Reload the callee-saved registers spilled on entry.
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ add rsp, 48
+ ret
+wc_mldsa_encode_w1_32_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ ; Constant tables for wc_mldsa_vec_encode_t0_t1_avx2. Each table is 32
+ ; bytes and is read with vmovdqu, so the 16-byte alignment is sufficient.
+ ; Each ptr_* item stores the address of the table it follows.
+ ;
+ ; Eight copies of 0fffh; loaded into ymm6 and added to every source dword
+ ; before the logical right shift by 13.
+ALIGN 16
+L_mldsa_encode_t0_t1_avx2_d_max_half_m1 DWORD 00000fffh, 00000fffh, 00000fffh, 00000fffh
+ DWORD 00000fffh, 00000fffh, 00000fffh, 00000fffh
+ptr_L_mldsa_encode_t0_t1_avx2_d_max_half_m1 QWORD L_mldsa_encode_t0_t1_avx2_d_max_half_m1
+_DATA ENDS
+_DATA SEGMENT
+ ; Eight copies of 1000h; loaded into ymm7 and used as the minuend of the
+ ; final vpsubd that produces the value packed into 13-bit fields.
+ALIGN 16
+L_mldsa_encode_t0_t1_avx2_d_max_half DWORD 00001000h, 00001000h, 00001000h, 00001000h
+ DWORD 00001000h, 00001000h, 00001000h, 00001000h
+ptr_L_mldsa_encode_t0_t1_avx2_d_max_half QWORD L_mldsa_encode_t0_t1_avx2_d_max_half
+_DATA ENDS
+_DATA SEGMENT
+ ; vpsllvd counts (ymm8) for the 13-bit pack: value i of a group of eight
+ ; starts at bit 13*i of the 13-byte output, and the count is that bit
+ ; offset relative to the first byte the shuffle tables below take from
+ ; the value's dword.
+ALIGN 16
+L_mldsa_encode_t0_t1_avx2_vs_13 DWORD 00000000h, 0000000dh, 00000002h, 0000000fh
+ DWORD 00000004h, 00000011h, 00000006h, 00000013h
+ptr_L_mldsa_encode_t0_t1_avx2_vs_13 QWORD L_mldsa_encode_t0_t1_avx2_vs_13
+_DATA ENDS
+_DATA SEGMENT
+ ; vpshufb selectors (ymm9) that move the bytes of the even-indexed dwords
+ ; (0, 2 in the low lane; 4, 6 in the high lane) to their output byte
+ ; positions. A selector of 0ffh produces a zero byte.
+ALIGN 16
+L_mldsa_encode_t0_t1_avx2_shuff_13_even BYTE 00h, 01h, 0ffh, 08h, 09h, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 00h, 01h
+ BYTE 02h, 08h, 09h, 0ah, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_t0_t1_avx2_shuff_13_even QWORD L_mldsa_encode_t0_t1_avx2_shuff_13_even
+_DATA ENDS
+_DATA SEGMENT
+ ; vpshufb selectors (ymm10) for the odd-indexed dwords (1, 3 in the low
+ ; lane; 5, 7 in the high lane). The result is OR-ed with the even shuffle.
+ALIGN 16
+L_mldsa_encode_t0_t1_avx2_shuff_13_odd BYTE 0ffh, 05h, 06h, 07h, 0dh, 0eh, 0fh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 06h, 07h, 0ffh, 0eh, 0fh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_t0_t1_avx2_shuff_13_odd QWORD L_mldsa_encode_t0_t1_avx2_shuff_13_odd
+_DATA ENDS
+_DATA SEGMENT
+ ; vpsllvd counts (ymm11) for the 10-bit pack: value i of a group of eight
+ ; starts at bit 10*i of the 10-byte output. The pattern repeats per lane
+ ; because four 10-bit values fill exactly five bytes.
+ALIGN 16
+L_mldsa_encode_t0_t1_avx2_vs_10 DWORD 00000000h, 0000000ah, 00000004h, 0000000eh
+ DWORD 00000000h, 0000000ah, 00000004h, 0000000eh
+ptr_L_mldsa_encode_t0_t1_avx2_vs_10 QWORD L_mldsa_encode_t0_t1_avx2_vs_10
+_DATA ENDS
+_DATA SEGMENT
+ ; vpshufb selectors (ymm12) for the even-indexed dwords of the 10-bit
+ ; pack; the high lane targets output bytes 5..8.
+ALIGN 16
+L_mldsa_encode_t0_t1_avx2_shuff_10_even BYTE 00h, 01h, 08h, 09h, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 00h, 01h, 08h
+ BYTE 09h, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_t0_t1_avx2_shuff_10_even QWORD L_mldsa_encode_t0_t1_avx2_shuff_10_even
+_DATA ENDS
+_DATA SEGMENT
+ ; vpshufb selectors (ymm13) for the odd-indexed dwords of the 10-bit
+ ; pack; the high lane targets output bytes 6..9.
+ALIGN 16
+L_mldsa_encode_t0_t1_avx2_shuff_10_odd BYTE 0ffh, 05h, 06h, 0dh, 0eh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 05h, 06h
+ BYTE 0dh, 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_t0_t1_avx2_shuff_10_odd QWORD L_mldsa_encode_t0_t1_avx2_shuff_10_odd
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+ ; wc_mldsa_vec_encode_t0_t1_avx2
+ ;   rcx - source dwords; 1024 bytes (256 dwords) are consumed per loop pass.
+ ;   dl  - number of loop passes (low byte of rdx). It is decremented after
+ ;         the body has run, so a value of 0 results in 256 passes.
+ ;   r8  - destination of the 13-bit fields; advances 13 bytes per 8 dwords
+ ;         (416 bytes per pass).
+ ;   r9  - destination of the 10-bit fields; advances 10 bytes per 8 dwords
+ ;         (320 bytes per pass).
+ ; For every source dword x:
+ ;   hi = (x + 0fffh) >> 13            (logical shift)
+ ;   lo = 1000h - (x - (hi << 13))
+ ; lo is packed into consecutive 13-bit fields at r8 and hi into consecutive
+ ; 10-bit fields at r9. Neither value is masked before packing; this
+ ; presumably relies on lo < 2^13 and hi < 2^10 - TODO confirm.
+ ; NOTE(review): every store writes a full 16 bytes although the pointer
+ ; only advances by 13 (r8) or 10 (r9). The surplus bytes are zero and are
+ ; overwritten by the next store, but the last store of the last pass
+ ; leaves 3 (r8) / 6 (r9) zero bytes beyond the packed data - confirm that
+ ; the callers' buffers allow for this.
+ ; xmm6-xmm13 are callee-saved in the Windows x64 calling convention; they
+ ; are spilled to a 128-byte stack area on entry and reloaded on exit.
+wc_mldsa_vec_encode_t0_t1_avx2 PROC
+ sub rsp, 128
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ ; Loop-invariant constants: rounding bias, minuend, shift counts and
+ ; shuffle selectors for the 13-bit and the 10-bit pack.
+ vmovdqu ymm6, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_d_max_half_m1
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_d_max_half
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_vs_13
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_shuff_13_even
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_shuff_13_odd
+ vmovdqu ymm11, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_vs_10
+ vmovdqu ymm12, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_shuff_10_even
+ vmovdqu ymm13, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_shuff_10_odd
+L_mldsa_encode_t0_t1_avx2_loop:
+ ; The body below is expanded 16 times at assembly time; each expansion
+ ; handles 16 source dwords and the assembly-time offset steps by 64 bytes.
+mldsa_encode_t0_t1_src_off = 0
+ REPT 16
+ vmovdqu ymm0, YMMWORD PTR [rcx+mldsa_encode_t0_t1_src_off]
+ vmovdqu ymm1, YMMWORD PTR [rcx+mldsa_encode_t0_t1_src_off+32]
+ ; hi = (x + 0fffh) >> 13, kept in ymm4/ymm5.
+ vpaddd ymm4, ymm0, ymm6
+ vpaddd ymm5, ymm1, ymm6
+ vpsrld ymm4, ymm4, 13
+ vpsrld ymm5, ymm5, 13
+ ; lo = 1000h - (x - (hi << 13)), kept in ymm2/ymm3.
+ vpslld ymm2, ymm4, 13
+ vpslld ymm3, ymm5, 13
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm7, ymm2
+ vpsubd ymm3, ymm7, ymm3
+ ; 13-bit pack: shift each lo to its bit offset, gather the odd and even
+ ; dwords with separate shuffles and OR them together.
+ vpsllvd ymm2, ymm2, ymm8
+ vpsllvd ymm3, ymm3, ymm8
+ vpshufb ymm0, ymm2, ymm10
+ vpshufb ymm1, ymm3, ymm10
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ ; Fold the high 128-bit lane into the low lane so xmm2/xmm3 hold the
+ ; 13 packed bytes.
+ vextracti128 xmm0, ymm2, 1
+ vextracti128 xmm1, ymm3, 1
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ ; The stores must stay in this order: the second one overwrites the three
+ ; surplus bytes written by the first.
+ vmovdqu OWORD PTR [r8], xmm2
+ add r8, 13
+ vmovdqu OWORD PTR [r8], xmm3
+ add r8, 13
+ ; 10-bit pack of hi, same scheme with the 10-bit tables.
+ vpsllvd ymm4, ymm4, ymm11
+ vpsllvd ymm5, ymm5, ymm11
+ vpshufb ymm0, ymm4, ymm13
+ vpshufb ymm1, ymm5, ymm13
+ vpshufb ymm4, ymm4, ymm12
+ vpshufb ymm5, ymm5, ymm12
+ vpor ymm4, ymm4, ymm0
+ vpor ymm5, ymm5, ymm1
+ vextracti128 xmm0, ymm4, 1
+ vextracti128 xmm1, ymm5, 1
+ vpor ymm4, ymm4, ymm0
+ vpor ymm5, ymm5, ymm1
+ vmovdqu OWORD PTR [r9], xmm4
+ add r9, 10
+ vmovdqu OWORD PTR [r9], xmm5
+ add r9, 10
+mldsa_encode_t0_t1_src_off = mldsa_encode_t0_t1_src_off + 64
+ ENDM
+ ; Step to the next 256 source dwords and repeat until dl reaches zero.
+ add rcx, 1024
+ dec dl
+ jnz L_mldsa_encode_t0_t1_avx2_loop
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ add rsp, 128
+ ret
+wc_mldsa_vec_encode_t0_t1_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ ; Constant tables for wc_mldsa_decode_t0_avx2. Each table is 32 bytes and
+ ; is read with vmovdqu, so the 16-byte alignment is sufficient. Each ptr_*
+ ; item stores the address of the table it follows.
+ ;
+ ; shuff_0 .. shuff_7 (loaded into ymm5 .. ymm12) are vpshufb selectors.
+ ; Each one places, in every dword of the result, the source bytes that
+ ; cover one 13-bit field of a 13-byte group; 0ffh produces a zero byte.
+ ; The eight tables differ only in where the group starts inside the
+ ; 16-byte window that the preceding vpermq puts in each lane.
+ALIGN 16
+L_mldsa_decode_t0_avx2_shuff_0 BYTE 00h, 01h, 0ffh, 0ffh, 0ffh, 01h, 02h, 03h
+ BYTE 03h, 04h, 05h, 0ffh, 04h, 05h, 06h, 07h
+ BYTE 06h, 07h, 08h, 0ffh, 0ffh, 08h, 09h, 0ffh
+ BYTE 09h, 0ah, 0bh, 0ffh, 0ffh, 0ffh, 0bh, 0ch
+ptr_L_mldsa_decode_t0_avx2_shuff_0 QWORD L_mldsa_decode_t0_avx2_shuff_0
+_DATA ENDS
+_DATA SEGMENT
+ ; Group starts at byte 5 of the low-lane window, byte 3 for the high lane.
+ALIGN 16
+L_mldsa_decode_t0_avx2_shuff_1 BYTE 05h, 06h, 0ffh, 0ffh, 0ffh, 06h, 07h, 08h
+ BYTE 08h, 09h, 0ffh, 0ffh, 09h, 0ah, 0bh, 0ffh
+ BYTE 03h, 04h, 05h, 0ffh, 0ffh, 05h, 06h, 0ffh
+ BYTE 06h, 07h, 08h, 0ffh, 0ffh, 0ffh, 08h, 09h
+ptr_L_mldsa_decode_t0_avx2_shuff_1 QWORD L_mldsa_decode_t0_avx2_shuff_1
+_DATA ENDS
+_DATA SEGMENT
+ ; Group starts at byte 2 of the low-lane window, byte 0 for the high lane.
+ALIGN 16
+L_mldsa_decode_t0_avx2_shuff_2 BYTE 02h, 03h, 0ffh, 0ffh, 0ffh, 03h, 04h, 05h
+ BYTE 05h, 06h, 0ffh, 0ffh, 06h, 07h, 08h, 0ffh
+ BYTE 00h, 01h, 02h, 0ffh, 0ffh, 02h, 03h, 0ffh
+ BYTE 03h, 04h, 05h, 0ffh, 0ffh, 0ffh, 05h, 06h
+ptr_L_mldsa_decode_t0_avx2_shuff_2 QWORD L_mldsa_decode_t0_avx2_shuff_2
+_DATA ENDS
+_DATA SEGMENT
+ ; Group starts at byte 7 of the low-lane window, byte 5 for the high lane.
+ALIGN 16
+L_mldsa_decode_t0_avx2_shuff_3 BYTE 07h, 08h, 0ffh, 0ffh, 0ffh, 08h, 09h, 0ah
+ BYTE 0ah, 0bh, 0ffh, 0ffh, 0bh, 0ch, 0dh, 0ffh
+ BYTE 05h, 06h, 07h, 0ffh, 0ffh, 07h, 08h, 0ffh
+ BYTE 08h, 09h, 0ah, 0ffh, 0ffh, 0ffh, 0ah, 0bh
+ptr_L_mldsa_decode_t0_avx2_shuff_3 QWORD L_mldsa_decode_t0_avx2_shuff_3
+_DATA ENDS
+_DATA SEGMENT
+ ; Group starts at byte 4 of the low-lane window, byte 2 for the high lane.
+ALIGN 16
+L_mldsa_decode_t0_avx2_shuff_4 BYTE 04h, 05h, 0ffh, 0ffh, 0ffh, 05h, 06h, 07h
+ BYTE 07h, 08h, 0ffh, 0ffh, 08h, 09h, 0ah, 0ffh
+ BYTE 02h, 03h, 04h, 0ffh, 0ffh, 04h, 05h, 0ffh
+ BYTE 05h, 06h, 07h, 0ffh, 0ffh, 0ffh, 07h, 08h
+ptr_L_mldsa_decode_t0_avx2_shuff_4 QWORD L_mldsa_decode_t0_avx2_shuff_4
+_DATA ENDS
+_DATA SEGMENT
+ ; Group starts at byte 1 of the low-lane window, byte 7 for the high lane.
+ALIGN 16
+L_mldsa_decode_t0_avx2_shuff_5 BYTE 01h, 02h, 0ffh, 0ffh, 0ffh, 02h, 03h, 04h
+ BYTE 04h, 05h, 0ffh, 0ffh, 05h, 06h, 07h, 0ffh
+ BYTE 07h, 08h, 09h, 0ffh, 0ffh, 09h, 0ah, 0ffh
+ BYTE 0ah, 0bh, 0ch, 0ffh, 0ffh, 0ffh, 0ch, 0dh
+ptr_L_mldsa_decode_t0_avx2_shuff_5 QWORD L_mldsa_decode_t0_avx2_shuff_5
+_DATA ENDS
+_DATA SEGMENT
+ ; Group starts at byte 6 of the low-lane window, byte 4 for the high lane.
+ ; NOTE(review): the last selector of the third row is 08h where every
+ ; sibling table has 0ffh in that position. The byte it selects ends up
+ ; above bit 12 after the shift by 9 from vs_8 and is cleared by the 1fffh
+ ; mask, so the decoded value looks unaffected - confirm against the
+ ; generator.
+ALIGN 16
+L_mldsa_decode_t0_avx2_shuff_6 BYTE 06h, 07h, 0ffh, 0ffh, 0ffh, 07h, 08h, 09h
+ BYTE 09h, 0ah, 0ffh, 0ffh, 0ah, 0bh, 0ch, 0ffh
+ BYTE 04h, 05h, 06h, 0ffh, 0ffh, 06h, 07h, 08h
+ BYTE 07h, 08h, 09h, 0ffh, 0ffh, 0ffh, 09h, 0ah
+ptr_L_mldsa_decode_t0_avx2_shuff_6 QWORD L_mldsa_decode_t0_avx2_shuff_6
+_DATA ENDS
+_DATA SEGMENT
+ ; Group starts at byte 3 of the low-lane window, byte 1 for the high lane.
+ALIGN 16
+L_mldsa_decode_t0_avx2_shuff_7 BYTE 03h, 04h, 0ffh, 0ffh, 0ffh, 04h, 05h, 06h
+ BYTE 06h, 07h, 0ffh, 0ffh, 07h, 08h, 09h, 0ffh
+ BYTE 01h, 02h, 03h, 0ffh, 0ffh, 03h, 04h, 0ffh
+ BYTE 04h, 05h, 06h, 0ffh, 0ffh, 0ffh, 06h, 07h
+ptr_L_mldsa_decode_t0_avx2_shuff_7 QWORD L_mldsa_decode_t0_avx2_shuff_7
+_DATA ENDS
+_DATA SEGMENT
+ ; vpsrlvd counts (ymm13): after the shuffle, field i of a group sits at
+ ; this bit offset inside dword i and is shifted down to bit 0.
+ALIGN 16
+L_mldsa_decode_t0_avx2_vs_8 DWORD 00000000h, 0000000dh, 00000002h, 00000007h
+ DWORD 00000004h, 00000009h, 00000006h, 00000013h
+ptr_L_mldsa_decode_t0_avx2_vs_8 QWORD L_mldsa_decode_t0_avx2_vs_8
+_DATA ENDS
+_DATA SEGMENT
+ ; Eight copies of 1fffh (ymm14): keeps the low 13 bits of every dword.
+ALIGN 16
+L_mldsa_decode_t0_avx2_mask DWORD 00001fffh, 00001fffh, 00001fffh, 00001fffh
+ DWORD 00001fffh, 00001fffh, 00001fffh, 00001fffh
+ptr_L_mldsa_decode_t0_avx2_mask QWORD L_mldsa_decode_t0_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ ; Eight copies of 1000h (ymm15): each decoded result is 1000h minus the
+ ; masked 13-bit field.
+ALIGN 16
+L_mldsa_decode_t0_avx2_d_max_half DWORD 00001000h, 00001000h, 00001000h, 00001000h
+ DWORD 00001000h, 00001000h, 00001000h, 00001000h
+ptr_L_mldsa_decode_t0_avx2_d_max_half QWORD L_mldsa_decode_t0_avx2_d_max_half
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_decode_t0_avx2 PROC
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vpxor ymm5, ymm5, ymm5
+ vmovdqu ymm5, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_0
+ vmovdqu ymm6, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_1
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_2
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_3
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_4
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_5
+ vmovdqu ymm11, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_6
+ vmovdqu ymm12, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_7
+ vmovdqu ymm13, YMMWORD PTR L_mldsa_decode_t0_avx2_vs_8
+ vmovdqu ymm14, YMMWORD PTR L_mldsa_decode_t0_avx2_mask
+ vmovdqu ymm15, YMMWORD PTR L_mldsa_decode_t0_avx2_d_max_half
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ ; 1/32
+ vpermq ymm4, ymm0, 68
+ vpshufb ymm4, ymm4, ymm5
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx], ymm4
+ ; 2/32
+ vpermq ymm4, ymm0, 233
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+32], ymm4
+ ; 3/32
+ vperm2i128 ymm0, ymm0, ymm1, 33
+ vpermq ymm4, ymm0, 233
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+64], ymm4
+ ; 4/32
+ vpermq ymm4, ymm1, 148
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+96], ymm4
+ ; 5/32
+ vperm2i128 ymm1, ymm1, ymm2, 33
+ vpermq ymm4, ymm1, 148
+ vpshufb ymm4, ymm4, ymm9
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+128], ymm4
+ ; 6/32
+ vpermq ymm4, ymm2, 68
+ vpshufb ymm4, ymm4, ymm10
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+160], ymm4
+ ; 7/32
+ vpermq ymm4, ymm2, 233
+ vpshufb ymm4, ymm4, ymm11
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+192], ymm4
+ ; 8/32
+ vperm2i128 ymm2, ymm2, ymm3, 33
+ vpermq ymm4, ymm2, 233
+ vpshufb ymm4, ymm4, ymm12
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+224], ymm4
+ ; 9/32
+ vpermq ymm4, ymm3, 153
+ vpshufb ymm4, ymm4, ymm5
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+256], ymm4
+ ; 10/32
+ vmovdqu ymm0, YMMWORD PTR [rcx+128]
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vperm2i128 ymm3, ymm3, ymm0, 33
+ vpermq ymm4, ymm3, 148
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+288], ymm4
+ ; 11/32
+ vpermq ymm4, ymm0, 148
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+320], ymm4
+ ; 12/32
+ vpermq ymm4, ymm0, 233
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+352], ymm4
+ ; 13/32
+ vperm2i128 ymm0, ymm0, ymm1, 33
+ vpermq ymm4, ymm0, 233
+ vpshufb ymm4, ymm4, ymm9
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+384], ymm4
+ ; 14/32
+ vpermq ymm4, ymm1, 153
+ vpshufb ymm4, ymm4, ymm10
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+416], ymm4
+ ; 15/32
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vperm2i128 ymm1, ymm1, ymm2, 33
+ vpermq ymm4, ymm1, 148
+ vpshufb ymm4, ymm4, ymm11
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+448], ymm4
+ ; 16/32
+ vpermq ymm4, ymm2, 148
+ vpshufb ymm4, ymm4, ymm12
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+480], ymm4
+ ; 17/32
+ vpermq ymm4, ymm2, 238
+ vpshufb ymm4, ymm4, ymm5
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+512], ymm4
+ ; 18/32
+ vperm2i128 ymm2, ymm2, ymm3, 33
+ vpermq ymm4, ymm2, 233
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+544], ymm4
+ ; 19/32
+ vpermq ymm4, ymm3, 233
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+576], ymm4
+ ; 20/32
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vperm2i128 ymm3, ymm3, ymm0, 33
+ vpermq ymm4, ymm3, 148
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+608], ymm4
+ ; 21/32
+ vpermq ymm4, ymm0, 148
+ vpshufb ymm4, ymm4, ymm9
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+640], ymm4
+ ; 22/32
+ vpermq ymm4, ymm0, 238
+ vpshufb ymm4, ymm4, ymm10
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+672], ymm4
+ ; 23/32
+ vperm2i128 ymm0, ymm0, ymm1, 33
+ vpermq ymm4, ymm0, 233
+ vpshufb ymm4, ymm4, ymm11
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+704], ymm4
+ ; 24/32
+ vpermq ymm4, ymm1, 233
+ vpshufb ymm4, ymm4, ymm12
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+736], ymm4
+ ; 25/32
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vperm2i128 ymm1, ymm1, ymm2, 33
+ vpermq ymm4, ymm1, 153
+ vpshufb ymm4, ymm4, ymm5
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+768], ymm4
+ ; 26/32
+ vpermq ymm4, ymm2, 148
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+800], ymm4
+ ; 27/32
+ vpermq ymm4, ymm2, 62
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+832], ymm4
+ ; 28/32
+ vperm2i128 ymm2, ymm2, ymm3, 33
+ vpermq ymm4, ymm2, 233
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+864], ymm4
+ ; 29/32
+ vpermq ymm4, ymm3, 233
+ vpshufb ymm4, ymm4, ymm9
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+896], ymm4
+ ; 30/32
+ vmovdqu ymm0, YMMWORD PTR [rcx+384]
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vperm2i128 ymm3, ymm3, ymm0, 33
+ vpermq ymm4, ymm3, 153
+ vpshufb ymm4, ymm4, ymm10
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+928], ymm4
+ ; 31/32
+ vpermq ymm4, ymm0, 148
+ vpshufb ymm4, ymm4, ymm11
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+960], ymm4
+ ; 32/32
+ vpermq ymm4, ymm0, 62
+ vpshufb ymm4, ymm4, ymm12
+ vpsrlvd ymm4, ymm4, ymm13
+ vpand ymm4, ymm4, ymm14
+ vpsubd ymm4, ymm15, ymm4
+ vmovdqu YMMWORD PTR [rdx+992], ymm4
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+wc_mldsa_decode_t0_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_t1_avx2_shuff_0 BYTE 00h, 01h, 0ffh, 0ffh, 0ffh, 01h, 02h, 0ffh ; vpshufb byte selectors; 0ffh produces a zero byte
+ BYTE 02h, 03h, 0ffh, 0ffh, 0ffh, 03h, 04h, 0ffh
+ BYTE 05h, 06h, 0ffh, 0ffh, 0ffh, 06h, 07h, 0ffh ; second 16 bytes index the upper 128-bit lane
+ BYTE 07h, 08h, 0ffh, 0ffh, 0ffh, 08h, 09h, 0ffh
+ptr_L_mldsa_decode_t1_avx2_shuff_0 QWORD L_mldsa_decode_t1_avx2_shuff_0
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_t1_avx2_shuff_1 BYTE 02h, 03h, 0ffh, 0ffh, 0ffh, 03h, 04h, 0ffh ; same layout, field group starting 2 bytes into the lane
+ BYTE 04h, 05h, 0ffh, 0ffh, 0ffh, 05h, 06h, 0ffh
+ BYTE 07h, 08h, 0ffh, 0ffh, 0ffh, 08h, 09h, 0ffh
+ BYTE 09h, 0ah, 0ffh, 0ffh, 0ffh, 0ah, 0bh, 0ffh ; third selector is 0ffh like every other row (was a stray 08h that the 10-bit mask discarded)
+ptr_L_mldsa_decode_t1_avx2_shuff_1 QWORD L_mldsa_decode_t1_avx2_shuff_1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_t1_avx2_shuff_2 BYTE 04h, 05h, 0ffh, 0ffh, 0ffh, 05h, 06h, 0ffh ; field group starting 4 bytes into the low lane
+ BYTE 06h, 07h, 0ffh, 0ffh, 0ffh, 07h, 08h, 0ffh
+ BYTE 01h, 02h, 0ffh, 0ffh, 0ffh, 02h, 03h, 0ffh ; upper lane starts 1 byte in
+ BYTE 03h, 04h, 0ffh, 0ffh, 0ffh, 04h, 05h, 0ffh
+ptr_L_mldsa_decode_t1_avx2_shuff_2 QWORD L_mldsa_decode_t1_avx2_shuff_2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_t1_avx2_shuff_3 BYTE 06h, 07h, 0ffh, 0ffh, 0ffh, 07h, 08h, 0ffh ; field group starting 6 bytes into the low lane
+ BYTE 08h, 09h, 0ffh, 0ffh, 0ffh, 09h, 0ah, 0ffh
+ BYTE 03h, 04h, 0ffh, 0ffh, 0ffh, 04h, 05h, 0ffh ; upper lane starts 3 bytes in
+ BYTE 05h, 06h, 0ffh, 0ffh, 0ffh, 06h, 07h, 0ffh
+ptr_L_mldsa_decode_t1_avx2_shuff_3 QWORD L_mldsa_decode_t1_avx2_shuff_3
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_t1_avx2_vs_8 DWORD 00000000h, 0000000ah, 00000004h, 0000000eh ; per-dword right-shift counts for vpsrlvd
+ DWORD 00000000h, 0000000ah, 00000004h, 0000000eh
+ptr_L_mldsa_decode_t1_avx2_vs_8 QWORD L_mldsa_decode_t1_avx2_vs_8
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_t1_avx2_mask DWORD 000003ffh, 000003ffh, 000003ffh, 000003ffh ; keeps the low 10 bits of each dword
+ DWORD 000003ffh, 000003ffh, 000003ffh, 000003ffh
+ptr_L_mldsa_decode_t1_avx2_mask QWORD L_mldsa_decode_t1_avx2_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_decode_t1_avx2 PROC ; unpacks 256 consecutive 10-bit fields from [rcx] (320 bytes read) into 256 dwords at [rdx], each stored as field << 13
+ sub rsp, 80 ; room for xmm6-xmm10 (5 x 16 bytes)
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm10 are overwritten below, so save them first
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ ; Constants: ymm5-ymm8 = shuffle tables 0-3, ymm9 = shift counts, ymm10 = 10-bit mask (no pre-zeroing needed, each load overwrites the register).
+ vmovdqu ymm5, YMMWORD PTR L_mldsa_decode_t1_avx2_shuff_0
+ vmovdqu ymm6, YMMWORD PTR L_mldsa_decode_t1_avx2_shuff_1
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_decode_t1_avx2_shuff_2
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_t1_avx2_shuff_3
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_t1_avx2_vs_8
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_t1_avx2_mask
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; input bytes 0-127 in ymm0-ymm3
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ ; 1/32
+ vpermq ymm4, ymm0, 68 ; place the 16-byte window holding this group's 10 input bytes into each lane
+ vpshufb ymm4, ymm4, ymm5 ; gather the two bytes of each field into its own dword
+ vpsrlvd ymm4, ymm4, ymm9 ; shift each field down to bit 0
+ vpand ymm4, ymm4, ymm10 ; drop bits belonging to neighbouring fields
+ vpslld ymm4, ymm4, 13 ; store value is field * 2^13
+ vmovdqu YMMWORD PTR [rdx], ymm4 ; 8 output dwords per group
+ ; 2/32
+ vpermq ymm4, ymm0, 153
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+32], ymm4
+ ; 3/32
+ vpermq ymm4, ymm0, 62
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+64], ymm4
+ ; 4/32
+ vperm2i128 ymm0, ymm0, ymm1, 33 ; ymm0 = high half of ymm0 : low half of ymm1 (window straddles two loads)
+ vpermq ymm4, ymm0, 233
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+96], ymm4
+ ; 5/32
+ vpermq ymm4, ymm1, 153
+ vpshufb ymm4, ymm4, ymm5
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+128], ymm4
+ ; 6/32
+ vpermq ymm4, ymm1, 238
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+160], ymm4
+ ; 7/32
+ vperm2i128 ymm1, ymm1, ymm2, 33
+ vpermq ymm4, ymm1, 233
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+192], ymm4
+ ; 8/32
+ vpermq ymm4, ymm2, 148
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+224], ymm4
+ ; 9/32
+ vpermq ymm4, ymm2, 238
+ vpshufb ymm4, ymm4, ymm5
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+256], ymm4
+ ; 10/32
+ vperm2i128 ymm2, ymm2, ymm3, 33
+ vpermq ymm4, ymm2, 153
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+288], ymm4
+ ; 11/32
+ vpermq ymm4, ymm3, 148
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+320], ymm4
+ ; 12/32
+ vpermq ymm4, ymm3, 233
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+352], ymm4
+ ; 13/32
+ vmovdqu ymm0, YMMWORD PTR [rcx+128] ; refill: input bytes 128-191
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vperm2i128 ymm3, ymm3, ymm0, 33
+ vpermq ymm4, ymm3, 153
+ vpshufb ymm4, ymm4, ymm5
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+384], ymm4
+ ; 14/32
+ vpermq ymm4, ymm0, 68
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+416], ymm4
+ ; 15/32
+ vpermq ymm4, ymm0, 233
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+448], ymm4
+ ; 16/32
+ vpermq ymm4, ymm0, 62
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+480], ymm4
+ ; 17/32
+ vpermq ymm4, ymm1, 68
+ vpshufb ymm4, ymm4, ymm5
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+512], ymm4
+ ; 18/32
+ vpermq ymm4, ymm1, 153
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+544], ymm4
+ ; 19/32
+ vpermq ymm4, ymm1, 62
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+576], ymm4
+ ; 20/32
+ vmovdqu ymm2, YMMWORD PTR [rcx+192] ; refill: input bytes 192-255
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vperm2i128 ymm1, ymm1, ymm2, 33
+ vpermq ymm4, ymm1, 233
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+608], ymm4
+ ; 21/32
+ vpermq ymm4, ymm2, 153
+ vpshufb ymm4, ymm4, ymm5
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+640], ymm4
+ ; 22/32
+ vpermq ymm4, ymm2, 238
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+672], ymm4
+ ; 23/32
+ vperm2i128 ymm2, ymm2, ymm3, 33
+ vpermq ymm4, ymm2, 233
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+704], ymm4
+ ; 24/32
+ vpermq ymm4, ymm3, 148
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+736], ymm4
+ ; 25/32
+ vpermq ymm4, ymm3, 238
+ vpshufb ymm4, ymm4, ymm5
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+768], ymm4
+ ; 26/32
+ vmovdqu ymm0, YMMWORD PTR [rcx+256] ; refill: input bytes 256-319 (last bytes read)
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vperm2i128 ymm3, ymm3, ymm0, 33
+ vpermq ymm4, ymm3, 153
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+800], ymm4
+ ; 27/32
+ vpermq ymm4, ymm0, 148
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+832], ymm4
+ ; 28/32
+ vpermq ymm4, ymm0, 233
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+864], ymm4
+ ; 29/32
+ vperm2i128 ymm0, ymm0, ymm1, 33
+ vpermq ymm4, ymm0, 153
+ vpshufb ymm4, ymm4, ymm5
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+896], ymm4
+ ; 30/32
+ vpermq ymm4, ymm1, 68
+ vpshufb ymm4, ymm4, ymm6
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+928], ymm4
+ ; 31/32
+ vpermq ymm4, ymm1, 233
+ vpshufb ymm4, ymm4, ymm7
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+960], ymm4
+ ; 32/32
+ vpermq ymm4, ymm1, 62
+ vpshufb ymm4, ymm4, ymm8
+ vpsrlvd ymm4, ymm4, ymm9
+ vpand ymm4, ymm4, ymm10
+ vpslld ymm4, ymm4, 13
+ vmovdqu YMMWORD PTR [rdx+992], ymm4
+ vzeroupper ; clear upper YMM halves before returning to the caller
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the registers saved in the prologue
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ add rsp, 80
+ ret
+wc_mldsa_decode_t1_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_17_avx2_shuff_0 BYTE 00h, 01h, 02h, 0ffh, 02h, 03h, 04h, 0ffh ; vpshufb byte selectors (3 bytes per dword); 0ffh produces a zero byte
+ BYTE 04h, 05h, 06h, 0ffh, 06h, 07h, 08h, 0ffh
+ BYTE 0ffh, 01h, 02h, 03h, 0ffh, 03h, 04h, 05h ; upper lane: bytes sit one position higher, matched by shift counts that are 8 larger
+ BYTE 0ffh, 05h, 06h, 07h, 0ffh, 07h, 08h, 09h
+ptr_L_mldsa_decode_gamma1_17_avx2_shuff_0 QWORD L_mldsa_decode_gamma1_17_avx2_shuff_0
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_17_avx2_shuff_1 BYTE 02h, 03h, 04h, 0ffh, 04h, 05h, 06h, 0ffh ; same layout as shuff_0 with every index advanced by 2
+ BYTE 06h, 07h, 08h, 0ffh, 08h, 09h, 0ah, 0ffh
+ BYTE 0ffh, 03h, 04h, 05h, 0ffh, 05h, 06h, 07h
+ BYTE 0ffh, 07h, 08h, 09h, 0ffh, 09h, 0ah, 0bh
+ptr_L_mldsa_decode_gamma1_17_avx2_shuff_1 QWORD L_mldsa_decode_gamma1_17_avx2_shuff_1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_17_avx2_shuff_2 BYTE 04h, 05h, 06h, 0ffh, 06h, 07h, 08h, 0ffh ; indices advanced by 4
+ BYTE 08h, 09h, 0ah, 0ffh, 0ah, 0bh, 0ch, 0ffh
+ BYTE 0ffh, 05h, 06h, 07h, 0ffh, 07h, 08h, 09h
+ BYTE 0ffh, 09h, 0ah, 0bh, 0ffh, 0bh, 0ch, 0dh
+ptr_L_mldsa_decode_gamma1_17_avx2_shuff_2 QWORD L_mldsa_decode_gamma1_17_avx2_shuff_2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_17_avx2_shuff_3 BYTE 06h, 07h, 08h, 0ffh, 08h, 09h, 0ah, 0ffh ; indices advanced by 6
+ BYTE 0ah, 0bh, 0ch, 0ffh, 0ch, 0dh, 0eh, 0ffh
+ BYTE 0ffh, 07h, 08h, 09h, 0ffh, 09h, 0ah, 0bh
+ BYTE 0ffh, 0bh, 0ch, 0dh, 0ffh, 0dh, 0eh, 0fh
+ptr_L_mldsa_decode_gamma1_17_avx2_shuff_3 QWORD L_mldsa_decode_gamma1_17_avx2_shuff_3
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_17_avx2_vs_8 DWORD 00000000h, 00000002h, 00000004h, 00000006h ; per-dword right-shift counts for vpsrlvd
+ DWORD 00000008h, 0000000ah, 0000000ch, 0000000eh
+ptr_L_mldsa_decode_gamma1_17_avx2_vs_8 QWORD L_mldsa_decode_gamma1_17_avx2_vs_8
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_17_avx2_mask DWORD 0003ffffh, 0003ffffh, 0003ffffh, 0003ffffh ; keeps the low 18 bits of each dword
+ DWORD 0003ffffh, 0003ffffh, 0003ffffh, 0003ffffh
+ptr_L_mldsa_decode_gamma1_17_avx2_mask QWORD L_mldsa_decode_gamma1_17_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_17_avx2_gamma17 DWORD 00020000h, 00020000h, 00020000h, 00020000h ; 2^17 in every dword; minuend of the final subtraction
+ DWORD 00020000h, 00020000h, 00020000h, 00020000h
+ptr_L_mldsa_decode_gamma1_17_avx2_gamma17 QWORD L_mldsa_decode_gamma1_17_avx2_gamma17
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_decode_gamma1_17_avx2 PROC ; unpacks 256 consecutive 18-bit fields from [rcx] (576 bytes read) into 256 dwords at [rdx], each stored as 2^17 - field
+ sub rsp, 128 ; room for xmm6-xmm13 (8 x 16 bytes)
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm13 are overwritten below, so save them first
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_shuff_0 ; ymm7-ymm10 = shuffle tables 0-3
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_shuff_1
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_shuff_2
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_shuff_3
+ vmovdqu ymm11, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_vs_8 ; ymm11 = shift counts
+ vmovdqu ymm12, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_mask ; ymm12 = 18-bit mask
+ vmovdqu ymm13, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_gamma17 ; ymm13 = 2^17 per dword
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; first half: input bytes 0-191 in ymm0-ymm5
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ ; 0/15
+ vpermq ymm6, ymm0, 148 ; qwords 0,1 | 1,2: each lane gets the 16-byte window holding its four fields
+ vpshufb ymm6, ymm6, ymm7 ; gather the three bytes of each field into its own dword
+ vpsrlvd ymm6, ymm6, ymm11 ; shift each field down to bit 0
+ vpand ymm6, ymm6, ymm12 ; drop bits belonging to neighbouring fields
+ vpsubd ymm6, ymm13, ymm6 ; result = 2^17 - field
+ vmovdqu YMMWORD PTR [rdx], ymm6 ; 8 output dwords per group
+ ; 1/15
+ vperm2i128 ymm6, ymm0, ymm1, 33 ; high half of ymm0 : low half of ymm1 (window straddles two loads)
+ vpermq ymm6, ymm6, 148
+ vpshufb ymm6, ymm6, ymm8
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+32], ymm6
+ ; 2/15
+ vpermq ymm6, ymm1, 148
+ vpshufb ymm6, ymm6, ymm9
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+64], ymm6
+ ; 3/15
+ vperm2i128 ymm6, ymm1, ymm2, 33
+ vpermq ymm6, ymm6, 148
+ vpshufb ymm6, ymm6, ymm10
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+96], ymm6
+ ; 4/15
+ vpermq ymm6, ymm2, 233 ; qwords 1,2 | 2,3
+ vpshufb ymm6, ymm6, ymm7
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+128], ymm6
+ ; 5/15
+ vperm2i128 ymm6, ymm2, ymm3, 33
+ vpermq ymm6, ymm6, 233
+ vpshufb ymm6, ymm6, ymm8
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+160], ymm6
+ ; 6/15
+ vpermq ymm6, ymm3, 233
+ vpshufb ymm6, ymm6, ymm9
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+192], ymm6
+ ; 7/15
+ vperm2i128 ymm6, ymm3, ymm4, 33
+ vpermq ymm6, ymm6, 233
+ vpshufb ymm6, ymm6, ymm10
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+224], ymm6
+ ; 8/15
+ vperm2i128 ymm6, ymm4, ymm5, 33
+ vpermq ymm6, ymm6, 148
+ vpshufb ymm6, ymm6, ymm7
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+256], ymm6
+ ; 9/15
+ vpermq ymm6, ymm5, 148
+ vpshufb ymm6, ymm6, ymm8
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+288], ymm6
+ vmovdqu ymm0, YMMWORD PTR [rcx+192] ; refill: input bytes 192-287 (ymm5 is still needed by group 10)
+ vmovdqu ymm1, YMMWORD PTR [rcx+224]
+ vmovdqu ymm2, YMMWORD PTR [rcx+256]
+ ; 10/15
+ vperm2i128 ymm6, ymm5, ymm0, 33
+ vpermq ymm6, ymm6, 148
+ vpshufb ymm6, ymm6, ymm9
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+320], ymm6
+ ; 11/15
+ vpermq ymm6, ymm0, 148
+ vpshufb ymm6, ymm6, ymm10
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+352], ymm6
+ ; 12/15
+ vperm2i128 ymm6, ymm0, ymm1, 33
+ vpermq ymm6, ymm6, 233
+ vpshufb ymm6, ymm6, ymm7
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+384], ymm6
+ ; 13/15
+ vpermq ymm6, ymm1, 233
+ vpshufb ymm6, ymm6, ymm8
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+416], ymm6
+ ; 14/15
+ vperm2i128 ymm6, ymm1, ymm2, 33
+ vpermq ymm6, ymm6, 233
+ vpshufb ymm6, ymm6, ymm9
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+448], ymm6
+ ; 15/15
+ vpermq ymm6, ymm2, 233
+ vpshufb ymm6, ymm6, ymm10
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+480], ymm6
+ vmovdqu ymm0, YMMWORD PTR [rcx+288] ; second half: same 16-group sequence on input bytes 288-575, output offsets 512-992
+ vmovdqu ymm1, YMMWORD PTR [rcx+320]
+ vmovdqu ymm2, YMMWORD PTR [rcx+352]
+ vmovdqu ymm3, YMMWORD PTR [rcx+384]
+ vmovdqu ymm4, YMMWORD PTR [rcx+416]
+ vmovdqu ymm5, YMMWORD PTR [rcx+448]
+ ; 0/15
+ vpermq ymm6, ymm0, 148
+ vpshufb ymm6, ymm6, ymm7
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+512], ymm6
+ ; 1/15
+ vperm2i128 ymm6, ymm0, ymm1, 33
+ vpermq ymm6, ymm6, 148
+ vpshufb ymm6, ymm6, ymm8
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+544], ymm6
+ ; 2/15
+ vpermq ymm6, ymm1, 148
+ vpshufb ymm6, ymm6, ymm9
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+576], ymm6
+ ; 3/15
+ vperm2i128 ymm6, ymm1, ymm2, 33
+ vpermq ymm6, ymm6, 148
+ vpshufb ymm6, ymm6, ymm10
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+608], ymm6
+ ; 4/15
+ vpermq ymm6, ymm2, 233
+ vpshufb ymm6, ymm6, ymm7
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+640], ymm6
+ ; 5/15
+ vperm2i128 ymm6, ymm2, ymm3, 33
+ vpermq ymm6, ymm6, 233
+ vpshufb ymm6, ymm6, ymm8
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+672], ymm6
+ ; 6/15
+ vpermq ymm6, ymm3, 233
+ vpshufb ymm6, ymm6, ymm9
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+704], ymm6
+ ; 7/15
+ vperm2i128 ymm6, ymm3, ymm4, 33
+ vpermq ymm6, ymm6, 233
+ vpshufb ymm6, ymm6, ymm10
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+736], ymm6
+ ; 8/15
+ vperm2i128 ymm6, ymm4, ymm5, 33
+ vpermq ymm6, ymm6, 148
+ vpshufb ymm6, ymm6, ymm7
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+768], ymm6
+ ; 9/15
+ vpermq ymm6, ymm5, 148
+ vpshufb ymm6, ymm6, ymm8
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+800], ymm6
+ vmovdqu ymm0, YMMWORD PTR [rcx+480] ; refill: input bytes 480-575 (last bytes read)
+ vmovdqu ymm1, YMMWORD PTR [rcx+512]
+ vmovdqu ymm2, YMMWORD PTR [rcx+544]
+ ; 10/15
+ vperm2i128 ymm6, ymm5, ymm0, 33
+ vpermq ymm6, ymm6, 148
+ vpshufb ymm6, ymm6, ymm9
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+832], ymm6
+ ; 11/15
+ vpermq ymm6, ymm0, 148
+ vpshufb ymm6, ymm6, ymm10
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+864], ymm6
+ ; 12/15
+ vperm2i128 ymm6, ymm0, ymm1, 33
+ vpermq ymm6, ymm6, 233
+ vpshufb ymm6, ymm6, ymm7
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+896], ymm6
+ ; 13/15
+ vpermq ymm6, ymm1, 233
+ vpshufb ymm6, ymm6, ymm8
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+928], ymm6
+ ; 14/15
+ vperm2i128 ymm6, ymm1, ymm2, 33
+ vpermq ymm6, ymm6, 233
+ vpshufb ymm6, ymm6, ymm9
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+960], ymm6
+ ; 15/15
+ vpermq ymm6, ymm2, 233
+ vpshufb ymm6, ymm6, ymm10
+ vpsrlvd ymm6, ymm6, ymm11
+ vpand ymm6, ymm6, ymm12
+ vpsubd ymm6, ymm13, ymm6
+ vmovdqu YMMWORD PTR [rdx+992], ymm6
+ vzeroupper ; clear upper YMM halves before returning to the caller
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the registers saved in the prologue
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ add rsp, 128
+ ret
+wc_mldsa_decode_gamma1_17_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_20_avx2_shuff_0 BYTE 00h, 01h, 02h, 0ffh, 02h, 03h, 04h, 0ffh ; vpshufb byte selectors (3 bytes per dword); 0ffh produces a zero byte
+ BYTE 05h, 06h, 07h, 0ffh, 07h, 08h, 09h, 0ffh
+ BYTE 0ffh, 02h, 03h, 04h, 0ffh, 04h, 05h, 06h ; upper lane: bytes sit one position higher, matched by shift counts that are 8 larger
+ BYTE 0ffh, 07h, 08h, 09h, 0ffh, 09h, 0ah, 0bh
+ptr_L_mldsa_decode_gamma1_20_avx2_shuff_0 QWORD L_mldsa_decode_gamma1_20_avx2_shuff_0
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_20_avx2_shuff_1 BYTE 04h, 05h, 06h, 0ffh, 06h, 07h, 08h, 0ffh ; same layout as shuff_0 with every index advanced by 4
+ BYTE 09h, 0ah, 0bh, 0ffh, 0bh, 0ch, 0dh, 0ffh
+ BYTE 0ffh, 06h, 07h, 08h, 0ffh, 08h, 09h, 0ah
+ BYTE 0ffh, 0bh, 0ch, 0dh, 0ffh, 0dh, 0eh, 0fh
+ptr_L_mldsa_decode_gamma1_20_avx2_shuff_1 QWORD L_mldsa_decode_gamma1_20_avx2_shuff_1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_20_avx2_vs_8 DWORD 00000000h, 00000004h, 00000000h, 00000004h ; per-dword right-shift counts for vpsrlvd
+ DWORD 00000008h, 0000000ch, 00000008h, 0000000ch
+ptr_L_mldsa_decode_gamma1_20_avx2_vs_8 QWORD L_mldsa_decode_gamma1_20_avx2_vs_8
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_20_avx2_mask DWORD 000fffffh, 000fffffh, 000fffffh, 000fffffh ; keeps the low 20 bits of each dword
+ DWORD 000fffffh, 000fffffh, 000fffffh, 000fffffh
+ptr_L_mldsa_decode_gamma1_20_avx2_mask QWORD L_mldsa_decode_gamma1_20_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_gamma1_20_avx2_gamma19 DWORD 00080000h, 00080000h, 00080000h, 00080000h ; 2^19 in every dword; minuend of the final subtraction
+ DWORD 00080000h, 00080000h, 00080000h, 00080000h
+ptr_L_mldsa_decode_gamma1_20_avx2_gamma19 QWORD L_mldsa_decode_gamma1_20_avx2_gamma19
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_decode_gamma1_19_avx2 PROC ; unpacks 256 consecutive 20-bit fields from [rcx] (640 bytes read) into 256 dwords at [rdx], each stored as 2^19 - field
+ sub rsp, 80 ; room for xmm6-xmm10 (5 x 16 bytes)
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm10 are overwritten below, so save them first
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu ymm6, YMMWORD PTR L_mldsa_decode_gamma1_20_avx2_shuff_0 ; ymm6, ymm7 = shuffle tables 0 and 1
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_decode_gamma1_20_avx2_shuff_1
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_gamma1_20_avx2_vs_8 ; ymm8 = shift counts
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_gamma1_20_avx2_mask ; ymm9 = 20-bit mask
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_gamma1_20_avx2_gamma19 ; ymm10 = 2^19 per dword
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; quarter 1 of 4: input bytes 0-159 in ymm0-ymm4
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ ; 0/7
+ vpermq ymm5, ymm0, 148 ; qwords 0,1 | 1,2: each lane gets the 16-byte window holding its four fields
+ vpshufb ymm5, ymm5, ymm6 ; gather the three bytes of each field into its own dword
+ vpsrlvd ymm5, ymm5, ymm8 ; shift each field down to bit 0
+ vpand ymm5, ymm5, ymm9 ; drop bits belonging to neighbouring fields
+ vpsubd ymm5, ymm10, ymm5 ; result = 2^19 - field
+ vmovdqu YMMWORD PTR [rdx], ymm5 ; 8 output dwords per group
+ ; 1/7
+ vperm2i128 ymm5, ymm0, ymm1, 33 ; high half of ymm0 : low half of ymm1 (window straddles two loads)
+ vpermq ymm5, ymm5, 148
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm5
+ ; 2/7
+ vpermq ymm5, ymm1, 233 ; qwords 1,2 | 2,3
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm5
+ ; 3/7
+ vperm2i128 ymm5, ymm1, ymm2, 33
+ vpermq ymm5, ymm5, 233
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm5
+ ; 4/7
+ vperm2i128 ymm5, ymm2, ymm3, 33
+ vpermq ymm5, ymm5, 148
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+128], ymm5
+ ; 5/7
+ vpermq ymm5, ymm3, 148
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+160], ymm5
+ ; 6/7
+ vperm2i128 ymm5, ymm3, ymm4, 33
+ vpermq ymm5, ymm5, 233
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+192], ymm5
+ ; 7/7
+ vpermq ymm5, ymm4, 233
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+224], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rcx+160] ; quarter 2 of 4: input bytes 160-319, output offsets 256-480
+ vmovdqu ymm1, YMMWORD PTR [rcx+192]
+ vmovdqu ymm2, YMMWORD PTR [rcx+224]
+ vmovdqu ymm3, YMMWORD PTR [rcx+256]
+ vmovdqu ymm4, YMMWORD PTR [rcx+288]
+ ; 0/7
+ vpermq ymm5, ymm0, 148
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+256], ymm5
+ ; 1/7
+ vperm2i128 ymm5, ymm0, ymm1, 33
+ vpermq ymm5, ymm5, 148
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+288], ymm5
+ ; 2/7
+ vpermq ymm5, ymm1, 233
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+320], ymm5
+ ; 3/7
+ vperm2i128 ymm5, ymm1, ymm2, 33
+ vpermq ymm5, ymm5, 233
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+352], ymm5
+ ; 4/7
+ vperm2i128 ymm5, ymm2, ymm3, 33
+ vpermq ymm5, ymm5, 148
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+384], ymm5
+ ; 5/7
+ vpermq ymm5, ymm3, 148
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+416], ymm5
+ ; 6/7
+ vperm2i128 ymm5, ymm3, ymm4, 33
+ vpermq ymm5, ymm5, 233
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+448], ymm5
+ ; 7/7
+ vpermq ymm5, ymm4, 233
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+480], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rcx+320] ; quarter 3 of 4: input bytes 320-479, output offsets 512-736
+ vmovdqu ymm1, YMMWORD PTR [rcx+352]
+ vmovdqu ymm2, YMMWORD PTR [rcx+384]
+ vmovdqu ymm3, YMMWORD PTR [rcx+416]
+ vmovdqu ymm4, YMMWORD PTR [rcx+448]
+ ; 0/7
+ vpermq ymm5, ymm0, 148
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+512], ymm5
+ ; 1/7
+ vperm2i128 ymm5, ymm0, ymm1, 33
+ vpermq ymm5, ymm5, 148
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+544], ymm5
+ ; 2/7
+ vpermq ymm5, ymm1, 233
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+576], ymm5
+ ; 3/7
+ vperm2i128 ymm5, ymm1, ymm2, 33
+ vpermq ymm5, ymm5, 233
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+608], ymm5
+ ; 4/7
+ vperm2i128 ymm5, ymm2, ymm3, 33
+ vpermq ymm5, ymm5, 148
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+640], ymm5
+ ; 5/7
+ vpermq ymm5, ymm3, 148
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+672], ymm5
+ ; 6/7
+ vperm2i128 ymm5, ymm3, ymm4, 33
+ vpermq ymm5, ymm5, 233
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+704], ymm5
+ ; 7/7
+ vpermq ymm5, ymm4, 233
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+736], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rcx+480] ; quarter 4 of 4: input bytes 480-639 (last bytes read), output offsets 768-992
+ vmovdqu ymm1, YMMWORD PTR [rcx+512]
+ vmovdqu ymm2, YMMWORD PTR [rcx+544]
+ vmovdqu ymm3, YMMWORD PTR [rcx+576]
+ vmovdqu ymm4, YMMWORD PTR [rcx+608]
+ ; 0/7
+ vpermq ymm5, ymm0, 148
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+768], ymm5
+ ; 1/7
+ vperm2i128 ymm5, ymm0, ymm1, 33
+ vpermq ymm5, ymm5, 148
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+800], ymm5
+ ; 2/7
+ vpermq ymm5, ymm1, 233
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+832], ymm5
+ ; 3/7
+ vperm2i128 ymm5, ymm1, ymm2, 33
+ vpermq ymm5, ymm5, 233
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+864], ymm5
+ ; 4/7
+ vperm2i128 ymm5, ymm2, ymm3, 33
+ vpermq ymm5, ymm5, 148
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+896], ymm5
+ ; 5/7
+ vpermq ymm5, ymm3, 148
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+928], ymm5
+ ; 6/7
+ vperm2i128 ymm5, ymm3, ymm4, 33
+ vpermq ymm5, ymm5, 233
+ vpshufb ymm5, ymm5, ymm6
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+960], ymm5
+ ; 7/7
+ vpermq ymm5, ymm4, 233
+ vpshufb ymm5, ymm5, ymm7
+ vpsrlvd ymm5, ymm5, ymm8
+ vpand ymm5, ymm5, ymm9
+ vpsubd ymm5, ymm10, ymm5
+ vmovdqu YMMWORD PTR [rdx+992], ymm5
+ vzeroupper ; clear upper YMM halves before returning to the caller
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the registers saved in the prologue
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ add rsp, 80
+ ret
+wc_mldsa_decode_gamma1_19_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; constant: value each input dword is subtracted from in wc_mldsa_encode_gamma1_17_avx2
+ALIGN 16 ; table is 32 bytes but is read with vmovdqu (unaligned load)
+L_mldsa_encode_gamma1_17_avx2_gamma17 DWORD 00020000h, 00020000h, 00020000h, 00020000h ; 8 x 00020000h (2^17)
+ DWORD 00020000h, 00020000h, 00020000h, 00020000h
+ptr_L_mldsa_encode_gamma1_17_avx2_gamma17 QWORD L_mldsa_encode_gamma1_17_avx2_gamma17 ; address of the table above
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control for the even dwords (0 and 2) of each 128-bit lane
+ALIGN 16
+L_mldsa_encode_gamma1_17_avx2_shuff_even BYTE 00h, 01h, 02h, 0ffh, 08h, 09h, 0ah, 0ffh ; src bytes 0-2 -> out 0-2, src bytes 8-10 -> out 4-6
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh ; 0ffh: vpshufb writes zero to that output byte
+ BYTE 00h, 01h, 02h, 0ffh, 08h, 09h, 0ah, 0ffh ; same pattern for the upper 128-bit lane
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_gamma1_17_avx2_shuff_even QWORD L_mldsa_encode_gamma1_17_avx2_shuff_even ; address of the table above
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control for the odd dwords (1 and 3) of each 128-bit lane
+ALIGN 16
+L_mldsa_encode_gamma1_17_avx2_shuff_odd BYTE 0ffh, 0ffh, 04h, 05h, 06h, 0ffh, 0ch, 0dh ; src bytes 4-6 -> out 2-4, src bytes 12-14 -> out 6-8
+ BYTE 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 04h, 05h, 06h, 0ffh, 0ch, 0dh ; same pattern for the upper 128-bit lane
+ BYTE 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_gamma1_17_avx2_shuff_odd QWORD L_mldsa_encode_gamma1_17_avx2_shuff_odd ; address of the table above
+_DATA ENDS
+_DATA SEGMENT ; per-dword left-shift counts consumed by vpsllvd
+ALIGN 16
+L_mldsa_encode_gamma1_17_avx2_vs DWORD 00000000h, 00000002h, 00000004h, 00000006h ; dword j of a lane is shifted left by 2*j bits
+ DWORD 00000000h, 00000002h, 00000004h, 00000006h ; same counts for the upper 128-bit lane
+ptr_L_mldsa_encode_gamma1_17_avx2_vs QWORD L_mldsa_encode_gamma1_17_avx2_vs ; address of the table above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_encode_gamma1_17_avx2 PROC ; rcx: 256 input dwords (1024 bytes read); rdx: output buffer (576 bytes written)
+ sub rsp, 96 ; spill area for xmm6-xmm11, which this routine overwrites
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ ; Constants: ymm8 = 2^17 per dword, ymm9/ymm10 = even/odd byte shuffles, ymm11 = per-dword shift counts
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_gamma1_17_avx2_gamma17
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_encode_gamma1_17_avx2_shuff_even
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_encode_gamma1_17_avx2_shuff_odd
+ vmovdqu ymm11, YMMWORD PTR L_mldsa_encode_gamma1_17_avx2_vs
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; input dwords 0..31
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpsubd ymm0, ymm8, ymm0 ; t = 2^17 - input dword
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11 ; dword j of each lane shifted left by 2*j bits
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10 ; odd dwords: low 3 bytes moved to packed positions
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9 ; even dwords: low 3 bytes moved to packed positions
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4 ; merge: 9 packed bytes per 128-bit lane (4 x 18 bits, assuming each t fits in 18 bits)
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1 ; upper lane copied out so it can be stored separately
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0 ; VEX-encoded store: avoids mixing legacy SSE with dirty upper YMM state
+ vpextrb BYTE PTR [rdx+8], xmm0, 8 ; ninth byte of the low lane
+ vmovq QWORD PTR [rdx+9], xmm4 ; upper lane's 9 bytes follow immediately
+ vpextrb BYTE PTR [rdx+17], xmm4, 8
+ add rdx, 18 ; 8 input dwords -> 18 output bytes
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrb BYTE PTR [rdx+8], xmm1, 8
+ vmovq QWORD PTR [rdx+9], xmm5
+ vpextrb BYTE PTR [rdx+17], xmm5, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrb BYTE PTR [rdx+8], xmm2, 8
+ vmovq QWORD PTR [rdx+9], xmm6
+ vpextrb BYTE PTR [rdx+17], xmm6, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrb BYTE PTR [rdx+8], xmm3, 8
+ vmovq QWORD PTR [rdx+9], xmm7
+ vpextrb BYTE PTR [rdx+17], xmm7, 8
+ add rdx, 18
+ vmovdqu ymm0, YMMWORD PTR [rcx+128] ; input dwords 32..63, same sequence as above
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrb BYTE PTR [rdx+8], xmm0, 8
+ vmovq QWORD PTR [rdx+9], xmm4
+ vpextrb BYTE PTR [rdx+17], xmm4, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrb BYTE PTR [rdx+8], xmm1, 8
+ vmovq QWORD PTR [rdx+9], xmm5
+ vpextrb BYTE PTR [rdx+17], xmm5, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrb BYTE PTR [rdx+8], xmm2, 8
+ vmovq QWORD PTR [rdx+9], xmm6
+ vpextrb BYTE PTR [rdx+17], xmm6, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrb BYTE PTR [rdx+8], xmm3, 8
+ vmovq QWORD PTR [rdx+9], xmm7
+ vpextrb BYTE PTR [rdx+17], xmm7, 8
+ add rdx, 18
+ vmovdqu ymm0, YMMWORD PTR [rcx+256] ; input dwords 64..95
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrb BYTE PTR [rdx+8], xmm0, 8
+ vmovq QWORD PTR [rdx+9], xmm4
+ vpextrb BYTE PTR [rdx+17], xmm4, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrb BYTE PTR [rdx+8], xmm1, 8
+ vmovq QWORD PTR [rdx+9], xmm5
+ vpextrb BYTE PTR [rdx+17], xmm5, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrb BYTE PTR [rdx+8], xmm2, 8
+ vmovq QWORD PTR [rdx+9], xmm6
+ vpextrb BYTE PTR [rdx+17], xmm6, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrb BYTE PTR [rdx+8], xmm3, 8
+ vmovq QWORD PTR [rdx+9], xmm7
+ vpextrb BYTE PTR [rdx+17], xmm7, 8
+ add rdx, 18
+ vmovdqu ymm0, YMMWORD PTR [rcx+384] ; input dwords 96..127
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrb BYTE PTR [rdx+8], xmm0, 8
+ vmovq QWORD PTR [rdx+9], xmm4
+ vpextrb BYTE PTR [rdx+17], xmm4, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrb BYTE PTR [rdx+8], xmm1, 8
+ vmovq QWORD PTR [rdx+9], xmm5
+ vpextrb BYTE PTR [rdx+17], xmm5, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrb BYTE PTR [rdx+8], xmm2, 8
+ vmovq QWORD PTR [rdx+9], xmm6
+ vpextrb BYTE PTR [rdx+17], xmm6, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrb BYTE PTR [rdx+8], xmm3, 8
+ vmovq QWORD PTR [rdx+9], xmm7
+ vpextrb BYTE PTR [rdx+17], xmm7, 8
+ add rdx, 18
+ vmovdqu ymm0, YMMWORD PTR [rcx+512] ; input dwords 128..159
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrb BYTE PTR [rdx+8], xmm0, 8
+ vmovq QWORD PTR [rdx+9], xmm4
+ vpextrb BYTE PTR [rdx+17], xmm4, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrb BYTE PTR [rdx+8], xmm1, 8
+ vmovq QWORD PTR [rdx+9], xmm5
+ vpextrb BYTE PTR [rdx+17], xmm5, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrb BYTE PTR [rdx+8], xmm2, 8
+ vmovq QWORD PTR [rdx+9], xmm6
+ vpextrb BYTE PTR [rdx+17], xmm6, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrb BYTE PTR [rdx+8], xmm3, 8
+ vmovq QWORD PTR [rdx+9], xmm7
+ vpextrb BYTE PTR [rdx+17], xmm7, 8
+ add rdx, 18
+ vmovdqu ymm0, YMMWORD PTR [rcx+640] ; input dwords 160..191
+ vmovdqu ymm1, YMMWORD PTR [rcx+672]
+ vmovdqu ymm2, YMMWORD PTR [rcx+704]
+ vmovdqu ymm3, YMMWORD PTR [rcx+736]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrb BYTE PTR [rdx+8], xmm0, 8
+ vmovq QWORD PTR [rdx+9], xmm4
+ vpextrb BYTE PTR [rdx+17], xmm4, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrb BYTE PTR [rdx+8], xmm1, 8
+ vmovq QWORD PTR [rdx+9], xmm5
+ vpextrb BYTE PTR [rdx+17], xmm5, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrb BYTE PTR [rdx+8], xmm2, 8
+ vmovq QWORD PTR [rdx+9], xmm6
+ vpextrb BYTE PTR [rdx+17], xmm6, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrb BYTE PTR [rdx+8], xmm3, 8
+ vmovq QWORD PTR [rdx+9], xmm7
+ vpextrb BYTE PTR [rdx+17], xmm7, 8
+ add rdx, 18
+ vmovdqu ymm0, YMMWORD PTR [rcx+768] ; input dwords 192..223
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrb BYTE PTR [rdx+8], xmm0, 8
+ vmovq QWORD PTR [rdx+9], xmm4
+ vpextrb BYTE PTR [rdx+17], xmm4, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrb BYTE PTR [rdx+8], xmm1, 8
+ vmovq QWORD PTR [rdx+9], xmm5
+ vpextrb BYTE PTR [rdx+17], xmm5, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrb BYTE PTR [rdx+8], xmm2, 8
+ vmovq QWORD PTR [rdx+9], xmm6
+ vpextrb BYTE PTR [rdx+17], xmm6, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrb BYTE PTR [rdx+8], xmm3, 8
+ vmovq QWORD PTR [rdx+9], xmm7
+ vpextrb BYTE PTR [rdx+17], xmm7, 8
+ add rdx, 18
+ vmovdqu ymm0, YMMWORD PTR [rcx+896] ; input dwords 224..255
+ vmovdqu ymm1, YMMWORD PTR [rcx+928]
+ vmovdqu ymm2, YMMWORD PTR [rcx+960]
+ vmovdqu ymm3, YMMWORD PTR [rcx+992]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrb BYTE PTR [rdx+8], xmm0, 8
+ vmovq QWORD PTR [rdx+9], xmm4
+ vpextrb BYTE PTR [rdx+17], xmm4, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrb BYTE PTR [rdx+8], xmm1, 8
+ vmovq QWORD PTR [rdx+9], xmm5
+ vpextrb BYTE PTR [rdx+17], xmm5, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrb BYTE PTR [rdx+8], xmm2, 8
+ vmovq QWORD PTR [rdx+9], xmm6
+ vpextrb BYTE PTR [rdx+17], xmm6, 8
+ add rdx, 18
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrb BYTE PTR [rdx+8], xmm3, 8
+ vmovq QWORD PTR [rdx+9], xmm7
+ vpextrb BYTE PTR [rdx+17], xmm7, 8
+ add rdx, 18
+ vzeroupper ; clear upper YMM halves before returning
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the xmm registers saved on entry
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96
+ ret
+wc_mldsa_encode_gamma1_17_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; constant: value each input dword is subtracted from in wc_mldsa_encode_gamma1_19_avx2
+ALIGN 16 ; table is 32 bytes but is read with vmovdqu (unaligned load)
+L_mldsa_encode_gamma1_19_avx2_gamma19 DWORD 00080000h, 00080000h, 00080000h, 00080000h ; 8 x 00080000h (2^19)
+ DWORD 00080000h, 00080000h, 00080000h, 00080000h
+ptr_L_mldsa_encode_gamma1_19_avx2_gamma19 QWORD L_mldsa_encode_gamma1_19_avx2_gamma19 ; address of the table above
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control for the even dwords (0 and 2) of each 128-bit lane
+ALIGN 16
+L_mldsa_encode_gamma1_19_avx2_shuff_even BYTE 00h, 01h, 02h, 0ffh, 0ffh, 08h, 09h, 0ah ; src bytes 0-2 -> out 0-2, src bytes 8-10 -> out 5-7
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh ; 0ffh: vpshufb writes zero to that output byte
+ BYTE 00h, 01h, 02h, 0ffh, 0ffh, 08h, 09h, 0ah ; same pattern for the upper 128-bit lane
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_gamma1_19_avx2_shuff_even QWORD L_mldsa_encode_gamma1_19_avx2_shuff_even ; address of the table above
+_DATA ENDS
+_DATA SEGMENT ; vpshufb control for the odd dwords (1 and 3) of each 128-bit lane
+ALIGN 16
+L_mldsa_encode_gamma1_19_avx2_shuff_odd BYTE 0ffh, 0ffh, 04h, 05h, 06h, 0ffh, 0ffh, 0ch ; src bytes 4-6 -> out 2-4, src bytes 12-14 -> out 7-9
+ BYTE 0dh, 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 0ffh, 0ffh, 04h, 05h, 06h, 0ffh, 0ffh, 0ch ; same pattern for the upper 128-bit lane
+ BYTE 0dh, 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_gamma1_19_avx2_shuff_odd QWORD L_mldsa_encode_gamma1_19_avx2_shuff_odd ; address of the table above
+_DATA ENDS
+_DATA SEGMENT ; per-dword left-shift counts consumed by vpsllvd
+ALIGN 16
+L_mldsa_encode_gamma1_19_avx2_vs DWORD 00000000h, 00000004h, 00000000h, 00000004h ; even dwords unshifted, odd dwords shifted left by 4 bits
+ DWORD 00000000h, 00000004h, 00000000h, 00000004h ; same counts for the upper 128-bit lane
+ptr_L_mldsa_encode_gamma1_19_avx2_vs QWORD L_mldsa_encode_gamma1_19_avx2_vs ; address of the table above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_encode_gamma1_19_avx2 PROC ; rcx: 256 input dwords (1024 bytes read); rdx: output buffer (640 bytes written)
+ sub rsp, 96 ; spill area for xmm6-xmm11, which this routine overwrites
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ ; Constants: ymm8 = 2^19 per dword, ymm9/ymm10 = even/odd byte shuffles, ymm11 = per-dword shift counts
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_gamma1_19_avx2_gamma19
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_encode_gamma1_19_avx2_shuff_even
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_encode_gamma1_19_avx2_shuff_odd
+ vmovdqu ymm11, YMMWORD PTR L_mldsa_encode_gamma1_19_avx2_vs
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; input dwords 0..31
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpsubd ymm0, ymm8, ymm0 ; t = 2^19 - input dword
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11 ; odd dwords shifted left by 4 bits, even dwords unchanged
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10 ; odd dwords: low 3 bytes moved to packed positions
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9 ; even dwords: low 3 bytes moved to packed positions
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4 ; merge: 10 packed bytes per 128-bit lane (4 x 20 bits, assuming each t fits in 20 bits)
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1 ; upper lane copied out so it can be stored separately
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0 ; VEX-encoded store: avoids mixing legacy SSE with dirty upper YMM state
+ vpextrw WORD PTR [rdx+8], xmm0, 4 ; bytes 8-9 of the low lane
+ vmovq QWORD PTR [rdx+10], xmm4 ; upper lane's 10 bytes follow immediately
+ vpextrw WORD PTR [rdx+18], xmm4, 4
+ add rdx, 20 ; 8 input dwords -> 20 output bytes
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrw WORD PTR [rdx+8], xmm1, 4
+ vmovq QWORD PTR [rdx+10], xmm5
+ vpextrw WORD PTR [rdx+18], xmm5, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrw WORD PTR [rdx+8], xmm2, 4
+ vmovq QWORD PTR [rdx+10], xmm6
+ vpextrw WORD PTR [rdx+18], xmm6, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrw WORD PTR [rdx+8], xmm3, 4
+ vmovq QWORD PTR [rdx+10], xmm7
+ vpextrw WORD PTR [rdx+18], xmm7, 4
+ add rdx, 20
+ vmovdqu ymm0, YMMWORD PTR [rcx+128] ; input dwords 32..63, same sequence as above
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrw WORD PTR [rdx+8], xmm0, 4
+ vmovq QWORD PTR [rdx+10], xmm4
+ vpextrw WORD PTR [rdx+18], xmm4, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrw WORD PTR [rdx+8], xmm1, 4
+ vmovq QWORD PTR [rdx+10], xmm5
+ vpextrw WORD PTR [rdx+18], xmm5, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrw WORD PTR [rdx+8], xmm2, 4
+ vmovq QWORD PTR [rdx+10], xmm6
+ vpextrw WORD PTR [rdx+18], xmm6, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrw WORD PTR [rdx+8], xmm3, 4
+ vmovq QWORD PTR [rdx+10], xmm7
+ vpextrw WORD PTR [rdx+18], xmm7, 4
+ add rdx, 20
+ vmovdqu ymm0, YMMWORD PTR [rcx+256] ; input dwords 64..95
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrw WORD PTR [rdx+8], xmm0, 4
+ vmovq QWORD PTR [rdx+10], xmm4
+ vpextrw WORD PTR [rdx+18], xmm4, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrw WORD PTR [rdx+8], xmm1, 4
+ vmovq QWORD PTR [rdx+10], xmm5
+ vpextrw WORD PTR [rdx+18], xmm5, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrw WORD PTR [rdx+8], xmm2, 4
+ vmovq QWORD PTR [rdx+10], xmm6
+ vpextrw WORD PTR [rdx+18], xmm6, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrw WORD PTR [rdx+8], xmm3, 4
+ vmovq QWORD PTR [rdx+10], xmm7
+ vpextrw WORD PTR [rdx+18], xmm7, 4
+ add rdx, 20
+ vmovdqu ymm0, YMMWORD PTR [rcx+384] ; input dwords 96..127
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrw WORD PTR [rdx+8], xmm0, 4
+ vmovq QWORD PTR [rdx+10], xmm4
+ vpextrw WORD PTR [rdx+18], xmm4, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrw WORD PTR [rdx+8], xmm1, 4
+ vmovq QWORD PTR [rdx+10], xmm5
+ vpextrw WORD PTR [rdx+18], xmm5, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrw WORD PTR [rdx+8], xmm2, 4
+ vmovq QWORD PTR [rdx+10], xmm6
+ vpextrw WORD PTR [rdx+18], xmm6, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrw WORD PTR [rdx+8], xmm3, 4
+ vmovq QWORD PTR [rdx+10], xmm7
+ vpextrw WORD PTR [rdx+18], xmm7, 4
+ add rdx, 20
+ vmovdqu ymm0, YMMWORD PTR [rcx+512] ; input dwords 128..159
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrw WORD PTR [rdx+8], xmm0, 4
+ vmovq QWORD PTR [rdx+10], xmm4
+ vpextrw WORD PTR [rdx+18], xmm4, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrw WORD PTR [rdx+8], xmm1, 4
+ vmovq QWORD PTR [rdx+10], xmm5
+ vpextrw WORD PTR [rdx+18], xmm5, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrw WORD PTR [rdx+8], xmm2, 4
+ vmovq QWORD PTR [rdx+10], xmm6
+ vpextrw WORD PTR [rdx+18], xmm6, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrw WORD PTR [rdx+8], xmm3, 4
+ vmovq QWORD PTR [rdx+10], xmm7
+ vpextrw WORD PTR [rdx+18], xmm7, 4
+ add rdx, 20
+ vmovdqu ymm0, YMMWORD PTR [rcx+640] ; input dwords 160..191
+ vmovdqu ymm1, YMMWORD PTR [rcx+672]
+ vmovdqu ymm2, YMMWORD PTR [rcx+704]
+ vmovdqu ymm3, YMMWORD PTR [rcx+736]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrw WORD PTR [rdx+8], xmm0, 4
+ vmovq QWORD PTR [rdx+10], xmm4
+ vpextrw WORD PTR [rdx+18], xmm4, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrw WORD PTR [rdx+8], xmm1, 4
+ vmovq QWORD PTR [rdx+10], xmm5
+ vpextrw WORD PTR [rdx+18], xmm5, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrw WORD PTR [rdx+8], xmm2, 4
+ vmovq QWORD PTR [rdx+10], xmm6
+ vpextrw WORD PTR [rdx+18], xmm6, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrw WORD PTR [rdx+8], xmm3, 4
+ vmovq QWORD PTR [rdx+10], xmm7
+ vpextrw WORD PTR [rdx+18], xmm7, 4
+ add rdx, 20
+ vmovdqu ymm0, YMMWORD PTR [rcx+768] ; input dwords 192..223
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrw WORD PTR [rdx+8], xmm0, 4
+ vmovq QWORD PTR [rdx+10], xmm4
+ vpextrw WORD PTR [rdx+18], xmm4, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrw WORD PTR [rdx+8], xmm1, 4
+ vmovq QWORD PTR [rdx+10], xmm5
+ vpextrw WORD PTR [rdx+18], xmm5, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrw WORD PTR [rdx+8], xmm2, 4
+ vmovq QWORD PTR [rdx+10], xmm6
+ vpextrw WORD PTR [rdx+18], xmm6, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrw WORD PTR [rdx+8], xmm3, 4
+ vmovq QWORD PTR [rdx+10], xmm7
+ vpextrw WORD PTR [rdx+18], xmm7, 4
+ add rdx, 20
+ vmovdqu ymm0, YMMWORD PTR [rcx+896] ; input dwords 224..255
+ vmovdqu ymm1, YMMWORD PTR [rcx+928]
+ vmovdqu ymm2, YMMWORD PTR [rcx+960]
+ vmovdqu ymm3, YMMWORD PTR [rcx+992]
+ vpsubd ymm0, ymm8, ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsllvd ymm0, ymm0, ymm11
+ vpsllvd ymm1, ymm1, ymm11
+ vpsllvd ymm2, ymm2, ymm11
+ vpsllvd ymm3, ymm3, ymm11
+ vpshufb ymm4, ymm0, ymm10
+ vpshufb ymm5, ymm1, ymm10
+ vpshufb ymm6, ymm2, ymm10
+ vpshufb ymm7, ymm3, ymm10
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpshufb ymm2, ymm2, ymm9
+ vpshufb ymm3, ymm3, ymm9
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm5
+ vpor ymm2, ymm2, ymm6
+ vpor ymm3, ymm3, ymm7
+ vextracti128 xmm4, ymm0, 1
+ vextracti128 xmm5, ymm1, 1
+ vextracti128 xmm6, ymm2, 1
+ vextracti128 xmm7, ymm3, 1
+ vmovq QWORD PTR [rdx], xmm0
+ vpextrw WORD PTR [rdx+8], xmm0, 4
+ vmovq QWORD PTR [rdx+10], xmm4
+ vpextrw WORD PTR [rdx+18], xmm4, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm1
+ vpextrw WORD PTR [rdx+8], xmm1, 4
+ vmovq QWORD PTR [rdx+10], xmm5
+ vpextrw WORD PTR [rdx+18], xmm5, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm2
+ vpextrw WORD PTR [rdx+8], xmm2, 4
+ vmovq QWORD PTR [rdx+10], xmm6
+ vpextrw WORD PTR [rdx+18], xmm6, 4
+ add rdx, 20
+ vmovq QWORD PTR [rdx], xmm3
+ vpextrw WORD PTR [rdx+8], xmm3, 4
+ vmovq QWORD PTR [rdx+10], xmm7
+ vpextrw WORD PTR [rdx+18], xmm7, 4
+ add rdx, 20
+ vzeroupper ; clear upper YMM halves before returning
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the xmm registers saved on entry
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96
+ ret
+wc_mldsa_encode_gamma1_19_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; constant loaded into ymm12 by wc_mldsa_decompose_q88_avx2
+ALIGN 16 ; table is 32 bytes but is read with vmovdqu (unaligned load)
+L_mldsa_decompose_q88_avx2_q_low_88 DWORD 00017400h, 00017400h, 00017400h, 00017400h ; 8 x 00017400h (95232); presumably (q-1)/88 for q = 8380417 - TODO confirm
+ DWORD 00017400h, 00017400h, 00017400h, 00017400h
+ptr_L_mldsa_decompose_q88_avx2_q_low_88 QWORD L_mldsa_decompose_q88_avx2_q_low_88 ; address of the table above
+_DATA ENDS
+_DATA SEGMENT ; constant loaded into ymm13 by wc_mldsa_decompose_q88_avx2 (multiplier for the quotient estimate)
+ALIGN 16
+L_mldsa_decompose_q88_avx2_q_low_88_2 DWORD 0002e800h, 0002e800h, 0002e800h, 0002e800h ; 8 x 0002e800h (190464), twice the q_low_88 value
+ DWORD 0002e800h, 0002e800h, 0002e800h, 0002e800h
+ptr_L_mldsa_decompose_q88_avx2_q_low_88_2 QWORD L_mldsa_decompose_q88_avx2_q_low_88_2 ; address of the table above
+_DATA ENDS
+_DATA SEGMENT ; constant loaded into ymm14 by wc_mldsa_decompose_q88_avx2 (added before the shift right by 23)
+ALIGN 16
+L_mldsa_decompose_q88_avx2_q_2 DWORD 003fefd4h, 003fefd4h, 003fefd4h, 003fefd4h ; 8 x 003fefd4h (4190164); NOTE(review): looks like a rounding bias - confirm derivation
+ DWORD 003fefd4h, 003fefd4h, 003fefd4h, 003fefd4h
+ptr_L_mldsa_decompose_q88_avx2_q_2 QWORD L_mldsa_decompose_q88_avx2_q_2 ; address of the table above
+_DATA ENDS
+_DATA SEGMENT ; constant loaded into ymm15 by wc_mldsa_decompose_q88_avx2 (multiplier and comparison value)
+ALIGN 16
+L_mldsa_decompose_q88_avx2_44 DWORD 0000002ch, 0000002ch, 0000002ch, 0000002ch ; 8 x 0000002ch (44)
+ DWORD 0000002ch, 0000002ch, 0000002ch, 0000002ch
+ptr_L_mldsa_decompose_q88_avx2_44 QWORD L_mldsa_decompose_q88_avx2_44 ; address of the table above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_decompose_q88_avx2 PROC
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vmovdqu ymm12, YMMWORD PTR L_mldsa_decompose_q88_avx2_q_low_88
+ vmovdqu ymm13, YMMWORD PTR L_mldsa_decompose_q88_avx2_q_low_88_2
+ vmovdqu ymm14, YMMWORD PTR L_mldsa_decompose_q88_avx2_q_2
+ vmovdqu ymm15, YMMWORD PTR L_mldsa_decompose_q88_avx2_44
+ ; 1/4 vectors
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx], ymm4
+ vmovdqu YMMWORD PTR [rdx+32], ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm6
+ vmovdqu YMMWORD PTR [rdx+96], ymm7
+ vmovdqu YMMWORD PTR [r8], ymm8
+ vmovdqu YMMWORD PTR [r8+32], ymm9
+ vmovdqu YMMWORD PTR [r8+64], ymm10
+ vmovdqu YMMWORD PTR [r8+96], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+128]
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+128], ymm4
+ vmovdqu YMMWORD PTR [rdx+160], ymm5
+ vmovdqu YMMWORD PTR [rdx+192], ymm6
+ vmovdqu YMMWORD PTR [rdx+224], ymm7
+ vmovdqu YMMWORD PTR [r8+128], ymm8
+ vmovdqu YMMWORD PTR [r8+160], ymm9
+ vmovdqu YMMWORD PTR [r8+192], ymm10
+ vmovdqu YMMWORD PTR [r8+224], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+256], ymm4
+ vmovdqu YMMWORD PTR [rdx+288], ymm5
+ vmovdqu YMMWORD PTR [rdx+320], ymm6
+ vmovdqu YMMWORD PTR [rdx+352], ymm7
+ vmovdqu YMMWORD PTR [r8+256], ymm8
+ vmovdqu YMMWORD PTR [r8+288], ymm9
+ vmovdqu YMMWORD PTR [r8+320], ymm10
+ vmovdqu YMMWORD PTR [r8+352], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+384]
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+384], ymm4
+ vmovdqu YMMWORD PTR [rdx+416], ymm5
+ vmovdqu YMMWORD PTR [rdx+448], ymm6
+ vmovdqu YMMWORD PTR [rdx+480], ymm7
+ vmovdqu YMMWORD PTR [r8+384], ymm8
+ vmovdqu YMMWORD PTR [r8+416], ymm9
+ vmovdqu YMMWORD PTR [r8+448], ymm10
+ vmovdqu YMMWORD PTR [r8+480], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+512], ymm4
+ vmovdqu YMMWORD PTR [rdx+544], ymm5
+ vmovdqu YMMWORD PTR [rdx+576], ymm6
+ vmovdqu YMMWORD PTR [rdx+608], ymm7
+ vmovdqu YMMWORD PTR [r8+512], ymm8
+ vmovdqu YMMWORD PTR [r8+544], ymm9
+ vmovdqu YMMWORD PTR [r8+576], ymm10
+ vmovdqu YMMWORD PTR [r8+608], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+640]
+ vmovdqu ymm1, YMMWORD PTR [rcx+672]
+ vmovdqu ymm2, YMMWORD PTR [rcx+704]
+ vmovdqu ymm3, YMMWORD PTR [rcx+736]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+640], ymm4
+ vmovdqu YMMWORD PTR [rdx+672], ymm5
+ vmovdqu YMMWORD PTR [rdx+704], ymm6
+ vmovdqu YMMWORD PTR [rdx+736], ymm7
+ vmovdqu YMMWORD PTR [r8+640], ymm8
+ vmovdqu YMMWORD PTR [r8+672], ymm9
+ vmovdqu YMMWORD PTR [r8+704], ymm10
+ vmovdqu YMMWORD PTR [r8+736], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+768], ymm4
+ vmovdqu YMMWORD PTR [rdx+800], ymm5
+ vmovdqu YMMWORD PTR [rdx+832], ymm6
+ vmovdqu YMMWORD PTR [rdx+864], ymm7
+ vmovdqu YMMWORD PTR [r8+768], ymm8
+ vmovdqu YMMWORD PTR [r8+800], ymm9
+ vmovdqu YMMWORD PTR [r8+832], ymm10
+ vmovdqu YMMWORD PTR [r8+864], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+896]
+ vmovdqu ymm1, YMMWORD PTR [rcx+928]
+ vmovdqu ymm2, YMMWORD PTR [rcx+960]
+ vmovdqu ymm3, YMMWORD PTR [rcx+992]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+896], ymm4
+ vmovdqu YMMWORD PTR [rdx+928], ymm5
+ vmovdqu YMMWORD PTR [rdx+960], ymm6
+ vmovdqu YMMWORD PTR [rdx+992], ymm7
+ vmovdqu YMMWORD PTR [r8+896], ymm8
+ vmovdqu YMMWORD PTR [r8+928], ymm9
+ vmovdqu YMMWORD PTR [r8+960], ymm10
+ vmovdqu YMMWORD PTR [r8+992], ymm11
+ ; 2/4 vectors
+ vmovdqu ymm0, YMMWORD PTR [rcx+1024]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1056]
+ vmovdqu ymm2, YMMWORD PTR [rcx+1088]
+ vmovdqu ymm3, YMMWORD PTR [rcx+1120]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+1024], ymm4
+ vmovdqu YMMWORD PTR [rdx+1056], ymm5
+ vmovdqu YMMWORD PTR [rdx+1088], ymm6
+ vmovdqu YMMWORD PTR [rdx+1120], ymm7
+ vmovdqu YMMWORD PTR [r8+1024], ymm8
+ vmovdqu YMMWORD PTR [r8+1056], ymm9
+ vmovdqu YMMWORD PTR [r8+1088], ymm10
+ vmovdqu YMMWORD PTR [r8+1120], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+1152]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1184]
+ vmovdqu ymm2, YMMWORD PTR [rcx+1216]
+ vmovdqu ymm3, YMMWORD PTR [rcx+1248]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+1152], ymm4
+ vmovdqu YMMWORD PTR [rdx+1184], ymm5
+ vmovdqu YMMWORD PTR [rdx+1216], ymm6
+ vmovdqu YMMWORD PTR [rdx+1248], ymm7
+ vmovdqu YMMWORD PTR [r8+1152], ymm8
+ vmovdqu YMMWORD PTR [r8+1184], ymm9
+ vmovdqu YMMWORD PTR [r8+1216], ymm10
+ vmovdqu YMMWORD PTR [r8+1248], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+1280]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1312]
+ vmovdqu ymm2, YMMWORD PTR [rcx+1344]
+ vmovdqu ymm3, YMMWORD PTR [rcx+1376]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+1280], ymm4
+ vmovdqu YMMWORD PTR [rdx+1312], ymm5
+ vmovdqu YMMWORD PTR [rdx+1344], ymm6
+ vmovdqu YMMWORD PTR [rdx+1376], ymm7
+ vmovdqu YMMWORD PTR [r8+1280], ymm8
+ vmovdqu YMMWORD PTR [r8+1312], ymm9
+ vmovdqu YMMWORD PTR [r8+1344], ymm10
+ vmovdqu YMMWORD PTR [r8+1376], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+1408]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1440]
+ vmovdqu ymm2, YMMWORD PTR [rcx+1472]
+ vmovdqu ymm3, YMMWORD PTR [rcx+1504]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+1408], ymm4
+ vmovdqu YMMWORD PTR [rdx+1440], ymm5
+ vmovdqu YMMWORD PTR [rdx+1472], ymm6
+ vmovdqu YMMWORD PTR [rdx+1504], ymm7
+ vmovdqu YMMWORD PTR [r8+1408], ymm8
+ vmovdqu YMMWORD PTR [r8+1440], ymm9
+ vmovdqu YMMWORD PTR [r8+1472], ymm10
+ vmovdqu YMMWORD PTR [r8+1504], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+1536]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1568]
+ vmovdqu ymm2, YMMWORD PTR [rcx+1600]
+ vmovdqu ymm3, YMMWORD PTR [rcx+1632]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+1536], ymm4
+ vmovdqu YMMWORD PTR [rdx+1568], ymm5
+ vmovdqu YMMWORD PTR [rdx+1600], ymm6
+ vmovdqu YMMWORD PTR [rdx+1632], ymm7
+ vmovdqu YMMWORD PTR [r8+1536], ymm8
+ vmovdqu YMMWORD PTR [r8+1568], ymm9
+ vmovdqu YMMWORD PTR [r8+1600], ymm10
+ vmovdqu YMMWORD PTR [r8+1632], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+1664]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1696]
+ vmovdqu ymm2, YMMWORD PTR [rcx+1728]
+ vmovdqu ymm3, YMMWORD PTR [rcx+1760]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+1664], ymm4
+ vmovdqu YMMWORD PTR [rdx+1696], ymm5
+ vmovdqu YMMWORD PTR [rdx+1728], ymm6
+ vmovdqu YMMWORD PTR [rdx+1760], ymm7
+ vmovdqu YMMWORD PTR [r8+1664], ymm8
+ vmovdqu YMMWORD PTR [r8+1696], ymm9
+ vmovdqu YMMWORD PTR [r8+1728], ymm10
+ vmovdqu YMMWORD PTR [r8+1760], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+1792]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1824]
+ vmovdqu ymm2, YMMWORD PTR [rcx+1856]
+ vmovdqu ymm3, YMMWORD PTR [rcx+1888]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+1792], ymm4
+ vmovdqu YMMWORD PTR [rdx+1824], ymm5
+ vmovdqu YMMWORD PTR [rdx+1856], ymm6
+ vmovdqu YMMWORD PTR [rdx+1888], ymm7
+ vmovdqu YMMWORD PTR [r8+1792], ymm8
+ vmovdqu YMMWORD PTR [r8+1824], ymm9
+ vmovdqu YMMWORD PTR [r8+1856], ymm10
+ vmovdqu YMMWORD PTR [r8+1888], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+1920]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1952]
+ vmovdqu ymm2, YMMWORD PTR [rcx+1984]
+ vmovdqu ymm3, YMMWORD PTR [rcx+2016]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+1920], ymm4
+ vmovdqu YMMWORD PTR [rdx+1952], ymm5
+ vmovdqu YMMWORD PTR [rdx+1984], ymm6
+ vmovdqu YMMWORD PTR [rdx+2016], ymm7
+ vmovdqu YMMWORD PTR [r8+1920], ymm8
+ vmovdqu YMMWORD PTR [r8+1952], ymm9
+ vmovdqu YMMWORD PTR [r8+1984], ymm10
+ vmovdqu YMMWORD PTR [r8+2016], ymm11
+ ; 3/4 vectors
+ vmovdqu ymm0, YMMWORD PTR [rcx+2048]
+ vmovdqu ymm1, YMMWORD PTR [rcx+2080]
+ vmovdqu ymm2, YMMWORD PTR [rcx+2112]
+ vmovdqu ymm3, YMMWORD PTR [rcx+2144]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+2048], ymm4
+ vmovdqu YMMWORD PTR [rdx+2080], ymm5
+ vmovdqu YMMWORD PTR [rdx+2112], ymm6
+ vmovdqu YMMWORD PTR [rdx+2144], ymm7
+ vmovdqu YMMWORD PTR [r8+2048], ymm8
+ vmovdqu YMMWORD PTR [r8+2080], ymm9
+ vmovdqu YMMWORD PTR [r8+2112], ymm10
+ vmovdqu YMMWORD PTR [r8+2144], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+2176]
+ vmovdqu ymm1, YMMWORD PTR [rcx+2208]
+ vmovdqu ymm2, YMMWORD PTR [rcx+2240]
+ vmovdqu ymm3, YMMWORD PTR [rcx+2272]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+2176], ymm4
+ vmovdqu YMMWORD PTR [rdx+2208], ymm5
+ vmovdqu YMMWORD PTR [rdx+2240], ymm6
+ vmovdqu YMMWORD PTR [rdx+2272], ymm7
+ vmovdqu YMMWORD PTR [r8+2176], ymm8
+ vmovdqu YMMWORD PTR [r8+2208], ymm9
+ vmovdqu YMMWORD PTR [r8+2240], ymm10
+ vmovdqu YMMWORD PTR [r8+2272], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+2304]
+ vmovdqu ymm1, YMMWORD PTR [rcx+2336]
+ vmovdqu ymm2, YMMWORD PTR [rcx+2368]
+ vmovdqu ymm3, YMMWORD PTR [rcx+2400]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+2304], ymm4
+ vmovdqu YMMWORD PTR [rdx+2336], ymm5
+ vmovdqu YMMWORD PTR [rdx+2368], ymm6
+ vmovdqu YMMWORD PTR [rdx+2400], ymm7
+ vmovdqu YMMWORD PTR [r8+2304], ymm8
+ vmovdqu YMMWORD PTR [r8+2336], ymm9
+ vmovdqu YMMWORD PTR [r8+2368], ymm10
+ vmovdqu YMMWORD PTR [r8+2400], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+2432]
+ vmovdqu ymm1, YMMWORD PTR [rcx+2464]
+ vmovdqu ymm2, YMMWORD PTR [rcx+2496]
+ vmovdqu ymm3, YMMWORD PTR [rcx+2528]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+2432], ymm4
+ vmovdqu YMMWORD PTR [rdx+2464], ymm5
+ vmovdqu YMMWORD PTR [rdx+2496], ymm6
+ vmovdqu YMMWORD PTR [rdx+2528], ymm7
+ vmovdqu YMMWORD PTR [r8+2432], ymm8
+ vmovdqu YMMWORD PTR [r8+2464], ymm9
+ vmovdqu YMMWORD PTR [r8+2496], ymm10
+ vmovdqu YMMWORD PTR [r8+2528], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+2560]
+ vmovdqu ymm1, YMMWORD PTR [rcx+2592]
+ vmovdqu ymm2, YMMWORD PTR [rcx+2624]
+ vmovdqu ymm3, YMMWORD PTR [rcx+2656]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+2560], ymm4
+ vmovdqu YMMWORD PTR [rdx+2592], ymm5
+ vmovdqu YMMWORD PTR [rdx+2624], ymm6
+ vmovdqu YMMWORD PTR [rdx+2656], ymm7
+ vmovdqu YMMWORD PTR [r8+2560], ymm8
+ vmovdqu YMMWORD PTR [r8+2592], ymm9
+ vmovdqu YMMWORD PTR [r8+2624], ymm10
+ vmovdqu YMMWORD PTR [r8+2656], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+2688]
+ vmovdqu ymm1, YMMWORD PTR [rcx+2720]
+ vmovdqu ymm2, YMMWORD PTR [rcx+2752]
+ vmovdqu ymm3, YMMWORD PTR [rcx+2784]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+2688], ymm4
+ vmovdqu YMMWORD PTR [rdx+2720], ymm5
+ vmovdqu YMMWORD PTR [rdx+2752], ymm6
+ vmovdqu YMMWORD PTR [rdx+2784], ymm7
+ vmovdqu YMMWORD PTR [r8+2688], ymm8
+ vmovdqu YMMWORD PTR [r8+2720], ymm9
+ vmovdqu YMMWORD PTR [r8+2752], ymm10
+ vmovdqu YMMWORD PTR [r8+2784], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+2816]
+ vmovdqu ymm1, YMMWORD PTR [rcx+2848]
+ vmovdqu ymm2, YMMWORD PTR [rcx+2880]
+ vmovdqu ymm3, YMMWORD PTR [rcx+2912]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+2816], ymm4
+ vmovdqu YMMWORD PTR [rdx+2848], ymm5
+ vmovdqu YMMWORD PTR [rdx+2880], ymm6
+ vmovdqu YMMWORD PTR [rdx+2912], ymm7
+ vmovdqu YMMWORD PTR [r8+2816], ymm8
+ vmovdqu YMMWORD PTR [r8+2848], ymm9
+ vmovdqu YMMWORD PTR [r8+2880], ymm10
+ vmovdqu YMMWORD PTR [r8+2912], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+2944]
+ vmovdqu ymm1, YMMWORD PTR [rcx+2976]
+ vmovdqu ymm2, YMMWORD PTR [rcx+3008]
+ vmovdqu ymm3, YMMWORD PTR [rcx+3040]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+2944], ymm4
+ vmovdqu YMMWORD PTR [rdx+2976], ymm5
+ vmovdqu YMMWORD PTR [rdx+3008], ymm6
+ vmovdqu YMMWORD PTR [rdx+3040], ymm7
+ vmovdqu YMMWORD PTR [r8+2944], ymm8
+ vmovdqu YMMWORD PTR [r8+2976], ymm9
+ vmovdqu YMMWORD PTR [r8+3008], ymm10
+ vmovdqu YMMWORD PTR [r8+3040], ymm11
+ ; 4/4 vectors
+ vmovdqu ymm0, YMMWORD PTR [rcx+3072]
+ vmovdqu ymm1, YMMWORD PTR [rcx+3104]
+ vmovdqu ymm2, YMMWORD PTR [rcx+3136]
+ vmovdqu ymm3, YMMWORD PTR [rcx+3168]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+3072], ymm4
+ vmovdqu YMMWORD PTR [rdx+3104], ymm5
+ vmovdqu YMMWORD PTR [rdx+3136], ymm6
+ vmovdqu YMMWORD PTR [rdx+3168], ymm7
+ vmovdqu YMMWORD PTR [r8+3072], ymm8
+ vmovdqu YMMWORD PTR [r8+3104], ymm9
+ vmovdqu YMMWORD PTR [r8+3136], ymm10
+ vmovdqu YMMWORD PTR [r8+3168], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+3200]
+ vmovdqu ymm1, YMMWORD PTR [rcx+3232]
+ vmovdqu ymm2, YMMWORD PTR [rcx+3264]
+ vmovdqu ymm3, YMMWORD PTR [rcx+3296]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+3200], ymm4
+ vmovdqu YMMWORD PTR [rdx+3232], ymm5
+ vmovdqu YMMWORD PTR [rdx+3264], ymm6
+ vmovdqu YMMWORD PTR [rdx+3296], ymm7
+ vmovdqu YMMWORD PTR [r8+3200], ymm8
+ vmovdqu YMMWORD PTR [r8+3232], ymm9
+ vmovdqu YMMWORD PTR [r8+3264], ymm10
+ vmovdqu YMMWORD PTR [r8+3296], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+3328]
+ vmovdqu ymm1, YMMWORD PTR [rcx+3360]
+ vmovdqu ymm2, YMMWORD PTR [rcx+3392]
+ vmovdqu ymm3, YMMWORD PTR [rcx+3424]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+3328], ymm4
+ vmovdqu YMMWORD PTR [rdx+3360], ymm5
+ vmovdqu YMMWORD PTR [rdx+3392], ymm6
+ vmovdqu YMMWORD PTR [rdx+3424], ymm7
+ vmovdqu YMMWORD PTR [r8+3328], ymm8
+ vmovdqu YMMWORD PTR [r8+3360], ymm9
+ vmovdqu YMMWORD PTR [r8+3392], ymm10
+ vmovdqu YMMWORD PTR [r8+3424], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+3456]
+ vmovdqu ymm1, YMMWORD PTR [rcx+3488]
+ vmovdqu ymm2, YMMWORD PTR [rcx+3520]
+ vmovdqu ymm3, YMMWORD PTR [rcx+3552]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+3456], ymm4
+ vmovdqu YMMWORD PTR [rdx+3488], ymm5
+ vmovdqu YMMWORD PTR [rdx+3520], ymm6
+ vmovdqu YMMWORD PTR [rdx+3552], ymm7
+ vmovdqu YMMWORD PTR [r8+3456], ymm8
+ vmovdqu YMMWORD PTR [r8+3488], ymm9
+ vmovdqu YMMWORD PTR [r8+3520], ymm10
+ vmovdqu YMMWORD PTR [r8+3552], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+3584]
+ vmovdqu ymm1, YMMWORD PTR [rcx+3616]
+ vmovdqu ymm2, YMMWORD PTR [rcx+3648]
+ vmovdqu ymm3, YMMWORD PTR [rcx+3680]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+3584], ymm4
+ vmovdqu YMMWORD PTR [rdx+3616], ymm5
+ vmovdqu YMMWORD PTR [rdx+3648], ymm6
+ vmovdqu YMMWORD PTR [rdx+3680], ymm7
+ vmovdqu YMMWORD PTR [r8+3584], ymm8
+ vmovdqu YMMWORD PTR [r8+3616], ymm9
+ vmovdqu YMMWORD PTR [r8+3648], ymm10
+ vmovdqu YMMWORD PTR [r8+3680], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+3712]
+ vmovdqu ymm1, YMMWORD PTR [rcx+3744]
+ vmovdqu ymm2, YMMWORD PTR [rcx+3776]
+ vmovdqu ymm3, YMMWORD PTR [rcx+3808]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+3712], ymm4
+ vmovdqu YMMWORD PTR [rdx+3744], ymm5
+ vmovdqu YMMWORD PTR [rdx+3776], ymm6
+ vmovdqu YMMWORD PTR [rdx+3808], ymm7
+ vmovdqu YMMWORD PTR [r8+3712], ymm8
+ vmovdqu YMMWORD PTR [r8+3744], ymm9
+ vmovdqu YMMWORD PTR [r8+3776], ymm10
+ vmovdqu YMMWORD PTR [r8+3808], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+3840]
+ vmovdqu ymm1, YMMWORD PTR [rcx+3872]
+ vmovdqu ymm2, YMMWORD PTR [rcx+3904]
+ vmovdqu ymm3, YMMWORD PTR [rcx+3936]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+3840], ymm4
+ vmovdqu YMMWORD PTR [rdx+3872], ymm5
+ vmovdqu YMMWORD PTR [rdx+3904], ymm6
+ vmovdqu YMMWORD PTR [rdx+3936], ymm7
+ vmovdqu YMMWORD PTR [r8+3840], ymm8
+ vmovdqu YMMWORD PTR [r8+3872], ymm9
+ vmovdqu YMMWORD PTR [r8+3904], ymm10
+ vmovdqu YMMWORD PTR [r8+3936], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+3968]
+ vmovdqu ymm1, YMMWORD PTR [rcx+4000]
+ vmovdqu ymm2, YMMWORD PTR [rcx+4032]
+ vmovdqu ymm3, YMMWORD PTR [rcx+4064]
+ vpmulld ymm8, ymm0, ymm15
+ vpmulld ymm9, ymm1, ymm15
+ vpmulld ymm10, ymm2, ymm15
+ vpmulld ymm11, ymm3, ymm15
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm9, ymm9, ymm14
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm14
+ vpsrld ymm8, ymm8, 23
+ vpsrld ymm9, ymm9, 23
+ vpsrld ymm10, ymm10, 23
+ vpsrld ymm11, ymm11, 23
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpcmpeqd ymm0, ymm8, ymm15
+ vpcmpeqd ymm1, ymm9, ymm15
+ vpcmpeqd ymm2, ymm10, ymm15
+ vpcmpeqd ymm3, ymm11, ymm15
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm7, ymm7, ymm3
+ vpcmpgtd ymm0, ymm15, ymm8
+ vpcmpgtd ymm1, ymm15, ymm9
+ vpcmpgtd ymm2, ymm15, ymm10
+ vpcmpgtd ymm3, ymm15, ymm11
+ vpand ymm8, ymm8, ymm0
+ vpand ymm9, ymm9, ymm1
+ vpand ymm10, ymm10, ymm2
+ vpand ymm11, ymm11, ymm3
+ vmovdqu YMMWORD PTR [rdx+3968], ymm4
+ vmovdqu YMMWORD PTR [rdx+4000], ymm5
+ vmovdqu YMMWORD PTR [rdx+4032], ymm6
+ vmovdqu YMMWORD PTR [rdx+4064], ymm7
+ vmovdqu YMMWORD PTR [r8+3968], ymm8
+ vmovdqu YMMWORD PTR [r8+4000], ymm9
+ vmovdqu YMMWORD PTR [r8+4032], ymm10
+ vmovdqu YMMWORD PTR [r8+4064], ymm11
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+wc_mldsa_decompose_q88_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; constant table: remainder threshold for wc_mldsa_decompose_q32_avx2
+ALIGN 16 ; only 16-byte aligned although the table is 32 bytes; it is read with vmovdqu (unaligned load)
+L_mldsa_decompose_q32_avx2_q_low_32 DWORD 0003ff00h, 0003ff00h, 0003ff00h, 0003ff00h ; lanes 0-3: 0x3ff00 = 261888 (presumably (q-1)/32 for q = 8380417 - confirm); loaded into ymm12
+ DWORD 0003ff00h, 0003ff00h, 0003ff00h, 0003ff00h ; lanes 4-7: same value; the routine takes the sign bit of (ymm12 - remainder) to bump the quotient by one
+ptr_L_mldsa_decompose_q32_avx2_q_low_32 QWORD L_mldsa_decompose_q32_avx2_q_low_32 ; QWORD holding the address of the table above
+_DATA ENDS
+_DATA SEGMENT ; constant table: per-lane multiplier for wc_mldsa_decompose_q32_avx2
+ALIGN 16 ; only 16-byte aligned although the table is 32 bytes; it is read with vmovdqu (unaligned load)
+L_mldsa_decompose_q32_avx2_q_low_32_2 DWORD 0007fe00h, 0007fe00h, 0007fe00h, 0007fe00h ; lanes 0-3: 0x7fe00 = 523776 = 2 * 0x3ff00; loaded into ymm13
+ DWORD 0007fe00h, 0007fe00h, 0007fe00h, 0007fe00h ; lanes 4-7: same value; vpmulld multiplies the quotient by it and the product is subtracted from the input to form the remainder
+ptr_L_mldsa_decompose_q32_avx2_q_low_32_2 QWORD L_mldsa_decompose_q32_avx2_q_low_32_2 ; QWORD holding the address of the table above
+_DATA ENDS
+_DATA SEGMENT ; constant table: additive bias for wc_mldsa_decompose_q32_avx2
+ALIGN 16 ; only 16-byte aligned although the table is 32 bytes; it is read with vmovdqu (unaligned load)
+L_mldsa_decompose_q32_avx2_q_low_32_m1 DWORD 0003feffh, 0003feffh, 0003feffh, 0003feffh ; lanes 0-3: 0x3feff = 0x3ff00 - 1 = 261887; loaded into ymm14
+ DWORD 0003feffh, 0003feffh, 0003feffh, 0003feffh ; lanes 4-7: same value; added to each input lane before the logical right shift by 19 that yields the first quotient estimate
+ptr_L_mldsa_decompose_q32_avx2_q_low_32_m1 QWORD L_mldsa_decompose_q32_avx2_q_low_32_m1 ; QWORD holding the address of the table above
+_DATA ENDS
+_DATA SEGMENT ; constant table: 4-bit lane mask for wc_mldsa_decompose_q32_avx2
+ALIGN 16 ; only 16-byte aligned although the table is 32 bytes; it is read with vmovdqu (unaligned load)
+L_mldsa_decompose_q32_avx2_mask DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh ; lanes 0-3: 0xf; loaded into ymm15
+ DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh ; lanes 4-7: same value; vpand keeps only the low 4 bits of each quotient lane before it is stored through r9
+ptr_L_mldsa_decompose_q32_avx2_mask QWORD L_mldsa_decompose_q32_avx2_mask ; QWORD holding the address of the table above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+; wc_mldsa_decompose_q32_avx2
+;
+; Splits every 32-bit input lane into two 32-bit output lanes, 256 lanes
+; (1024 bytes) per loop pass, as eight unrolled groups of 4 x 8 lanes.
+;
+; Registers as used by the code below:
+;   rcx - source pointer; 1024 bytes are read per pass, then advanced by 1024
+;   rdx - pass counter; decremented at the bottom of the loop, which exits
+;         when it reaches 0. The body therefore always runs at least once
+;         and the caller must pass a non-zero count.
+;   r8  - first destination; receives the "lo" lanes (ymm4-ymm7)
+;   r9  - second destination; receives the "hi" lanes (ymm8-ymm11)
+;
+; Per lane x, with A = q_low_32 (ymm12), B = q_low_32_2 (ymm13),
+; C = q_low_32_m1 (ymm14) and M = mask (ymm15, 0000000fh per lane):
+;   t   = (x + C) >> 19                    (logical shift)
+;   t  += bit 31 of (A - (x - t*B))        (adds 1 when that value is negative)
+;   lo  = (x - t*B) - (t >> 4)
+;   hi  = t & M
+; NOTE(review): presumably the ML-DSA Decompose step for gamma2 = (q-1)/32,
+; with lo = r0 and hi = r1. The A/B/C tables are defined outside this
+; excerpt - confirm their values against the reference C implementation.
+;
+; xmm6-xmm15 are saved on entry and restored on exit (they are callee-saved
+; under the Microsoft x64 calling convention); ymm0-ymm5 are clobbered.
+; NOTE(review): the PROC adjusts rsp but declares no FRAME/unwind directives;
+; confirm this is acceptable for the project.
+wc_mldsa_decompose_q32_avx2 PROC
+ ; Reserve 10 x 16 bytes and spill xmm6-xmm15 (unaligned stores, so no
+ ; particular rsp alignment is required).
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ ; Loop-invariant constants: ymm12 = A, ymm13 = B, ymm14 = C, ymm15 = M.
+ vmovdqu ymm12, YMMWORD PTR L_mldsa_decompose_q32_avx2_q_low_32
+ vmovdqu ymm13, YMMWORD PTR L_mldsa_decompose_q32_avx2_q_low_32_2
+ vmovdqu ymm14, YMMWORD PTR L_mldsa_decompose_q32_avx2_q_low_32_m1
+ vmovdqu ymm15, YMMWORD PTR L_mldsa_decompose_q32_avx2_mask
+L_mldsa_decompose_q32_avx2_start_256:
+ ; Group 0: source bytes 0-127 (32 lanes). ymm0-ymm3 = x.
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ ; t = (x + C) >> 19 : first estimate of the quotient, kept in ymm8-ymm11.
+ vpaddd ymm8, ymm0, ymm14
+ vpaddd ymm9, ymm1, ymm14
+ vpaddd ymm10, ymm2, ymm14
+ vpaddd ymm11, ymm3, ymm14
+ vpsrld ymm8, ymm8, 19
+ vpsrld ymm9, ymm9, 19
+ vpsrld ymm10, ymm10, 19
+ vpsrld ymm11, ymm11, 19
+ ; ymm4-ymm7 = x - t*B : remainder for the estimated quotient.
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ ; Correction: take the sign bit of (A - remainder) as 0/1 and add it to t.
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ ; Recompute the remainder x - t*B with the corrected t.
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ ; lo = remainder - (t >> 4). x is no longer needed, so ymm0-ymm3 are
+ ; reused as scratch for t >> 4.
+ vpsrld ymm0, ymm8, 4
+ vpsrld ymm1, ymm9, 4
+ vpsrld ymm2, ymm10, 4
+ vpsrld ymm3, ymm11, 4
+ vpsubd ymm4, ymm4, ymm0
+ vpsubd ymm5, ymm5, ymm1
+ vpsubd ymm6, ymm6, ymm2
+ vpsubd ymm7, ymm7, ymm3
+ ; hi = t & M (keeps the low 4 bits of t).
+ vpand ymm8, ymm8, ymm15
+ vpand ymm9, ymm9, ymm15
+ vpand ymm10, ymm10, ymm15
+ vpand ymm11, ymm11, ymm15
+ ; Store lo to the r8 buffer and hi to the r9 buffer at the same offsets.
+ vmovdqu YMMWORD PTR [r8], ymm4
+ vmovdqu YMMWORD PTR [r8+32], ymm5
+ vmovdqu YMMWORD PTR [r8+64], ymm6
+ vmovdqu YMMWORD PTR [r8+96], ymm7
+ vmovdqu YMMWORD PTR [r9], ymm8
+ vmovdqu YMMWORD PTR [r9+32], ymm9
+ vmovdqu YMMWORD PTR [r9+64], ymm10
+ vmovdqu YMMWORD PTR [r9+96], ymm11
+ ; Group 1: source bytes 128-255, same instruction sequence as group 0.
+ vmovdqu ymm0, YMMWORD PTR [rcx+128]
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpaddd ymm8, ymm0, ymm14
+ vpaddd ymm9, ymm1, ymm14
+ vpaddd ymm10, ymm2, ymm14
+ vpaddd ymm11, ymm3, ymm14
+ vpsrld ymm8, ymm8, 19
+ vpsrld ymm9, ymm9, 19
+ vpsrld ymm10, ymm10, 19
+ vpsrld ymm11, ymm11, 19
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsrld ymm0, ymm8, 4
+ vpsrld ymm1, ymm9, 4
+ vpsrld ymm2, ymm10, 4
+ vpsrld ymm3, ymm11, 4
+ vpsubd ymm4, ymm4, ymm0
+ vpsubd ymm5, ymm5, ymm1
+ vpsubd ymm6, ymm6, ymm2
+ vpsubd ymm7, ymm7, ymm3
+ vpand ymm8, ymm8, ymm15
+ vpand ymm9, ymm9, ymm15
+ vpand ymm10, ymm10, ymm15
+ vpand ymm11, ymm11, ymm15
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ vmovdqu YMMWORD PTR [r8+160], ymm5
+ vmovdqu YMMWORD PTR [r8+192], ymm6
+ vmovdqu YMMWORD PTR [r8+224], ymm7
+ vmovdqu YMMWORD PTR [r9+128], ymm8
+ vmovdqu YMMWORD PTR [r9+160], ymm9
+ vmovdqu YMMWORD PTR [r9+192], ymm10
+ vmovdqu YMMWORD PTR [r9+224], ymm11
+ ; Group 2: source bytes 256-383.
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpaddd ymm8, ymm0, ymm14
+ vpaddd ymm9, ymm1, ymm14
+ vpaddd ymm10, ymm2, ymm14
+ vpaddd ymm11, ymm3, ymm14
+ vpsrld ymm8, ymm8, 19
+ vpsrld ymm9, ymm9, 19
+ vpsrld ymm10, ymm10, 19
+ vpsrld ymm11, ymm11, 19
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsrld ymm0, ymm8, 4
+ vpsrld ymm1, ymm9, 4
+ vpsrld ymm2, ymm10, 4
+ vpsrld ymm3, ymm11, 4
+ vpsubd ymm4, ymm4, ymm0
+ vpsubd ymm5, ymm5, ymm1
+ vpsubd ymm6, ymm6, ymm2
+ vpsubd ymm7, ymm7, ymm3
+ vpand ymm8, ymm8, ymm15
+ vpand ymm9, ymm9, ymm15
+ vpand ymm10, ymm10, ymm15
+ vpand ymm11, ymm11, ymm15
+ vmovdqu YMMWORD PTR [r8+256], ymm4
+ vmovdqu YMMWORD PTR [r8+288], ymm5
+ vmovdqu YMMWORD PTR [r8+320], ymm6
+ vmovdqu YMMWORD PTR [r8+352], ymm7
+ vmovdqu YMMWORD PTR [r9+256], ymm8
+ vmovdqu YMMWORD PTR [r9+288], ymm9
+ vmovdqu YMMWORD PTR [r9+320], ymm10
+ vmovdqu YMMWORD PTR [r9+352], ymm11
+ ; Group 3: source bytes 384-511.
+ vmovdqu ymm0, YMMWORD PTR [rcx+384]
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpaddd ymm8, ymm0, ymm14
+ vpaddd ymm9, ymm1, ymm14
+ vpaddd ymm10, ymm2, ymm14
+ vpaddd ymm11, ymm3, ymm14
+ vpsrld ymm8, ymm8, 19
+ vpsrld ymm9, ymm9, 19
+ vpsrld ymm10, ymm10, 19
+ vpsrld ymm11, ymm11, 19
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsrld ymm0, ymm8, 4
+ vpsrld ymm1, ymm9, 4
+ vpsrld ymm2, ymm10, 4
+ vpsrld ymm3, ymm11, 4
+ vpsubd ymm4, ymm4, ymm0
+ vpsubd ymm5, ymm5, ymm1
+ vpsubd ymm6, ymm6, ymm2
+ vpsubd ymm7, ymm7, ymm3
+ vpand ymm8, ymm8, ymm15
+ vpand ymm9, ymm9, ymm15
+ vpand ymm10, ymm10, ymm15
+ vpand ymm11, ymm11, ymm15
+ vmovdqu YMMWORD PTR [r8+384], ymm4
+ vmovdqu YMMWORD PTR [r8+416], ymm5
+ vmovdqu YMMWORD PTR [r8+448], ymm6
+ vmovdqu YMMWORD PTR [r8+480], ymm7
+ vmovdqu YMMWORD PTR [r9+384], ymm8
+ vmovdqu YMMWORD PTR [r9+416], ymm9
+ vmovdqu YMMWORD PTR [r9+448], ymm10
+ vmovdqu YMMWORD PTR [r9+480], ymm11
+ ; Group 4: source bytes 512-639.
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vpaddd ymm8, ymm0, ymm14
+ vpaddd ymm9, ymm1, ymm14
+ vpaddd ymm10, ymm2, ymm14
+ vpaddd ymm11, ymm3, ymm14
+ vpsrld ymm8, ymm8, 19
+ vpsrld ymm9, ymm9, 19
+ vpsrld ymm10, ymm10, 19
+ vpsrld ymm11, ymm11, 19
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsrld ymm0, ymm8, 4
+ vpsrld ymm1, ymm9, 4
+ vpsrld ymm2, ymm10, 4
+ vpsrld ymm3, ymm11, 4
+ vpsubd ymm4, ymm4, ymm0
+ vpsubd ymm5, ymm5, ymm1
+ vpsubd ymm6, ymm6, ymm2
+ vpsubd ymm7, ymm7, ymm3
+ vpand ymm8, ymm8, ymm15
+ vpand ymm9, ymm9, ymm15
+ vpand ymm10, ymm10, ymm15
+ vpand ymm11, ymm11, ymm15
+ vmovdqu YMMWORD PTR [r8+512], ymm4
+ vmovdqu YMMWORD PTR [r8+544], ymm5
+ vmovdqu YMMWORD PTR [r8+576], ymm6
+ vmovdqu YMMWORD PTR [r8+608], ymm7
+ vmovdqu YMMWORD PTR [r9+512], ymm8
+ vmovdqu YMMWORD PTR [r9+544], ymm9
+ vmovdqu YMMWORD PTR [r9+576], ymm10
+ vmovdqu YMMWORD PTR [r9+608], ymm11
+ ; Group 5: source bytes 640-767.
+ vmovdqu ymm0, YMMWORD PTR [rcx+640]
+ vmovdqu ymm1, YMMWORD PTR [rcx+672]
+ vmovdqu ymm2, YMMWORD PTR [rcx+704]
+ vmovdqu ymm3, YMMWORD PTR [rcx+736]
+ vpaddd ymm8, ymm0, ymm14
+ vpaddd ymm9, ymm1, ymm14
+ vpaddd ymm10, ymm2, ymm14
+ vpaddd ymm11, ymm3, ymm14
+ vpsrld ymm8, ymm8, 19
+ vpsrld ymm9, ymm9, 19
+ vpsrld ymm10, ymm10, 19
+ vpsrld ymm11, ymm11, 19
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsrld ymm0, ymm8, 4
+ vpsrld ymm1, ymm9, 4
+ vpsrld ymm2, ymm10, 4
+ vpsrld ymm3, ymm11, 4
+ vpsubd ymm4, ymm4, ymm0
+ vpsubd ymm5, ymm5, ymm1
+ vpsubd ymm6, ymm6, ymm2
+ vpsubd ymm7, ymm7, ymm3
+ vpand ymm8, ymm8, ymm15
+ vpand ymm9, ymm9, ymm15
+ vpand ymm10, ymm10, ymm15
+ vpand ymm11, ymm11, ymm15
+ vmovdqu YMMWORD PTR [r8+640], ymm4
+ vmovdqu YMMWORD PTR [r8+672], ymm5
+ vmovdqu YMMWORD PTR [r8+704], ymm6
+ vmovdqu YMMWORD PTR [r8+736], ymm7
+ vmovdqu YMMWORD PTR [r9+640], ymm8
+ vmovdqu YMMWORD PTR [r9+672], ymm9
+ vmovdqu YMMWORD PTR [r9+704], ymm10
+ vmovdqu YMMWORD PTR [r9+736], ymm11
+ ; Group 6: source bytes 768-895.
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vpaddd ymm8, ymm0, ymm14
+ vpaddd ymm9, ymm1, ymm14
+ vpaddd ymm10, ymm2, ymm14
+ vpaddd ymm11, ymm3, ymm14
+ vpsrld ymm8, ymm8, 19
+ vpsrld ymm9, ymm9, 19
+ vpsrld ymm10, ymm10, 19
+ vpsrld ymm11, ymm11, 19
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsrld ymm0, ymm8, 4
+ vpsrld ymm1, ymm9, 4
+ vpsrld ymm2, ymm10, 4
+ vpsrld ymm3, ymm11, 4
+ vpsubd ymm4, ymm4, ymm0
+ vpsubd ymm5, ymm5, ymm1
+ vpsubd ymm6, ymm6, ymm2
+ vpsubd ymm7, ymm7, ymm3
+ vpand ymm8, ymm8, ymm15
+ vpand ymm9, ymm9, ymm15
+ vpand ymm10, ymm10, ymm15
+ vpand ymm11, ymm11, ymm15
+ vmovdqu YMMWORD PTR [r8+768], ymm4
+ vmovdqu YMMWORD PTR [r8+800], ymm5
+ vmovdqu YMMWORD PTR [r8+832], ymm6
+ vmovdqu YMMWORD PTR [r8+864], ymm7
+ vmovdqu YMMWORD PTR [r9+768], ymm8
+ vmovdqu YMMWORD PTR [r9+800], ymm9
+ vmovdqu YMMWORD PTR [r9+832], ymm10
+ vmovdqu YMMWORD PTR [r9+864], ymm11
+ ; Group 7: source bytes 896-1023 (last group of the pass).
+ vmovdqu ymm0, YMMWORD PTR [rcx+896]
+ vmovdqu ymm1, YMMWORD PTR [rcx+928]
+ vmovdqu ymm2, YMMWORD PTR [rcx+960]
+ vmovdqu ymm3, YMMWORD PTR [rcx+992]
+ vpaddd ymm8, ymm0, ymm14
+ vpaddd ymm9, ymm1, ymm14
+ vpaddd ymm10, ymm2, ymm14
+ vpaddd ymm11, ymm3, ymm14
+ vpsrld ymm8, ymm8, 19
+ vpsrld ymm9, ymm9, 19
+ vpsrld ymm10, ymm10, 19
+ vpsrld ymm11, ymm11, 19
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsubd ymm4, ymm12, ymm4
+ vpsubd ymm5, ymm12, ymm5
+ vpsubd ymm6, ymm12, ymm6
+ vpsubd ymm7, ymm12, ymm7
+ vpsrld ymm4, ymm4, 31
+ vpsrld ymm5, ymm5, 31
+ vpsrld ymm6, ymm6, 31
+ vpsrld ymm7, ymm7, 31
+ vpaddd ymm8, ymm8, ymm4
+ vpaddd ymm9, ymm9, ymm5
+ vpaddd ymm10, ymm10, ymm6
+ vpaddd ymm11, ymm11, ymm7
+ vpmulld ymm4, ymm8, ymm13
+ vpmulld ymm5, ymm9, ymm13
+ vpmulld ymm6, ymm10, ymm13
+ vpmulld ymm7, ymm11, ymm13
+ vpsubd ymm4, ymm0, ymm4
+ vpsubd ymm5, ymm1, ymm5
+ vpsubd ymm6, ymm2, ymm6
+ vpsubd ymm7, ymm3, ymm7
+ vpsrld ymm0, ymm8, 4
+ vpsrld ymm1, ymm9, 4
+ vpsrld ymm2, ymm10, 4
+ vpsrld ymm3, ymm11, 4
+ vpsubd ymm4, ymm4, ymm0
+ vpsubd ymm5, ymm5, ymm1
+ vpsubd ymm6, ymm6, ymm2
+ vpsubd ymm7, ymm7, ymm3
+ vpand ymm8, ymm8, ymm15
+ vpand ymm9, ymm9, ymm15
+ vpand ymm10, ymm10, ymm15
+ vpand ymm11, ymm11, ymm15
+ vmovdqu YMMWORD PTR [r8+896], ymm4
+ vmovdqu YMMWORD PTR [r8+928], ymm5
+ vmovdqu YMMWORD PTR [r8+960], ymm6
+ vmovdqu YMMWORD PTR [r8+992], ymm7
+ vmovdqu YMMWORD PTR [r9+896], ymm8
+ vmovdqu YMMWORD PTR [r9+928], ymm9
+ vmovdqu YMMWORD PTR [r9+960], ymm10
+ vmovdqu YMMWORD PTR [r9+992], ymm11
+ ; Advance all three pointers by one 1024-byte block and loop while the
+ ; pass counter is non-zero (jne consumes the flags set by sub).
+ add rcx, 1024
+ add r8, 1024
+ add r9, 1024
+ sub rdx, 1
+ jne L_mldsa_decompose_q32_avx2_start_256
+ ; Zero the upper halves of the ymm registers before returning to code that
+ ; may use legacy SSE, then restore the saved xmm6-xmm15.
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+wc_mldsa_decompose_q32_avx2 ENDP
+_TEXT ENDS
+; Constant tables for wc_mldsa_use_hint_88_avx2. Each table is eight DWORDs
+; (32 bytes) and is loaded whole into a ymm register with vmovdqu, so the
+; 16-byte alignment requested here is sufficient. Each ptr_* QWORD holds the
+; address of its table; none of them is referenced in the visible part of
+; the routine.
+;
+; q: 007fe001h = 8380417 in every lane. The routine adds it to lanes whose
+; sign bit is set (vpsrad 31 / vpand / vpaddd).
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_q DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h
+ DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h
+ptr_L_mldsa_use_hint_88_avx2_q QWORD L_mldsa_use_hint_88_avx2_q
+_DATA ENDS
+; q_low_88: 00017400h = 95232 = (8380417 - 1) / 88. The remainder is
+; subtracted from it and the sign bit of the result drives the +1 quotient
+; correction.
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_q_low_88 DWORD 00017400h, 00017400h, 00017400h, 00017400h
+ DWORD 00017400h, 00017400h, 00017400h, 00017400h
+ptr_L_mldsa_use_hint_88_avx2_q_low_88 QWORD L_mldsa_use_hint_88_avx2_q_low_88
+_DATA ENDS
+; q_low_88_2: 0002e800h = 190464 = 2 * 95232. The quotient estimate is
+; multiplied by it to form the remainder.
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_q_low_88_2 DWORD 0002e800h, 0002e800h, 0002e800h, 0002e800h
+ DWORD 0002e800h, 0002e800h, 0002e800h, 0002e800h
+ptr_L_mldsa_use_hint_88_avx2_q_low_88_2 QWORD L_mldsa_use_hint_88_avx2_q_low_88_2
+_DATA ENDS
+; q_2: 003fefd4h = 4190164. Added to (value * 44) before the logical shift
+; right by 23 that yields the quotient estimate.
+; NOTE(review): presumably a rounding bias for that fixed-point division -
+; confirm the derivation against the generator / reference code.
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_q_2 DWORD 003fefd4h, 003fefd4h, 003fefd4h, 003fefd4h
+ DWORD 003fefd4h, 003fefd4h, 003fefd4h, 003fefd4h
+ptr_L_mldsa_use_hint_88_avx2_q_2 QWORD L_mldsa_use_hint_88_avx2_q_2
+_DATA ENDS
+; 44: 0000002ch = 44 in every lane. Used both as the multiplier for the
+; quotient estimate and as the comparison bound for the quotient
+; (vpcmpeqd / vpcmpgtd against it).
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_44 DWORD 0000002ch, 0000002ch, 0000002ch, 0000002ch
+ DWORD 0000002ch, 0000002ch, 0000002ch, 0000002ch
+ptr_L_mldsa_use_hint_88_avx2_44 QWORD L_mldsa_use_hint_88_avx2_44
+_DATA ENDS
+; vsl: per-lane left-shift counts 31, 30, ..., 24 for vpsllvd. Applied to a
+; broadcast bitmask, lane i moves bit i into bit 31; the following
+; vpsrad 31 then expands it to an all-ones / all-zeros lane mask.
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_vsl DWORD 0000001fh, 0000001eh, 0000001dh, 0000001ch
+ DWORD 0000001bh, 0000001ah, 00000019h, 00000018h
+ptr_L_mldsa_use_hint_88_avx2_vsl QWORD L_mldsa_use_hint_88_avx2_vsl
+_DATA ENDS
+; one: 1 in every lane. The routine computes one - 2*signbit, i.e. +1 or -1
+; per lane, as the hint adjustment.
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_one DWORD 00000001h, 00000001h, 00000001h, 00000001h
+ DWORD 00000001h, 00000001h, 00000001h, 00000001h
+ptr_L_mldsa_use_hint_88_avx2_one QWORD L_mldsa_use_hint_88_avx2_one
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_use_hint_88_avx2 PROC
+ push r12
+ push r13
+ mov rax, rdx
+ mov rdx, rcx
+ sub rsp, 144
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vpxor ymm8, ymm8, ymm8
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_use_hint_88_avx2_q
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_use_hint_88_avx2_q_low_88
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_use_hint_88_avx2_q_low_88_2
+ vmovdqu ymm11, YMMWORD PTR L_mldsa_use_hint_88_avx2_q_2
+ vmovdqu ymm12, YMMWORD PTR L_mldsa_use_hint_88_avx2_44
+ vmovdqu ymm13, YMMWORD PTR L_mldsa_use_hint_88_avx2_vsl
+ vmovdqu ymm14, YMMWORD PTR L_mldsa_use_hint_88_avx2_one
+ xor r9, r9
+ mov r10b, [rax]
+ ; 1/4 vectors
+ mov r8b, [rax+80]
+ vmovdqu ymm0, YMMWORD PTR [rdx]
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_0:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_0
+ mov cl, r10b
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_0
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_0
+L_mldsa_use_hint_88_avx2_hints_done_0_0:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx], ymm4
+ vmovdqu YMMWORD PTR [rdx+32], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+64]
+ vmovdqu ymm1, YMMWORD PTR [rdx+96]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_1:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_1
+ mov cl, r10b
+ sub cl, 16
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_1
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_1
+L_mldsa_use_hint_88_avx2_hints_done_0_1:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+64], ymm4
+ vmovdqu YMMWORD PTR [rdx+96], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+128]
+ vmovdqu ymm1, YMMWORD PTR [rdx+160]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_2:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_2
+ mov cl, r10b
+ sub cl, 32
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_2
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_2
+L_mldsa_use_hint_88_avx2_hints_done_0_2:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+128], ymm4
+ vmovdqu YMMWORD PTR [rdx+160], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+192]
+ vmovdqu ymm1, YMMWORD PTR [rdx+224]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_3:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_3
+ mov cl, r10b
+ sub cl, 48
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_3
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_3
+L_mldsa_use_hint_88_avx2_hints_done_0_3:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+192], ymm4
+ vmovdqu YMMWORD PTR [rdx+224], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+256]
+ vmovdqu ymm1, YMMWORD PTR [rdx+288]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_4:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_4
+ mov cl, r10b
+ sub cl, 64
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_4
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_4
+L_mldsa_use_hint_88_avx2_hints_done_0_4:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+256], ymm4
+ vmovdqu YMMWORD PTR [rdx+288], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+320]
+ vmovdqu ymm1, YMMWORD PTR [rdx+352]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_5:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_5
+ mov cl, r10b
+ sub cl, 80
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_5
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_5
+L_mldsa_use_hint_88_avx2_hints_done_0_5:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+320], ymm4
+ vmovdqu YMMWORD PTR [rdx+352], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+384]
+ vmovdqu ymm1, YMMWORD PTR [rdx+416]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_6:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_6
+ mov cl, r10b
+ sub cl, 96
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_6
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_6
+L_mldsa_use_hint_88_avx2_hints_done_0_6:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+384], ymm4
+ vmovdqu YMMWORD PTR [rdx+416], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+448]
+ vmovdqu ymm1, YMMWORD PTR [rdx+480]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_7:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_7
+ mov cl, r10b
+ sub cl, 112
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_7
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_7
+L_mldsa_use_hint_88_avx2_hints_done_0_7:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+448], ymm4
+ vmovdqu YMMWORD PTR [rdx+480], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+512]
+ vmovdqu ymm1, YMMWORD PTR [rdx+544]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_8:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_8
+ mov cl, r10b
+ sub cl, 128
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_8
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_8
+L_mldsa_use_hint_88_avx2_hints_done_0_8:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+512], ymm4
+ vmovdqu YMMWORD PTR [rdx+544], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+576]
+ vmovdqu ymm1, YMMWORD PTR [rdx+608]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_9:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_9
+ mov cl, r10b
+ sub cl, 144
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_9
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_9
+L_mldsa_use_hint_88_avx2_hints_done_0_9:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+576], ymm4
+ vmovdqu YMMWORD PTR [rdx+608], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+640]
+ vmovdqu ymm1, YMMWORD PTR [rdx+672]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_10:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_10
+ mov cl, r10b
+ sub cl, 160
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_10
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_10
+L_mldsa_use_hint_88_avx2_hints_done_0_10:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+640], ymm4
+ vmovdqu YMMWORD PTR [rdx+672], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+704]
+ vmovdqu ymm1, YMMWORD PTR [rdx+736]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_11:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_11
+ mov cl, r10b
+ sub cl, 176
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_11
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_11
+L_mldsa_use_hint_88_avx2_hints_done_0_11:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+704], ymm4
+ vmovdqu YMMWORD PTR [rdx+736], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+768]
+ vmovdqu ymm1, YMMWORD PTR [rdx+800]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_12:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_12
+ mov cl, r10b
+ sub cl, 192
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_12
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_12
+L_mldsa_use_hint_88_avx2_hints_done_0_12:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+768], ymm4
+ vmovdqu YMMWORD PTR [rdx+800], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+832]
+ vmovdqu ymm1, YMMWORD PTR [rdx+864]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_13:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_13
+ mov cl, r10b
+ sub cl, 208
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_13
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_13
+L_mldsa_use_hint_88_avx2_hints_done_0_13:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+832], ymm4
+ vmovdqu YMMWORD PTR [rdx+864], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+896]
+ vmovdqu ymm1, YMMWORD PTR [rdx+928]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_14:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_14
+ mov cl, r10b
+ sub cl, 224
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_14
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_14
+L_mldsa_use_hint_88_avx2_hints_done_0_14:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+896], ymm4
+ vmovdqu YMMWORD PTR [rdx+928], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+960]
+ vmovdqu ymm1, YMMWORD PTR [rdx+992]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_0_15:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_15
+ mov cl, r10b
+ sub cl, 240
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_0_15
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_0_15
+L_mldsa_use_hint_88_avx2_hints_done_0_15:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+960], ymm4
+ vmovdqu YMMWORD PTR [rdx+992], ymm5
+ ; 2/4 vectors
+ mov r8b, [rax+81]
+ vmovdqu ymm0, YMMWORD PTR [rdx+1024]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1056]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_0:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_0
+ mov cl, r10b
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_0
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_0
+L_mldsa_use_hint_88_avx2_hints_done_1_0:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1024], ymm4
+ vmovdqu YMMWORD PTR [rdx+1056], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1088]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1120]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_1:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_1
+ mov cl, r10b
+ sub cl, 16
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_1
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_1
+L_mldsa_use_hint_88_avx2_hints_done_1_1:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1088], ymm4
+ vmovdqu YMMWORD PTR [rdx+1120], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1152]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1184]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_2:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_2
+ mov cl, r10b
+ sub cl, 32
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_2
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_2
+L_mldsa_use_hint_88_avx2_hints_done_1_2:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1152], ymm4
+ vmovdqu YMMWORD PTR [rdx+1184], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1216]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1248]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_3:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_3
+ mov cl, r10b
+ sub cl, 48
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_3
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_3
+L_mldsa_use_hint_88_avx2_hints_done_1_3:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1216], ymm4
+ vmovdqu YMMWORD PTR [rdx+1248], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1280]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1312]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_4:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_4
+ mov cl, r10b
+ sub cl, 64
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_4
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_4
+L_mldsa_use_hint_88_avx2_hints_done_1_4:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1280], ymm4
+ vmovdqu YMMWORD PTR [rdx+1312], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1344]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1376]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_5:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_5
+ mov cl, r10b
+ sub cl, 80
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_5
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_5
+L_mldsa_use_hint_88_avx2_hints_done_1_5:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1344], ymm4
+ vmovdqu YMMWORD PTR [rdx+1376], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1408]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1440]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_6:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_6
+ mov cl, r10b
+ sub cl, 96
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_6
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_6
+L_mldsa_use_hint_88_avx2_hints_done_1_6:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1408], ymm4
+ vmovdqu YMMWORD PTR [rdx+1440], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1472]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1504]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_7:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_7
+ mov cl, r10b
+ sub cl, 112
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_7
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_7
+L_mldsa_use_hint_88_avx2_hints_done_1_7:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1472], ymm4
+ vmovdqu YMMWORD PTR [rdx+1504], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1536]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1568]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_8:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_8
+ mov cl, r10b
+ sub cl, 128
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_8
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_8
+L_mldsa_use_hint_88_avx2_hints_done_1_8:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1536], ymm4
+ vmovdqu YMMWORD PTR [rdx+1568], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1600]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1632]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_9:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_9
+ mov cl, r10b
+ sub cl, 144
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_9
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_9
+L_mldsa_use_hint_88_avx2_hints_done_1_9:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1600], ymm4
+ vmovdqu YMMWORD PTR [rdx+1632], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1664]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1696]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_10:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_10
+ mov cl, r10b
+ sub cl, 160
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_10
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_10
+L_mldsa_use_hint_88_avx2_hints_done_1_10:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1664], ymm4
+ vmovdqu YMMWORD PTR [rdx+1696], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1728]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1760]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_11:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_11
+ mov cl, r10b
+ sub cl, 176
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_11
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_11
+L_mldsa_use_hint_88_avx2_hints_done_1_11:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1728], ymm4
+ vmovdqu YMMWORD PTR [rdx+1760], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1792]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1824]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_12:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_12
+ mov cl, r10b
+ sub cl, 192
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_12
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_12
+L_mldsa_use_hint_88_avx2_hints_done_1_12:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1792], ymm4
+ vmovdqu YMMWORD PTR [rdx+1824], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1856]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1888]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_13:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_13
+ mov cl, r10b
+ sub cl, 208
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_13
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_13
+L_mldsa_use_hint_88_avx2_hints_done_1_13:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1856], ymm4
+ vmovdqu YMMWORD PTR [rdx+1888], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1920]
+ vmovdqu ymm1, YMMWORD PTR [rdx+1952]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_14:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_14
+ mov cl, r10b
+ sub cl, 224
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_14
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_14
+L_mldsa_use_hint_88_avx2_hints_done_1_14:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1920], ymm4
+ vmovdqu YMMWORD PTR [rdx+1952], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+1984]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2016]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_1_15:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_15
+ mov cl, r10b
+ sub cl, 240
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_1_15
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_1_15
+L_mldsa_use_hint_88_avx2_hints_done_1_15:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+1984], ymm4
+ vmovdqu YMMWORD PTR [rdx+2016], ymm5
+ ; 3/4 vectors
+ mov r8b, [rax+82]
+ vmovdqu ymm0, YMMWORD PTR [rdx+2048]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2080]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_0:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_0
+ mov cl, r10b
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_0
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_0
+L_mldsa_use_hint_88_avx2_hints_done_2_0:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2048], ymm4
+ vmovdqu YMMWORD PTR [rdx+2080], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2112]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2144]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_1:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_1
+ mov cl, r10b
+ sub cl, 16
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_1
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_1
+L_mldsa_use_hint_88_avx2_hints_done_2_1:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2112], ymm4
+ vmovdqu YMMWORD PTR [rdx+2144], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2176]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2208]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_2:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_2
+ mov cl, r10b
+ sub cl, 32
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_2
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_2
+L_mldsa_use_hint_88_avx2_hints_done_2_2:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2176], ymm4
+ vmovdqu YMMWORD PTR [rdx+2208], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2240]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2272]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_3:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_3
+ mov cl, r10b
+ sub cl, 48
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_3
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_3
+L_mldsa_use_hint_88_avx2_hints_done_2_3:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2240], ymm4
+ vmovdqu YMMWORD PTR [rdx+2272], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2304]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2336]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_4:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_4
+ mov cl, r10b
+ sub cl, 64
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_4
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_4
+L_mldsa_use_hint_88_avx2_hints_done_2_4:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2304], ymm4
+ vmovdqu YMMWORD PTR [rdx+2336], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2368]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2400]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_5:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_5
+ mov cl, r10b
+ sub cl, 80
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_5
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_5
+L_mldsa_use_hint_88_avx2_hints_done_2_5:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2368], ymm4
+ vmovdqu YMMWORD PTR [rdx+2400], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2432]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2464]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_6:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_6
+ mov cl, r10b
+ sub cl, 96
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_6
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_6
+L_mldsa_use_hint_88_avx2_hints_done_2_6:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2432], ymm4
+ vmovdqu YMMWORD PTR [rdx+2464], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2496]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2528]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_7:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_7
+ mov cl, r10b
+ sub cl, 112
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_7
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_7
+L_mldsa_use_hint_88_avx2_hints_done_2_7:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2496], ymm4
+ vmovdqu YMMWORD PTR [rdx+2528], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2560]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2592]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_8:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_8
+ mov cl, r10b
+ sub cl, 128
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_8
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_8
+L_mldsa_use_hint_88_avx2_hints_done_2_8:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2560], ymm4
+ vmovdqu YMMWORD PTR [rdx+2592], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2624]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2656]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_9:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_9
+ mov cl, r10b
+ sub cl, 144
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_9
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_9
+L_mldsa_use_hint_88_avx2_hints_done_2_9:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2624], ymm4
+ vmovdqu YMMWORD PTR [rdx+2656], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2688]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2720]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_10:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_10
+ mov cl, r10b
+ sub cl, 160
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_10
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_10
+L_mldsa_use_hint_88_avx2_hints_done_2_10:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2688], ymm4
+ vmovdqu YMMWORD PTR [rdx+2720], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2752]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2784]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_11:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_11
+ mov cl, r10b
+ sub cl, 176
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_11
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_11
+L_mldsa_use_hint_88_avx2_hints_done_2_11:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2752], ymm4
+ vmovdqu YMMWORD PTR [rdx+2784], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2816]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2848]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_12:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_12
+ mov cl, r10b
+ sub cl, 192
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_12
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_12
+L_mldsa_use_hint_88_avx2_hints_done_2_12:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2816], ymm4
+ vmovdqu YMMWORD PTR [rdx+2848], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2880]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2912]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_13:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_13
+ mov cl, r10b
+ sub cl, 208
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_13
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_13
+L_mldsa_use_hint_88_avx2_hints_done_2_13:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2880], ymm4
+ vmovdqu YMMWORD PTR [rdx+2912], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+2944]
+ vmovdqu ymm1, YMMWORD PTR [rdx+2976]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_14:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_14
+ mov cl, r10b
+ sub cl, 224
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_14
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_14
+L_mldsa_use_hint_88_avx2_hints_done_2_14:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+2944], ymm4
+ vmovdqu YMMWORD PTR [rdx+2976], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3008]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3040]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_2_15:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_15
+ mov cl, r10b
+ sub cl, 240
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_2_15
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_2_15
+L_mldsa_use_hint_88_avx2_hints_done_2_15:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3008], ymm4
+ vmovdqu YMMWORD PTR [rdx+3040], ymm5
+ ; 4/4 vectors
+ mov r8b, [rax+83]
+ vmovdqu ymm0, YMMWORD PTR [rdx+3072]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3104]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_0:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_0
+ mov cl, r10b
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_0
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_0
+L_mldsa_use_hint_88_avx2_hints_done_3_0:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3072], ymm4
+ vmovdqu YMMWORD PTR [rdx+3104], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3136]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3168]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_1:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_1
+ mov cl, r10b
+ sub cl, 16
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_1
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_1
+L_mldsa_use_hint_88_avx2_hints_done_3_1:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3136], ymm4
+ vmovdqu YMMWORD PTR [rdx+3168], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3200]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3232]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_2:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_2
+ mov cl, r10b
+ sub cl, 32
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_2
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_2
+L_mldsa_use_hint_88_avx2_hints_done_3_2:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3200], ymm4
+ vmovdqu YMMWORD PTR [rdx+3232], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3264]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3296]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_3:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_3
+ mov cl, r10b
+ sub cl, 48
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_3
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_3
+L_mldsa_use_hint_88_avx2_hints_done_3_3:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3264], ymm4
+ vmovdqu YMMWORD PTR [rdx+3296], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3328]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3360]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_4:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_4
+ mov cl, r10b
+ sub cl, 64
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_4
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_4
+L_mldsa_use_hint_88_avx2_hints_done_3_4:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3328], ymm4
+ vmovdqu YMMWORD PTR [rdx+3360], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3392]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3424]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_5:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_5
+ mov cl, r10b
+ sub cl, 80
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_5
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_5
+L_mldsa_use_hint_88_avx2_hints_done_3_5:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3392], ymm4
+ vmovdqu YMMWORD PTR [rdx+3424], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3456]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3488]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_6:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_6
+ mov cl, r10b
+ sub cl, 96
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_6
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_6
+L_mldsa_use_hint_88_avx2_hints_done_3_6:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3456], ymm4
+ vmovdqu YMMWORD PTR [rdx+3488], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3520]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3552]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_7:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_7
+ mov cl, r10b
+ sub cl, 112
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_7
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_7
+L_mldsa_use_hint_88_avx2_hints_done_3_7:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3520], ymm4
+ vmovdqu YMMWORD PTR [rdx+3552], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3584]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3616]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_8:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_8
+ mov cl, r10b
+ sub cl, 128
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_8
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_8
+L_mldsa_use_hint_88_avx2_hints_done_3_8:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3584], ymm4
+ vmovdqu YMMWORD PTR [rdx+3616], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3648]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3680]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_9:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_9
+ mov cl, r10b
+ sub cl, 144
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_9
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_9
+L_mldsa_use_hint_88_avx2_hints_done_3_9:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3648], ymm4
+ vmovdqu YMMWORD PTR [rdx+3680], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3712]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3744]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_10:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_10
+ mov cl, r10b
+ sub cl, 160
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_10
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_10
+L_mldsa_use_hint_88_avx2_hints_done_3_10:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3712], ymm4
+ vmovdqu YMMWORD PTR [rdx+3744], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3776]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3808]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_11:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_11
+ mov cl, r10b
+ sub cl, 176
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_11
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_11
+L_mldsa_use_hint_88_avx2_hints_done_3_11:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3776], ymm4
+ vmovdqu YMMWORD PTR [rdx+3808], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3840]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3872]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_12:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_12
+ mov cl, r10b
+ sub cl, 192
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_12
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_12
+L_mldsa_use_hint_88_avx2_hints_done_3_12:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3840], ymm4
+ vmovdqu YMMWORD PTR [rdx+3872], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3904]
+ vmovdqu ymm1, YMMWORD PTR [rdx+3936]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_13:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_13
+ mov cl, r10b
+ sub cl, 208
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_13
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_13
+L_mldsa_use_hint_88_avx2_hints_done_3_13:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3904], ymm4
+ vmovdqu YMMWORD PTR [rdx+3936], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+3968]
+ vmovdqu ymm1, YMMWORD PTR [rdx+4000]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_14:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_14
+ mov cl, r10b
+ sub cl, 224
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_14
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_14
+L_mldsa_use_hint_88_avx2_hints_done_3_14:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+3968], ymm4
+ vmovdqu YMMWORD PTR [rdx+4000], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+4032]
+ vmovdqu ymm1, YMMWORD PTR [rdx+4064]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpmulld ymm4, ymm0, ymm12
+ vpmulld ymm5, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm11
+ vpaddd ymm5, ymm5, ymm11
+ vpsrld ymm4, ymm4, 23
+ vpsrld ymm5, ymm5, 23
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpcmpeqd ymm0, ymm4, ymm12
+ vpcmpeqd ymm1, ymm5, ymm12
+ vpaddd ymm2, ymm2, ymm0
+ vpaddd ymm3, ymm3, ymm1
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ mov r11, 1
+ xor r12, r12
+ xor rcx, rcx
+L_mldsa_use_hint_88_avx2_hints_next_3_15:
+ cmp r9b, r8b
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_15
+ mov cl, r10b
+ sub cl, 240
+ cmp rcx, 16
+ jge L_mldsa_use_hint_88_avx2_hints_done_3_15
+ mov r13, r11
+ shl r13, cl
+ or r12, r13
+ inc r9b
+ mov r10b, [rax+r9]
+ jmp L_mldsa_use_hint_88_avx2_hints_next_3_15
+L_mldsa_use_hint_88_avx2_hints_done_3_15:
+ movd xmm6, r12d
+ shr r12, 8
+ movd xmm7, r12d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpcmpgtd ymm0, ymm12, ymm4
+ vpcmpgtd ymm1, ymm12, ymm5
+ vpand ymm4, ymm4, ymm0
+ vpand ymm5, ymm5, ymm1
+ vpsrad ymm0, ymm4, 31
+ vpsrad ymm1, ymm5, 31
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpaddd ymm4, ymm4, ymm0
+ vpaddd ymm5, ymm5, ymm1
+ vmovdqu YMMWORD PTR [rdx+4032], ymm4
+ vmovdqu YMMWORD PTR [rdx+4064], ymm5
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ add rsp, 144
+ pop r13
+ pop r12
+ ret
+wc_mldsa_use_hint_88_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT  ; Constant for wc_mldsa_use_hint_32_avx2: the value q in all eight 32-bit lanes of one YMM register.
+ALIGN 16  ; The table is 32 bytes; the proc reads it with vmovdqu (unaligned load), so 16-byte alignment is sufficient.
+L_mldsa_use_hint_32_avx2_q DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h  ; 007fe001h = 8380417 = 2^23 - 2^13 + 1 (the ML-DSA modulus q of FIPS 204)
+ DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h  ; Loaded into ymm8; ANDed with each coefficient's sign mask (vpsrad 31) and added back, i.e. q is added to negative lanes.
+ptr_L_mldsa_use_hint_32_avx2_q QWORD L_mldsa_use_hint_32_avx2_q  ; Address of the table; the proc's constant loads use the label directly rather than this pointer.
+_DATA ENDS
+_DATA SEGMENT  ; Constant for wc_mldsa_use_hint_32_avx2: remainder threshold 261888 in all eight 32-bit lanes.
+ALIGN 16  ; The table is 32 bytes; the proc reads it with vmovdqu (unaligned load), so 16-byte alignment is sufficient.
+L_mldsa_use_hint_32_avx2_q_low_32 DWORD 0003ff00h, 0003ff00h, 0003ff00h, 0003ff00h  ; 0003ff00h = 261888 = (8380417 - 1) / 32; presumably gamma2 of FIPS 204 UseHint - confirm against the C implementation
+ DWORD 0003ff00h, 0003ff00h, 0003ff00h, 0003ff00h  ; Loaded into ymm9; the proc computes (ymm9 - remainder) and adds its sign bit to the quotient, bumping it when remainder > 261888.
+ptr_L_mldsa_use_hint_32_avx2_q_low_32 QWORD L_mldsa_use_hint_32_avx2_q_low_32  ; Address of the table; the proc's constant loads use the label directly rather than this pointer.
+_DATA ENDS
+_DATA SEGMENT  ; Constant for wc_mldsa_use_hint_32_avx2: divisor 523776 in all eight 32-bit lanes.
+ALIGN 16  ; The table is 32 bytes; the proc reads it with vmovdqu (unaligned load), so 16-byte alignment is sufficient.
+L_mldsa_use_hint_32_avx2_q_low_32_2 DWORD 0007fe00h, 0007fe00h, 0007fe00h, 0007fe00h  ; 0007fe00h = 523776 = 2 * 261888 = (8380417 - 1) / 16
+ DWORD 0007fe00h, 0007fe00h, 0007fe00h, 0007fe00h  ; Loaded into ymm10; multiplied by the quotient estimate (vpmulld) and subtracted from the coefficient to form the remainder.
+ptr_L_mldsa_use_hint_32_avx2_q_low_32_2 QWORD L_mldsa_use_hint_32_avx2_q_low_32_2  ; Address of the table; the proc's constant loads use the label directly rather than this pointer.
+_DATA ENDS
+_DATA SEGMENT  ; Constant for wc_mldsa_use_hint_32_avx2: rounding bias 261887 in all eight 32-bit lanes.
+ALIGN 16  ; The table is 32 bytes; the proc reads it with vmovdqu (unaligned load), so 16-byte alignment is sufficient.
+L_mldsa_use_hint_32_avx2_q_low_32_m1 DWORD 0003feffh, 0003feffh, 0003feffh, 0003feffh  ; 0003feffh = 261887 = 261888 - 1
+ DWORD 0003feffh, 0003feffh, 0003feffh, 0003feffh  ; Loaded into ymm11; added to the coefficient before the logical right shift by 19 that yields the first quotient estimate.
+ptr_L_mldsa_use_hint_32_avx2_q_low_32_m1 QWORD L_mldsa_use_hint_32_avx2_q_low_32_m1  ; Address of the table; the proc's constant loads use the label directly rather than this pointer.
+_DATA ENDS
+_DATA SEGMENT  ; Constant for wc_mldsa_use_hint_32_avx2: 4-bit mask in all eight 32-bit lanes.
+ALIGN 16  ; The table is 32 bytes; the proc reads it with vmovdqu (unaligned load), so 16-byte alignment is sufficient.
+L_mldsa_use_hint_32_avx2_mask DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh  ; 0fh = 15: keeps only the low 4 bits of a lane (value reduced to 0..15)
+ DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh  ; Loaded into ymm12; ANDed with the quotient both before and after the +1/-1 hint adjustment is added.
+ptr_L_mldsa_use_hint_32_avx2_mask QWORD L_mldsa_use_hint_32_avx2_mask  ; Address of the table; the proc's constant loads use the label directly rather than this pointer.
+_DATA ENDS
+_DATA SEGMENT  ; Constant for wc_mldsa_use_hint_32_avx2: per-lane left-shift counts used to expand a hint bitmask into lane masks.
+ALIGN 16  ; The table is 32 bytes; the proc reads it with vmovdqu (unaligned load), so 16-byte alignment is sufficient.
+L_mldsa_use_hint_32_avx2_vsl DWORD 0000001fh, 0000001eh, 0000001dh, 0000001ch  ; lanes 0-3 shift by 31, 30, 29, 28: lane i shifts by (31 - i), moving bit i into bit 31
+ DWORD 0000001bh, 0000001ah, 00000019h, 00000018h  ; lanes 4-7 shift by 27..24. Loaded into ymm13 for vpsllvd on the broadcast bitmask; a following vpsrad 31 makes lane i all-ones when bit i was set.
+ptr_L_mldsa_use_hint_32_avx2_vsl QWORD L_mldsa_use_hint_32_avx2_vsl  ; Address of the table; the proc's constant loads use the label directly rather than this pointer.
+_DATA ENDS
+_DATA SEGMENT  ; Constant for wc_mldsa_use_hint_32_avx2: the value 1 in all eight 32-bit lanes.
+ALIGN 16  ; The table is 32 bytes; the proc reads it with vmovdqu (unaligned load), so 16-byte alignment is sufficient.
+L_mldsa_use_hint_32_avx2_one DWORD 00000001h, 00000001h, 00000001h, 00000001h  ; Loaded into ymm14 as the minuend of "1 - 2 * signbit"
+ DWORD 00000001h, 00000001h, 00000001h, 00000001h  ; The proc isolates a lane's sign bit (vpsrld 31), doubles it (vpslld 1) and subtracts it from ymm14, giving +1 or -1 per lane.
+ptr_L_mldsa_use_hint_32_avx2_one QWORD L_mldsa_use_hint_32_avx2_one  ; Address of the table; the proc's constant loads use the label directly rather than this pointer.
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_use_hint_32_avx2 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ mov rax, rdx
+ mov rdx, rcx
+ sub rsp, 144
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vpxor ymm8, ymm8, ymm8
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_use_hint_32_avx2_q
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_use_hint_32_avx2_q_low_32
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_use_hint_32_avx2_q_low_32_2
+ vmovdqu ymm11, YMMWORD PTR L_mldsa_use_hint_32_avx2_q_low_32_m1
+ vmovdqu ymm12, YMMWORD PTR L_mldsa_use_hint_32_avx2_mask
+ vmovdqu ymm13, YMMWORD PTR L_mldsa_use_hint_32_avx2_vsl
+ vmovdqu ymm14, YMMWORD PTR L_mldsa_use_hint_32_avx2_one
+ xor r10, r10
+ mov r11b, [r8]
+ imul r15, rax, 10
+ sub r15, 5
+L_mldsa_use_hint_32_avx2_start_256:
+ mov r9b, [r8+r15]
+ inc r15
+ vmovdqu ymm0, YMMWORD PTR [rdx]
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__0:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__0
+ mov cl, r11b
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__0
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__0
+L_mldsa_use_hint_32_avx2_hints_done__0:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx], ymm4
+ vmovdqu YMMWORD PTR [rdx+32], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+64]
+ vmovdqu ymm1, YMMWORD PTR [rdx+96]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__1:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__1
+ mov cl, r11b
+ sub cl, 16
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__1
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__1
+L_mldsa_use_hint_32_avx2_hints_done__1:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+64], ymm4
+ vmovdqu YMMWORD PTR [rdx+96], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+128]
+ vmovdqu ymm1, YMMWORD PTR [rdx+160]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__2:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__2
+ mov cl, r11b
+ sub cl, 32
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__2
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__2
+L_mldsa_use_hint_32_avx2_hints_done__2:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+128], ymm4
+ vmovdqu YMMWORD PTR [rdx+160], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+192]
+ vmovdqu ymm1, YMMWORD PTR [rdx+224]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__3:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__3
+ mov cl, r11b
+ sub cl, 48
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__3
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__3
+L_mldsa_use_hint_32_avx2_hints_done__3:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+192], ymm4
+ vmovdqu YMMWORD PTR [rdx+224], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+256]
+ vmovdqu ymm1, YMMWORD PTR [rdx+288]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__4:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__4
+ mov cl, r11b
+ sub cl, 64
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__4
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__4
+L_mldsa_use_hint_32_avx2_hints_done__4:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+256], ymm4
+ vmovdqu YMMWORD PTR [rdx+288], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+320]
+ vmovdqu ymm1, YMMWORD PTR [rdx+352]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__5:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__5
+ mov cl, r11b
+ sub cl, 80
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__5
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__5
+L_mldsa_use_hint_32_avx2_hints_done__5:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+320], ymm4
+ vmovdqu YMMWORD PTR [rdx+352], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+384]
+ vmovdqu ymm1, YMMWORD PTR [rdx+416]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__6:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__6
+ mov cl, r11b
+ sub cl, 96
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__6
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__6
+L_mldsa_use_hint_32_avx2_hints_done__6:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+384], ymm4
+ vmovdqu YMMWORD PTR [rdx+416], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+448]
+ vmovdqu ymm1, YMMWORD PTR [rdx+480]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__7:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__7
+ mov cl, r11b
+ sub cl, 112
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__7
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__7
+L_mldsa_use_hint_32_avx2_hints_done__7:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+448], ymm4
+ vmovdqu YMMWORD PTR [rdx+480], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+512]
+ vmovdqu ymm1, YMMWORD PTR [rdx+544]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__8:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__8
+ mov cl, r11b
+ sub cl, 128
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__8
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__8
+L_mldsa_use_hint_32_avx2_hints_done__8:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+512], ymm4
+ vmovdqu YMMWORD PTR [rdx+544], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+576]
+ vmovdqu ymm1, YMMWORD PTR [rdx+608]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__9:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__9
+ mov cl, r11b
+ sub cl, 144
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__9
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__9
+L_mldsa_use_hint_32_avx2_hints_done__9:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+576], ymm4
+ vmovdqu YMMWORD PTR [rdx+608], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+640]
+ vmovdqu ymm1, YMMWORD PTR [rdx+672]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__10:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__10
+ mov cl, r11b
+ sub cl, 160
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__10
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__10
+L_mldsa_use_hint_32_avx2_hints_done__10:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+640], ymm4
+ vmovdqu YMMWORD PTR [rdx+672], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+704]
+ vmovdqu ymm1, YMMWORD PTR [rdx+736]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__11:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__11
+ mov cl, r11b
+ sub cl, 176
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__11
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__11
+L_mldsa_use_hint_32_avx2_hints_done__11:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+704], ymm4
+ vmovdqu YMMWORD PTR [rdx+736], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+768]
+ vmovdqu ymm1, YMMWORD PTR [rdx+800]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__12:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__12
+ mov cl, r11b
+ sub cl, 192
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__12
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__12
+L_mldsa_use_hint_32_avx2_hints_done__12:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+768], ymm4
+ vmovdqu YMMWORD PTR [rdx+800], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+832]
+ vmovdqu ymm1, YMMWORD PTR [rdx+864]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__13:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__13
+ mov cl, r11b
+ sub cl, 208
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__13
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__13
+L_mldsa_use_hint_32_avx2_hints_done__13:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+832], ymm4
+ vmovdqu YMMWORD PTR [rdx+864], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+896]
+ vmovdqu ymm1, YMMWORD PTR [rdx+928]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__14:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__14
+ mov cl, r11b
+ sub cl, 224
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__14
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__14
+L_mldsa_use_hint_32_avx2_hints_done__14:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+896], ymm4
+ vmovdqu YMMWORD PTR [rdx+928], ymm5
+ vmovdqu ymm0, YMMWORD PTR [rdx+960]
+ vmovdqu ymm1, YMMWORD PTR [rdx+992]
+ vpsrad ymm2, ymm0, 31
+ vpsrad ymm3, ymm1, 31
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm4, ymm0, ymm11
+ vpaddd ymm5, ymm1, ymm11
+ vpsrld ymm4, ymm4, 19
+ vpsrld ymm5, ymm5, 19
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsubd ymm2, ymm9, ymm2
+ vpsubd ymm3, ymm9, ymm3
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpmulld ymm2, ymm4, ymm10
+ vpmulld ymm3, ymm5, ymm10
+ vpsubd ymm2, ymm0, ymm2
+ vpsubd ymm3, ymm1, ymm3
+ vpsrld ymm0, ymm4, 4
+ vpsrld ymm1, ymm5, 4
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm1
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ mov r12, 1
+ xor r13, r13
+ xor rcx, rcx
+L_mldsa_use_hint_32_avx2_hints_next__15:
+ cmp r10b, r9b
+ jge L_mldsa_use_hint_32_avx2_hints_done__15
+ mov cl, r11b
+ sub cl, 240
+ cmp rcx, 16
+ jge L_mldsa_use_hint_32_avx2_hints_done__15
+ mov r14, r12
+ shl r14, cl
+ or r13, r14
+ inc r10b
+ mov r11b, [r8+r10]
+ jmp L_mldsa_use_hint_32_avx2_hints_next__15
+L_mldsa_use_hint_32_avx2_hints_done__15:
+ movd xmm6, r13d
+ shr r13, 8
+ movd xmm7, r13d
+ vpbroadcastd ymm6, xmm6
+ vpbroadcastd ymm7, xmm7
+ vpsllvd ymm6, ymm6, ymm13
+ vpsllvd ymm7, ymm7, ymm13
+ vpsrad ymm6, ymm6, 31
+ vpsrad ymm7, ymm7, 31
+ vpsrld ymm2, ymm2, 31
+ vpsrld ymm3, ymm3, 31
+ vpslld ymm2, ymm2, 1
+ vpslld ymm3, ymm3, 1
+ vpsubd ymm2, ymm14, ymm2
+ vpsubd ymm3, ymm14, ymm3
+ vpand ymm2, ymm2, ymm6
+ vpand ymm3, ymm3, ymm7
+ vpaddd ymm4, ymm4, ymm2
+ vpaddd ymm5, ymm5, ymm3
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vmovdqu YMMWORD PTR [rdx+960], ymm4
+ vmovdqu YMMWORD PTR [rdx+992], ymm5
+ add rdx, 1024
+ sub rax, 1
+ jne L_mldsa_use_hint_32_avx2_start_256
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ add rsp, 144
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+wc_mldsa_use_hint_32_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_vec_check_low_avx2 PROC ; Range check: rax = 1 if every signed dword v read from rcx satisfies -bound < v < bound (bound = r8d on entry), else rax = 0
+ sub rsp, 32 ; reserve 32 bytes to preserve xmm6/xmm7, which are non-volatile in the Microsoft x64 calling convention
+ vmovdqu OWORD PTR [rsp], xmm6 ; save low 128 bits of xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7 ; save low 128 bits of xmm7
+ sub r8d, 1 ; r8d = bound - 1
+ movd xmm2, r8d ; low dword of xmm2 = bound - 1
+ neg r8d ; r8d = -(bound - 1) = 1 - bound
+ movd xmm3, r8d ; low dword of xmm3 = 1 - bound
+ vpbroadcastd ymm2, xmm2 ; ymm2 = bound - 1 in all 8 dword lanes (upper limit)
+ vpbroadcastd ymm3, xmm3 ; ymm3 = 1 - bound in all 8 dword lanes (lower limit)
+L_mldsa_vec_check_low_vx2_start_256: ; loop top: each iteration checks 1024 bytes (256 dwords) at rcx; rdx counts iterations left
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; group 0: dwords 0..7
+ vmovdqu ymm1, YMMWORD PTR [rcx+32] ; group 0: dwords 8..15
+ vpcmpgtd ymm4, ymm0, ymm2 ; lane = all-ones where v > bound - 1, i.e. v >= bound (signed compare)
+ vpcmpgtd ymm5, ymm1, ymm2 ; same upper-limit test for the second 8 values
+ vpcmpgtd ymm6, ymm3, ymm0 ; lane = all-ones where 1 - bound > v, i.e. v <= -bound
+ vpcmpgtd ymm7, ymm3, ymm1 ; same lower-limit test for the second 8 values
+ vpor ymm4, ymm4, ymm5 ; merge upper-limit violations
+ vpor ymm6, ymm6, ymm7 ; merge lower-limit violations
+ vpor ymm4, ymm4, ymm6 ; ymm4 is non-zero iff any of the 16 values is out of range
+ vpmovmskb rax, ymm4 ; gather the top bit of every byte: non-zero iff a violation occurred
+ cmp rax, 0 ; set ZF from the violation mask
+ mov rax, 0 ; preload the failure return value; MOV does not modify flags, so ZF from the cmp survives
+ jne L_mldsa_vec_check_low_vx2_done ; violation found: leave early with rax = 0
+ vmovdqu ymm0, YMMWORD PTR [rcx+64] ; group 1: dwords 16..31, same check as group 0
+ vmovdqu ymm1, YMMWORD PTR [rcx+96]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+128] ; group 2: dwords 32..47
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+192] ; group 3: dwords 48..63
+ vmovdqu ymm1, YMMWORD PTR [rcx+224]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+256] ; group 4: dwords 64..79
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+320] ; group 5: dwords 80..95
+ vmovdqu ymm1, YMMWORD PTR [rcx+352]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+384] ; group 6: dwords 96..111
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+448] ; group 7: dwords 112..127
+ vmovdqu ymm1, YMMWORD PTR [rcx+480]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+512] ; group 8: dwords 128..143
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+576] ; group 9: dwords 144..159
+ vmovdqu ymm1, YMMWORD PTR [rcx+608]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+640] ; group 10: dwords 160..175
+ vmovdqu ymm1, YMMWORD PTR [rcx+672]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+704] ; group 11: dwords 176..191
+ vmovdqu ymm1, YMMWORD PTR [rcx+736]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+768] ; group 12: dwords 192..207
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+832] ; group 13: dwords 208..223
+ vmovdqu ymm1, YMMWORD PTR [rcx+864]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+896] ; group 14: dwords 224..239
+ vmovdqu ymm1, YMMWORD PTR [rcx+928]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+960] ; group 15: dwords 240..255 (last group of the 1024-byte block)
+ vmovdqu ymm1, YMMWORD PTR [rcx+992]
+ vpcmpgtd ymm4, ymm0, ymm2
+ vpcmpgtd ymm5, ymm1, ymm2
+ vpcmpgtd ymm6, ymm3, ymm0
+ vpcmpgtd ymm7, ymm3, ymm1
+ vpor ymm4, ymm4, ymm5
+ vpor ymm6, ymm6, ymm7
+ vpor ymm4, ymm4, ymm6
+ vpmovmskb rax, ymm4
+ cmp rax, 0
+ mov rax, 0
+ jne L_mldsa_vec_check_low_vx2_done
+ add rcx, 1024 ; advance to the next 256-dword block
+ sub rdx, 1 ; one block done; NOTE(review): the loop tests after the body, so rdx = 0 on entry would wrap - presumably callers always pass >= 1, confirm
+ jne L_mldsa_vec_check_low_vx2_start_256 ; more blocks remain
+ mov rax, 1 ; every value in every block was in range
+L_mldsa_vec_check_low_vx2_done: ; common exit: rax already holds the result (0 = out of range, 1 = all in range)
+ vzeroupper ; zero the upper halves of all YMM registers before returning
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore caller's xmm6
+ vmovdqu xmm7, OWORD PTR [rsp+16] ; restore caller's xmm7
+ add rsp, 32 ; release the save area
+ ret
+wc_mldsa_vec_check_low_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_poly_add_avx2 PROC  ; In-place add of 256 dwords (1024 bytes): [rcx][i] += [rdx][i]; rcx/rdx are the first two args in the Microsoft x64 convention; no modular reduction is done here
+ sub rsp, 160  ; reserve 10 x 16 bytes of stack for xmm6-xmm15
+ vmovdqu OWORD PTR [rsp], xmm6  ; spill xmm6-xmm15 (callee-saved on Win64); they are reloaded before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vmovdqu ymm0, YMMWORD PTR [rcx]  ; chunk 1 of 4: bytes 0-255 of the [rcx] operand -> ymm0-ymm7
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ vmovdqu ymm8, YMMWORD PTR [rdx]  ; matching 256 bytes of the [rdx] operand -> ymm8-ymm15
+ vmovdqu ymm9, YMMWORD PTR [rdx+32]
+ vmovdqu ymm10, YMMWORD PTR [rdx+64]
+ vmovdqu ymm11, YMMWORD PTR [rdx+96]
+ vmovdqu ymm12, YMMWORD PTR [rdx+128]
+ vmovdqu ymm13, YMMWORD PTR [rdx+160]
+ vmovdqu ymm14, YMMWORD PTR [rdx+192]
+ vmovdqu ymm15, YMMWORD PTR [rdx+224]
+ vpaddd ymm0, ymm0, ymm8  ; lane-wise 32-bit add (wraps on overflow)
+ vpaddd ymm1, ymm1, ymm9
+ vpaddd ymm2, ymm2, ymm10
+ vpaddd ymm3, ymm3, ymm11
+ vpaddd ymm4, ymm4, ymm12
+ vpaddd ymm5, ymm5, ymm13
+ vpaddd ymm6, ymm6, ymm14
+ vpaddd ymm7, ymm7, ymm15
+ vmovdqu YMMWORD PTR [rcx], ymm0  ; write the sums back over the [rcx] operand
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ vmovdqu YMMWORD PTR [rcx+160], ymm5
+ vmovdqu YMMWORD PTR [rcx+192], ymm6
+ vmovdqu YMMWORD PTR [rcx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]  ; chunk 2 of 4: bytes 256-511, same load/add/store pattern
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ vmovdqu ymm8, YMMWORD PTR [rdx+256]
+ vmovdqu ymm9, YMMWORD PTR [rdx+288]
+ vmovdqu ymm10, YMMWORD PTR [rdx+320]
+ vmovdqu ymm11, YMMWORD PTR [rdx+352]
+ vmovdqu ymm12, YMMWORD PTR [rdx+384]
+ vmovdqu ymm13, YMMWORD PTR [rdx+416]
+ vmovdqu ymm14, YMMWORD PTR [rdx+448]
+ vmovdqu ymm15, YMMWORD PTR [rdx+480]
+ vpaddd ymm0, ymm0, ymm8
+ vpaddd ymm1, ymm1, ymm9
+ vpaddd ymm2, ymm2, ymm10
+ vpaddd ymm3, ymm3, ymm11
+ vpaddd ymm4, ymm4, ymm12
+ vpaddd ymm5, ymm5, ymm13
+ vpaddd ymm6, ymm6, ymm14
+ vpaddd ymm7, ymm7, ymm15
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vmovdqu YMMWORD PTR [rcx+384], ymm4
+ vmovdqu YMMWORD PTR [rcx+416], ymm5
+ vmovdqu YMMWORD PTR [rcx+448], ymm6
+ vmovdqu YMMWORD PTR [rcx+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]  ; chunk 3 of 4: bytes 512-767
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vmovdqu ymm4, YMMWORD PTR [rcx+640]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+704]
+ vmovdqu ymm7, YMMWORD PTR [rcx+736]
+ vmovdqu ymm8, YMMWORD PTR [rdx+512]
+ vmovdqu ymm9, YMMWORD PTR [rdx+544]
+ vmovdqu ymm10, YMMWORD PTR [rdx+576]
+ vmovdqu ymm11, YMMWORD PTR [rdx+608]
+ vmovdqu ymm12, YMMWORD PTR [rdx+640]
+ vmovdqu ymm13, YMMWORD PTR [rdx+672]
+ vmovdqu ymm14, YMMWORD PTR [rdx+704]
+ vmovdqu ymm15, YMMWORD PTR [rdx+736]
+ vpaddd ymm0, ymm0, ymm8
+ vpaddd ymm1, ymm1, ymm9
+ vpaddd ymm2, ymm2, ymm10
+ vpaddd ymm3, ymm3, ymm11
+ vpaddd ymm4, ymm4, ymm12
+ vpaddd ymm5, ymm5, ymm13
+ vpaddd ymm6, ymm6, ymm14
+ vpaddd ymm7, ymm7, ymm15
+ vmovdqu YMMWORD PTR [rcx+512], ymm0
+ vmovdqu YMMWORD PTR [rcx+544], ymm1
+ vmovdqu YMMWORD PTR [rcx+576], ymm2
+ vmovdqu YMMWORD PTR [rcx+608], ymm3
+ vmovdqu YMMWORD PTR [rcx+640], ymm4
+ vmovdqu YMMWORD PTR [rcx+672], ymm5
+ vmovdqu YMMWORD PTR [rcx+704], ymm6
+ vmovdqu YMMWORD PTR [rcx+736], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]  ; chunk 4 of 4: bytes 768-1023
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vmovdqu ymm4, YMMWORD PTR [rcx+896]
+ vmovdqu ymm5, YMMWORD PTR [rcx+928]
+ vmovdqu ymm6, YMMWORD PTR [rcx+960]
+ vmovdqu ymm7, YMMWORD PTR [rcx+992]
+ vmovdqu ymm8, YMMWORD PTR [rdx+768]
+ vmovdqu ymm9, YMMWORD PTR [rdx+800]
+ vmovdqu ymm10, YMMWORD PTR [rdx+832]
+ vmovdqu ymm11, YMMWORD PTR [rdx+864]
+ vmovdqu ymm12, YMMWORD PTR [rdx+896]
+ vmovdqu ymm13, YMMWORD PTR [rdx+928]
+ vmovdqu ymm14, YMMWORD PTR [rdx+960]
+ vmovdqu ymm15, YMMWORD PTR [rdx+992]
+ vpaddd ymm0, ymm0, ymm8
+ vpaddd ymm1, ymm1, ymm9
+ vpaddd ymm2, ymm2, ymm10
+ vpaddd ymm3, ymm3, ymm11
+ vpaddd ymm4, ymm4, ymm12
+ vpaddd ymm5, ymm5, ymm13
+ vpaddd ymm6, ymm6, ymm14
+ vpaddd ymm7, ymm7, ymm15
+ vmovdqu YMMWORD PTR [rcx+768], ymm0
+ vmovdqu YMMWORD PTR [rcx+800], ymm1
+ vmovdqu YMMWORD PTR [rcx+832], ymm2
+ vmovdqu YMMWORD PTR [rcx+864], ymm3
+ vmovdqu YMMWORD PTR [rcx+896], ymm4
+ vmovdqu YMMWORD PTR [rcx+928], ymm5
+ vmovdqu YMMWORD PTR [rcx+960], ymm6
+ vmovdqu YMMWORD PTR [rcx+992], ymm7
+ vmovdqu xmm6, OWORD PTR [rsp]  ; reload the spilled xmm6-xmm15
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160  ; release the spill area
+ ret  ; NOTE(review): returns without vzeroupper although ymm0-ymm5 were written; wc_mldsa_vec_check_low_avx2 issues one before returning - confirm this is intended
+wc_mldsa_poly_add_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_poly_sub_avx2 PROC  ; In-place subtract of 256 dwords (1024 bytes): [rcx][i] -= [rdx][i]; rcx/rdx are the first two args in the Microsoft x64 convention; no modular reduction is done here
+ sub rsp, 160  ; reserve 10 x 16 bytes of stack for xmm6-xmm15
+ vmovdqu OWORD PTR [rsp], xmm6  ; spill xmm6-xmm15 (callee-saved on Win64); they are reloaded before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vmovdqu ymm0, YMMWORD PTR [rcx]  ; chunk 1 of 4: bytes 0-255 of the [rcx] operand (minuend) -> ymm0-ymm7
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ vmovdqu ymm8, YMMWORD PTR [rdx]  ; matching 256 bytes of the [rdx] operand (subtrahend) -> ymm8-ymm15
+ vmovdqu ymm9, YMMWORD PTR [rdx+32]
+ vmovdqu ymm10, YMMWORD PTR [rdx+64]
+ vmovdqu ymm11, YMMWORD PTR [rdx+96]
+ vmovdqu ymm12, YMMWORD PTR [rdx+128]
+ vmovdqu ymm13, YMMWORD PTR [rdx+160]
+ vmovdqu ymm14, YMMWORD PTR [rdx+192]
+ vmovdqu ymm15, YMMWORD PTR [rdx+224]
+ vpsubd ymm0, ymm0, ymm8  ; lane-wise 32-bit subtract (wraps on overflow)
+ vpsubd ymm1, ymm1, ymm9
+ vpsubd ymm2, ymm2, ymm10
+ vpsubd ymm3, ymm3, ymm11
+ vpsubd ymm4, ymm4, ymm12
+ vpsubd ymm5, ymm5, ymm13
+ vpsubd ymm6, ymm6, ymm14
+ vpsubd ymm7, ymm7, ymm15
+ vmovdqu YMMWORD PTR [rcx], ymm0  ; write the differences back over the [rcx] operand
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ vmovdqu YMMWORD PTR [rcx+160], ymm5
+ vmovdqu YMMWORD PTR [rcx+192], ymm6
+ vmovdqu YMMWORD PTR [rcx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]  ; chunk 2 of 4: bytes 256-511, same load/subtract/store pattern
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ vmovdqu ymm8, YMMWORD PTR [rdx+256]
+ vmovdqu ymm9, YMMWORD PTR [rdx+288]
+ vmovdqu ymm10, YMMWORD PTR [rdx+320]
+ vmovdqu ymm11, YMMWORD PTR [rdx+352]
+ vmovdqu ymm12, YMMWORD PTR [rdx+384]
+ vmovdqu ymm13, YMMWORD PTR [rdx+416]
+ vmovdqu ymm14, YMMWORD PTR [rdx+448]
+ vmovdqu ymm15, YMMWORD PTR [rdx+480]
+ vpsubd ymm0, ymm0, ymm8
+ vpsubd ymm1, ymm1, ymm9
+ vpsubd ymm2, ymm2, ymm10
+ vpsubd ymm3, ymm3, ymm11
+ vpsubd ymm4, ymm4, ymm12
+ vpsubd ymm5, ymm5, ymm13
+ vpsubd ymm6, ymm6, ymm14
+ vpsubd ymm7, ymm7, ymm15
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vmovdqu YMMWORD PTR [rcx+384], ymm4
+ vmovdqu YMMWORD PTR [rcx+416], ymm5
+ vmovdqu YMMWORD PTR [rcx+448], ymm6
+ vmovdqu YMMWORD PTR [rcx+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]  ; chunk 3 of 4: bytes 512-767
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vmovdqu ymm4, YMMWORD PTR [rcx+640]
+ vmovdqu ymm5, YMMWORD PTR [rcx+672]
+ vmovdqu ymm6, YMMWORD PTR [rcx+704]
+ vmovdqu ymm7, YMMWORD PTR [rcx+736]
+ vmovdqu ymm8, YMMWORD PTR [rdx+512]
+ vmovdqu ymm9, YMMWORD PTR [rdx+544]
+ vmovdqu ymm10, YMMWORD PTR [rdx+576]
+ vmovdqu ymm11, YMMWORD PTR [rdx+608]
+ vmovdqu ymm12, YMMWORD PTR [rdx+640]
+ vmovdqu ymm13, YMMWORD PTR [rdx+672]
+ vmovdqu ymm14, YMMWORD PTR [rdx+704]
+ vmovdqu ymm15, YMMWORD PTR [rdx+736]
+ vpsubd ymm0, ymm0, ymm8
+ vpsubd ymm1, ymm1, ymm9
+ vpsubd ymm2, ymm2, ymm10
+ vpsubd ymm3, ymm3, ymm11
+ vpsubd ymm4, ymm4, ymm12
+ vpsubd ymm5, ymm5, ymm13
+ vpsubd ymm6, ymm6, ymm14
+ vpsubd ymm7, ymm7, ymm15
+ vmovdqu YMMWORD PTR [rcx+512], ymm0
+ vmovdqu YMMWORD PTR [rcx+544], ymm1
+ vmovdqu YMMWORD PTR [rcx+576], ymm2
+ vmovdqu YMMWORD PTR [rcx+608], ymm3
+ vmovdqu YMMWORD PTR [rcx+640], ymm4
+ vmovdqu YMMWORD PTR [rcx+672], ymm5
+ vmovdqu YMMWORD PTR [rcx+704], ymm6
+ vmovdqu YMMWORD PTR [rcx+736], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]  ; chunk 4 of 4: bytes 768-1023
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vmovdqu ymm4, YMMWORD PTR [rcx+896]
+ vmovdqu ymm5, YMMWORD PTR [rcx+928]
+ vmovdqu ymm6, YMMWORD PTR [rcx+960]
+ vmovdqu ymm7, YMMWORD PTR [rcx+992]
+ vmovdqu ymm8, YMMWORD PTR [rdx+768]
+ vmovdqu ymm9, YMMWORD PTR [rdx+800]
+ vmovdqu ymm10, YMMWORD PTR [rdx+832]
+ vmovdqu ymm11, YMMWORD PTR [rdx+864]
+ vmovdqu ymm12, YMMWORD PTR [rdx+896]
+ vmovdqu ymm13, YMMWORD PTR [rdx+928]
+ vmovdqu ymm14, YMMWORD PTR [rdx+960]
+ vmovdqu ymm15, YMMWORD PTR [rdx+992]
+ vpsubd ymm0, ymm0, ymm8
+ vpsubd ymm1, ymm1, ymm9
+ vpsubd ymm2, ymm2, ymm10
+ vpsubd ymm3, ymm3, ymm11
+ vpsubd ymm4, ymm4, ymm12
+ vpsubd ymm5, ymm5, ymm13
+ vpsubd ymm6, ymm6, ymm14
+ vpsubd ymm7, ymm7, ymm15
+ vmovdqu YMMWORD PTR [rcx+768], ymm0
+ vmovdqu YMMWORD PTR [rcx+800], ymm1
+ vmovdqu YMMWORD PTR [rcx+832], ymm2
+ vmovdqu YMMWORD PTR [rcx+864], ymm3
+ vmovdqu YMMWORD PTR [rcx+896], ymm4
+ vmovdqu YMMWORD PTR [rcx+928], ymm5
+ vmovdqu YMMWORD PTR [rcx+960], ymm6
+ vmovdqu YMMWORD PTR [rcx+992], ymm7
+ vmovdqu xmm6, OWORD PTR [rsp]  ; reload the spilled xmm6-xmm15
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160  ; release the spill area
+ ret  ; NOTE(review): returns without vzeroupper although ymm0-ymm5 were written; wc_mldsa_vec_check_low_avx2 issues one before returning - confirm this is intended
+wc_mldsa_poly_sub_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_poly_make_pos_avx2 PROC  ; In-place over 256 dwords (1024 bytes) at [rcx]: every negative dword gets the matching lane of mldsa_q added; non-negative dwords are unchanged
+ sub rsp, 64  ; reserve 4 x 16 bytes of stack for xmm6-xmm9
+ vmovdqu OWORD PTR [rsp], xmm6  ; spill xmm6-xmm9 (callee-saved on Win64); they are reloaded before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vpxor ymm8, ymm8, ymm8  ; ymm8 = all zeros, the comparison threshold
+ vmovdqu ymm9, YMMWORD PTR mldsa_q  ; ymm9 = 32 bytes at mldsa_q (defined outside this view; presumably the ML-DSA modulus in every lane - confirm)
+ vmovdqu ymm0, YMMWORD PTR [rcx]  ; chunk 1 of 8: bytes 0-127 -> ymm0-ymm3
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpcmpgtd ymm4, ymm8, ymm0  ; mask lane = all ones where 0 > value (signed), else zero
+ vpcmpgtd ymm5, ymm8, ymm1
+ vpcmpgtd ymm6, ymm8, ymm2
+ vpcmpgtd ymm7, ymm8, ymm3
+ vpand ymm4, ymm4, ymm9  ; keep the mldsa_q lane only where the value was negative
+ vpand ymm5, ymm5, ymm9
+ vpand ymm6, ymm6, ymm9
+ vpand ymm7, ymm7, ymm9
+ vpaddd ymm0, ymm0, ymm4  ; branch-free conditional add: value += (value < 0) ? q-lane : 0
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx], ymm0  ; write the adjusted values back in place
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rcx+128]  ; chunk 2 of 8: bytes 128-255, same compare/mask/add/store pattern
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpcmpgtd ymm4, ymm8, ymm0
+ vpcmpgtd ymm5, ymm8, ymm1
+ vpcmpgtd ymm6, ymm8, ymm2
+ vpcmpgtd ymm7, ymm8, ymm3
+ vpand ymm4, ymm4, ymm9
+ vpand ymm5, ymm5, ymm9
+ vpand ymm6, ymm6, ymm9
+ vpand ymm7, ymm7, ymm9
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+160], ymm1
+ vmovdqu YMMWORD PTR [rcx+192], ymm2
+ vmovdqu YMMWORD PTR [rcx+224], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]  ; chunk 3 of 8: bytes 256-383
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpcmpgtd ymm4, ymm8, ymm0
+ vpcmpgtd ymm5, ymm8, ymm1
+ vpcmpgtd ymm6, ymm8, ymm2
+ vpcmpgtd ymm7, ymm8, ymm3
+ vpand ymm4, ymm4, ymm9
+ vpand ymm5, ymm5, ymm9
+ vpand ymm6, ymm6, ymm9
+ vpand ymm7, ymm7, ymm9
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rcx+384]  ; chunk 4 of 8: bytes 384-511
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpcmpgtd ymm4, ymm8, ymm0
+ vpcmpgtd ymm5, ymm8, ymm1
+ vpcmpgtd ymm6, ymm8, ymm2
+ vpcmpgtd ymm7, ymm8, ymm3
+ vpand ymm4, ymm4, ymm9
+ vpand ymm5, ymm5, ymm9
+ vpand ymm6, ymm6, ymm9
+ vpand ymm7, ymm7, ymm9
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx+384], ymm0
+ vmovdqu YMMWORD PTR [rcx+416], ymm1
+ vmovdqu YMMWORD PTR [rcx+448], ymm2
+ vmovdqu YMMWORD PTR [rcx+480], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]  ; chunk 5 of 8: bytes 512-639
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vmovdqu ymm2, YMMWORD PTR [rcx+576]
+ vmovdqu ymm3, YMMWORD PTR [rcx+608]
+ vpcmpgtd ymm4, ymm8, ymm0
+ vpcmpgtd ymm5, ymm8, ymm1
+ vpcmpgtd ymm6, ymm8, ymm2
+ vpcmpgtd ymm7, ymm8, ymm3
+ vpand ymm4, ymm4, ymm9
+ vpand ymm5, ymm5, ymm9
+ vpand ymm6, ymm6, ymm9
+ vpand ymm7, ymm7, ymm9
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx+512], ymm0
+ vmovdqu YMMWORD PTR [rcx+544], ymm1
+ vmovdqu YMMWORD PTR [rcx+576], ymm2
+ vmovdqu YMMWORD PTR [rcx+608], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rcx+640]  ; chunk 6 of 8: bytes 640-767
+ vmovdqu ymm1, YMMWORD PTR [rcx+672]
+ vmovdqu ymm2, YMMWORD PTR [rcx+704]
+ vmovdqu ymm3, YMMWORD PTR [rcx+736]
+ vpcmpgtd ymm4, ymm8, ymm0
+ vpcmpgtd ymm5, ymm8, ymm1
+ vpcmpgtd ymm6, ymm8, ymm2
+ vpcmpgtd ymm7, ymm8, ymm3
+ vpand ymm4, ymm4, ymm9
+ vpand ymm5, ymm5, ymm9
+ vpand ymm6, ymm6, ymm9
+ vpand ymm7, ymm7, ymm9
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx+640], ymm0
+ vmovdqu YMMWORD PTR [rcx+672], ymm1
+ vmovdqu YMMWORD PTR [rcx+704], ymm2
+ vmovdqu YMMWORD PTR [rcx+736], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]  ; chunk 7 of 8: bytes 768-895
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vmovdqu ymm2, YMMWORD PTR [rcx+832]
+ vmovdqu ymm3, YMMWORD PTR [rcx+864]
+ vpcmpgtd ymm4, ymm8, ymm0
+ vpcmpgtd ymm5, ymm8, ymm1
+ vpcmpgtd ymm6, ymm8, ymm2
+ vpcmpgtd ymm7, ymm8, ymm3
+ vpand ymm4, ymm4, ymm9
+ vpand ymm5, ymm5, ymm9
+ vpand ymm6, ymm6, ymm9
+ vpand ymm7, ymm7, ymm9
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx+768], ymm0
+ vmovdqu YMMWORD PTR [rcx+800], ymm1
+ vmovdqu YMMWORD PTR [rcx+832], ymm2
+ vmovdqu YMMWORD PTR [rcx+864], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rcx+896]  ; chunk 8 of 8: bytes 896-1023
+ vmovdqu ymm1, YMMWORD PTR [rcx+928]
+ vmovdqu ymm2, YMMWORD PTR [rcx+960]
+ vmovdqu ymm3, YMMWORD PTR [rcx+992]
+ vpcmpgtd ymm4, ymm8, ymm0
+ vpcmpgtd ymm5, ymm8, ymm1
+ vpcmpgtd ymm6, ymm8, ymm2
+ vpcmpgtd ymm7, ymm8, ymm3
+ vpand ymm4, ymm4, ymm9
+ vpand ymm5, ymm5, ymm9
+ vpand ymm6, ymm6, ymm9
+ vpand ymm7, ymm7, ymm9
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx+896], ymm0
+ vmovdqu YMMWORD PTR [rcx+928], ymm1
+ vmovdqu YMMWORD PTR [rcx+960], ymm2
+ vmovdqu YMMWORD PTR [rcx+992], ymm3
+ vmovdqu xmm6, OWORD PTR [rsp]  ; reload the spilled xmm6-xmm9
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ add rsp, 64  ; release the spill area
+ ret  ; NOTE(review): returns without vzeroupper although ymm0-ymm5 were written; wc_mldsa_vec_check_low_avx2 issues one before returning - confirm this is intended
+wc_mldsa_poly_make_pos_avx2 ENDP
+_TEXT ENDS
+ENDIF
+ENDIF
+END
diff --git a/wolfcrypt/src/wc_mlkem_asm.asm b/wolfcrypt/src/wc_mlkem_asm.asm
new file mode 100644
index 00000000000..62743aa846d
--- /dev/null
+++ b/wolfcrypt/src/wc_mlkem_asm.asm
@@ -0,0 +1,15435 @@
+; /* wc_mlkem_asm.asm */
+; /*
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
+
+IF @Version LT 1200  ; @Version is MASM's predefined assembler-version symbol; this branch covers assemblers older than 12.00
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1  ; AVX1 is switched on by default unless the symbol is already defined
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT  ; AVX2 is switched on unless NO_AVX2_SUPPORT was set (by the version check above or externally)
+HAVE_INTEL_AVX2 = 1
+ENDIF
+
+IFNDEF _WIN64  ; define _WIN64 when it is not already defined
+_WIN64 = 1
+ENDIF
+
+IFDEF WOLFSSL_HAVE_MLKEM
+IFDEF HAVE_INTEL_AVX2
+_DATA SEGMENT
+ALIGN 16
+mlkem_q WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h  ; 16 words (32 bytes), each 0d01h = 3329, the ML-KEM modulus q
+ WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h
+ptr_mlkem_q QWORD mlkem_q  ; 64-bit address of mlkem_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+mlkem_qinv WORD 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h  ; 16 words, each 0f301h = 62209 = inverse of 3329 modulo 2^16 (3329 * 62209 = 1 mod 65536)
+ WORD 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h
+ptr_mlkem_qinv QWORD mlkem_qinv  ; 64-bit address of mlkem_qinv
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+mlkem_f WORD 0549h, 0549h, 0549h, 0549h, 0549h, 0549h, 0549h, 0549h  ; 16 words, each 0549h = 1353 = 2^32 mod 3329; presumably a Montgomery scaling factor - confirm at its use sites
+ WORD 0549h, 0549h, 0549h, 0549h, 0549h, 0549h, 0549h, 0549h
+ptr_mlkem_f QWORD mlkem_f  ; 64-bit address of mlkem_f
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+mlkem_f_qinv WORD 5049h, 5049h, 5049h, 5049h, 5049h, 5049h, 5049h, 5049h  ; 16 words, each 5049h = 0549h * 0f301h mod 2^16 (the mlkem_f word times the mlkem_qinv word)
+ WORD 5049h, 5049h, 5049h, 5049h, 5049h, 5049h, 5049h, 5049h
+ptr_mlkem_f_qinv QWORD mlkem_f_qinv  ; 64-bit address of mlkem_f_qinv
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+mlkem_v WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh  ; 16 words, each 4ebfh = 20159 = floor((2^26 + 3329/2) / 3329); presumably the Barrett reduction multiplier - confirm at its use sites
+ WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh
+ptr_mlkem_v QWORD mlkem_v  ; 64-bit address of mlkem_v
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_avx2_zetas WORD 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh  ; Layout (spot-checked): each 16-word group of values is followed by a 16-word group holding the same values times 0f301h mod 2^16, e.g. 0a0bh -> 7b0bh
+ WORD 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh
+ WORD 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh
+ WORD 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh
+ WORD 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah
+ WORD 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah
+ WORD 399ah, 399ah, 399ah, 399ah, 399ah, 399ah, 399ah, 399ah
+ WORD 399ah, 399ah, 399ah, 399ah, 399ah, 399ah, 399ah, 399ah
+ WORD 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h
+ WORD 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h
+ WORD 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h
+ WORD 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h
+ WORD 058eh, 058eh, 058eh, 058eh, 058eh, 058eh, 058eh, 058eh
+ WORD 058eh, 058eh, 058eh, 058eh, 058eh, 058eh, 058eh, 058eh
+ WORD 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh
+ WORD 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh
+ WORD 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h
+ WORD 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h
+ WORD 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h
+ WORD 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h
+ WORD 026eh, 026eh, 026eh, 026eh, 026eh, 026eh, 026eh, 026eh
+ WORD 026eh, 026eh, 026eh, 026eh, 026eh, 026eh, 026eh, 026eh
+ WORD 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh
+ WORD 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh
+ WORD 0629h, 0629h, 0629h, 0629h, 0629h, 0629h, 0629h, 0629h
+ WORD 0629h, 0629h, 0629h, 0629h, 0629h, 0629h, 0629h, 0629h
+ WORD 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h
+ WORD 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h
+ WORD 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h
+ WORD 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h
+ WORD 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h
+ WORD 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h
+ WORD 023dh, 023dh, 023dh, 023dh, 023dh, 023dh, 023dh, 023dh
+ WORD 07d4h, 07d4h, 07d4h, 07d4h, 07d4h, 07d4h, 07d4h, 07d4h
+ WORD 0e93dh, 0e93dh, 0e93dh, 0e93dh, 0e93dh, 0e93dh, 0e93dh, 0e93dh
+ WORD 43d4h, 43d4h, 43d4h, 43d4h, 43d4h, 43d4h, 43d4h, 43d4h
+ WORD 0108h, 0108h, 0108h, 0108h, 0108h, 0108h, 0108h, 0108h
+ WORD 017fh, 017fh, 017fh, 017fh, 017fh, 017fh, 017fh, 017fh
+ WORD 9908h, 9908h, 9908h, 9908h, 9908h, 9908h, 9908h, 9908h
+ WORD 8e7fh, 8e7fh, 8e7fh, 8e7fh, 8e7fh, 8e7fh, 8e7fh, 8e7fh
+ WORD 04c7h, 04c7h, 04c7h, 04c7h, 028ch, 028ch, 028ch, 028ch
+ WORD 0ad9h, 0ad9h, 0ad9h, 0ad9h, 03f7h, 03f7h, 03f7h, 03f7h
+ WORD 0e9c7h, 0e9c7h, 0e9c7h, 0e9c7h, 0e68ch, 0e68ch, 0e68ch, 0e68ch
+ WORD 05d9h, 05d9h, 05d9h, 05d9h, 78f7h, 78f7h, 78f7h, 78f7h
+ WORD 07f4h, 07f4h, 07f4h, 07f4h, 05d3h, 05d3h, 05d3h, 05d3h
+ WORD 0be7h, 0be7h, 0be7h, 0be7h, 06f9h, 06f9h, 06f9h, 06f9h
+ WORD 0a3f4h, 0a3f4h, 0a3f4h, 0a3f4h, 4ed3h, 4ed3h, 4ed3h, 4ed3h
+ WORD 50e7h, 50e7h, 50e7h, 50e7h, 61f9h, 61f9h, 61f9h, 61f9h
+ WORD 09c4h, 09c4h, 09c4h, 09c4h, 09c4h, 09c4h, 09c4h, 09c4h
+ WORD 05b2h, 05b2h, 05b2h, 05b2h, 05b2h, 05b2h, 05b2h, 05b2h
+ WORD 15c4h, 15c4h, 15c4h, 15c4h, 15c4h, 15c4h, 15c4h, 15c4h
+ WORD 0fbb2h, 0fbb2h, 0fbb2h, 0fbb2h, 0fbb2h, 0fbb2h, 0fbb2h, 0fbb2h
+ WORD 06bfh, 06bfh, 06bfh, 06bfh, 06bfh, 06bfh, 06bfh, 06bfh
+ WORD 0c7fh, 0c7fh, 0c7fh, 0c7fh, 0c7fh, 0c7fh, 0c7fh, 0c7fh
+ WORD 53bfh, 53bfh, 53bfh, 53bfh, 53bfh, 53bfh, 53bfh, 53bfh
+ WORD 997fh, 997fh, 997fh, 997fh, 997fh, 997fh, 997fh, 997fh
+ WORD 0204h, 0204h, 0204h, 0204h, 0cf9h, 0cf9h, 0cf9h, 0cf9h
+ WORD 0bc1h, 0bc1h, 0bc1h, 0bc1h, 0a67h, 0a67h, 0a67h, 0a67h
+ WORD 0ce04h, 0ce04h, 0ce04h, 0ce04h, 67f9h, 67f9h, 67f9h, 67f9h
+ WORD 3ec1h, 3ec1h, 3ec1h, 3ec1h, 0cf67h, 0cf67h, 0cf67h, 0cf67h
+ WORD 06afh, 06afh, 06afh, 06afh, 0877h, 0877h, 0877h, 0877h
+ WORD 007eh, 007eh, 007eh, 007eh, 05bdh, 05bdh, 05bdh, 05bdh
+ WORD 23afh, 23afh, 23afh, 23afh, 0fd77h, 0fd77h, 0fd77h, 0fd77h
+ WORD 9a7eh, 9a7eh, 9a7eh, 9a7eh, 6cbdh, 6cbdh, 6cbdh, 6cbdh
+ WORD 08b2h, 08b2h, 01aeh, 01aeh, 022bh, 022bh, 034bh, 034bh
+ WORD 081eh, 081eh, 0367h, 0367h, 060eh, 060eh, 0069h, 0069h
+ WORD 0feb2h, 0feb2h, 2baeh, 2baeh, 0d32bh, 0d32bh, 344bh, 344bh
+ WORD 821eh, 821eh, 0c867h, 0c867h, 500eh, 500eh, 0ab69h, 0ab69h
+ WORD 01a6h, 01a6h, 024bh, 024bh, 00b1h, 00b1h, 0c16h, 0c16h
+ WORD 0bdeh, 0bdeh, 0b35h, 0b35h, 0626h, 0626h, 0675h, 0675h
+ WORD 93a6h, 93a6h, 334bh, 334bh, 03b1h, 03b1h, 0ee16h, 0ee16h
+ WORD 0c5deh, 0c5deh, 5a35h, 5a35h, 1826h, 1826h, 1575h, 1575h
+ WORD 0c0bh, 0c0bh, 030ah, 030ah, 0487h, 0487h, 0c6eh, 0c6eh
+ WORD 09f8h, 09f8h, 05cbh, 05cbh, 0aa7h, 0aa7h, 045fh, 045fh
+ WORD 7d0bh, 7d0bh, 810ah, 810ah, 2987h, 2987h, 766eh, 766eh
+ WORD 71f8h, 71f8h, 0b6cbh, 0b6cbh, 8fa7h, 8fa7h, 315fh, 315fh
+ WORD 06cbh, 06cbh, 0284h, 0284h, 0999h, 0999h, 015dh, 015dh
+ WORD 01a2h, 01a2h, 0149h, 0149h, 0c65h, 0c65h, 0cb6h, 0cb6h
+ WORD 0b7cbh, 0b7cbh, 4e84h, 4e84h, 4499h, 4499h, 485dh, 485dh
+ WORD 0c7a2h, 0c7a2h, 4c49h, 4c49h, 0eb65h, 0eb65h, 0ceb6h, 0ceb6h
+ WORD 0714h, 0714h, 0714h, 0714h, 0714h, 0714h, 0714h, 0714h
+ WORD 0714h, 0714h, 0714h, 0714h, 0714h, 0714h, 0714h, 0714h
+ WORD 0314h, 0314h, 0314h, 0314h, 0314h, 0314h, 0314h, 0314h
+ WORD 0314h, 0314h, 0314h, 0314h, 0314h, 0314h, 0314h, 0314h
+ WORD 011fh, 011fh, 011fh, 011fh, 011fh, 011fh, 011fh, 011fh
+ WORD 011fh, 011fh, 011fh, 011fh, 011fh, 011fh, 011fh, 011fh
+ WORD 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh
+ WORD 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh
+ WORD 00cah, 00cah, 00cah, 00cah, 00cah, 00cah, 00cah, 00cah
+ WORD 00cah, 00cah, 00cah, 00cah, 00cah, 00cah, 00cah, 00cah
+ WORD 0becah, 0becah, 0becah, 0becah, 0becah, 0becah, 0becah, 0becah
+ WORD 0becah, 0becah, 0becah, 0becah, 0becah, 0becah, 0becah, 0becah
+ WORD 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h
+ WORD 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h
+ WORD 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h
+ WORD 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h
+ WORD 084fh, 084fh, 084fh, 084fh, 084fh, 084fh, 084fh, 084fh
+ WORD 084fh, 084fh, 084fh, 084fh, 084fh, 084fh, 084fh, 084fh
+ WORD 054fh, 054fh, 054fh, 054fh, 054fh, 054fh, 054fh, 054fh
+ WORD 054fh, 054fh, 054fh, 054fh, 054fh, 054fh, 054fh, 054fh
+ WORD 073fh, 073fh, 073fh, 073fh, 073fh, 073fh, 073fh, 073fh
+ WORD 073fh, 073fh, 073fh, 073fh, 073fh, 073fh, 073fh, 073fh
+ WORD 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh
+ WORD 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh
+ WORD 05bch, 05bch, 05bch, 05bch, 05bch, 05bch, 05bch, 05bch
+ WORD 05bch, 05bch, 05bch, 05bch, 05bch, 05bch, 05bch, 05bch
+ WORD 79bch, 79bch, 79bch, 79bch, 79bch, 79bch, 79bch, 79bch
+ WORD 79bch, 79bch, 79bch, 79bch, 79bch, 79bch, 79bch, 79bch
+ WORD 0a58h, 0a58h, 0a58h, 0a58h, 0a58h, 0a58h, 0a58h, 0a58h
+ WORD 03f9h, 03f9h, 03f9h, 03f9h, 03f9h, 03f9h, 03f9h, 03f9h
+ WORD 9258h, 9258h, 9258h, 9258h, 9258h, 9258h, 9258h, 9258h
+ WORD 5ef9h, 5ef9h, 5ef9h, 5ef9h, 5ef9h, 5ef9h, 5ef9h, 5ef9h
+ WORD 02dch, 02dch, 02dch, 02dch, 02dch, 02dch, 02dch, 02dch
+ WORD 0260h, 0260h, 0260h, 0260h, 0260h, 0260h, 0260h, 0260h
+ WORD 0d6dch, 0d6dch, 0d6dch, 0d6dch, 0d6dch, 0d6dch, 0d6dch, 0d6dch
+ WORD 2260h, 2260h, 2260h, 2260h, 2260h, 2260h, 2260h, 2260h
+ WORD 09ach, 09ach, 09ach, 09ach, 0ca7h, 0ca7h, 0ca7h, 0ca7h
+ WORD 0bf2h, 0bf2h, 0bf2h, 0bf2h, 033eh, 033eh, 033eh, 033eh
+ WORD 4dach, 4dach, 4dach, 4dach, 91a7h, 91a7h, 91a7h, 91a7h
+ WORD 0c1f2h, 0c1f2h, 0c1f2h, 0c1f2h, 0dd3eh, 0dd3eh, 0dd3eh, 0dd3eh
+ WORD 006bh, 006bh, 006bh, 006bh, 0774h, 0774h, 0774h, 0774h
+ WORD 0c0ah, 0c0ah, 0c0ah, 0c0ah, 094ah, 094ah, 094ah, 094ah
+ WORD 916bh, 916bh, 916bh, 916bh, 2374h, 2374h, 2374h, 2374h
+ WORD 8a0ah, 8a0ah, 8a0ah, 8a0ah, 474ah, 474ah, 474ah, 474ah
+ WORD 06fbh, 06fbh, 06fbh, 06fbh, 06fbh, 06fbh, 06fbh, 06fbh
+ WORD 019bh, 019bh, 019bh, 019bh, 019bh, 019bh, 019bh, 019bh
+ WORD 47fbh, 47fbh, 47fbh, 47fbh, 47fbh, 47fbh, 47fbh, 47fbh
+ WORD 229bh, 229bh, 229bh, 229bh, 229bh, 229bh, 229bh, 229bh
+ WORD 0c34h, 0c34h, 0c34h, 0c34h, 0c34h, 0c34h, 0c34h, 0c34h
+ WORD 06deh, 06deh, 06deh, 06deh, 06deh, 06deh, 06deh, 06deh
+ WORD 6834h, 6834h, 6834h, 6834h, 6834h, 6834h, 6834h, 6834h
+ WORD 0c0deh, 0c0deh, 0c0deh, 0c0deh, 0c0deh, 0c0deh, 0c0deh, 0c0deh
+ WORD 0b73h, 0b73h, 0b73h, 0b73h, 03c1h, 03c1h, 03c1h, 03c1h
+ WORD 071dh, 071dh, 071dh, 071dh, 0a2ch, 0a2ch, 0a2ch, 0a2ch
+ WORD 3473h, 3473h, 3473h, 3473h, 36c1h, 36c1h, 36c1h, 36c1h
+ WORD 8e1dh, 8e1dh, 8e1dh, 8e1dh, 0ce2ch, 0ce2ch, 0ce2ch, 0ce2ch
+ WORD 01c0h, 01c0h, 01c0h, 01c0h, 08d8h, 08d8h, 08d8h, 08d8h
+ WORD 02a5h, 02a5h, 02a5h, 02a5h, 0806h, 0806h, 0806h, 0806h
+ WORD 41c0h, 41c0h, 41c0h, 41c0h, 10d8h, 10d8h, 10d8h, 10d8h
+ WORD 0a1a5h, 0a1a5h, 0a1a5h, 0a1a5h, 0ba06h, 0ba06h, 0ba06h, 0ba06h
+ WORD 0331h, 0331h, 0449h, 0449h, 025bh, 025bh, 0262h, 0262h
+ WORD 052ah, 052ah, 07fch, 07fch, 0748h, 0748h, 0180h, 0180h
+ WORD 8631h, 8631h, 4f49h, 4f49h, 635bh, 635bh, 0862h, 0862h
+ WORD 0e32ah, 0e32ah, 3bfch, 3bfch, 5f48h, 5f48h, 8180h, 8180h
+ WORD 0842h, 0842h, 0c79h, 0c79h, 04c2h, 04c2h, 07cah, 07cah
+ WORD 0997h, 0997h, 00dch, 00dch, 085eh, 085eh, 0686h, 0686h
+ WORD 0ae42h, 0ae42h, 0e779h, 0e779h, 2ac2h, 2ac2h, 0c5cah, 0c5cah
+ WORD 5e97h, 5e97h, 0d4dch, 0d4dch, 425eh, 425eh, 3886h, 3886h
+ WORD 0860h, 0860h, 0707h, 0707h, 0803h, 0803h, 031ah, 031ah
+ WORD 071bh, 071bh, 09abh, 09abh, 099bh, 099bh, 01deh, 01deh
+ WORD 2860h, 2860h, 0ac07h, 0ac07h, 0e103h, 0e103h, 0b11ah, 0b11ah
+ WORD 0a81bh, 0a81bh, 5aabh, 5aabh, 2a9bh, 2a9bh, 0bbdeh, 0bbdeh
+ WORD 0c95h, 0c95h, 0bcdh, 0bcdh, 03e4h, 03e4h, 03dfh, 03dfh
+ WORD 03beh, 03beh, 074dh, 074dh, 05f2h, 05f2h, 065ch, 065ch
+ WORD 7b95h, 7b95h, 0a2cdh, 0a2cdh, 6fe4h, 6fe4h, 0b0dfh, 0b0dfh
+ WORD 5dbeh, 5dbeh, 1e4dh, 1e4dh, 0bbf2h, 0bbf2h, 5a5ch, 5a5ch
+ptr_L_mlkem_avx2_zetas QWORD L_mlkem_avx2_zetas  ; 64-bit address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_avx2_zetas_basemul WORD 08b2h, 081eh, 0f74eh, 0f7e2h, 01aeh, 0367h, 0fe52h, 0fc99h  ; Layout (spot-checked): value pairs followed by their 16-bit negations (0f74eh = -08b2h, 0f7e2h = -081eh); every second 16-word group holds the previous group times 0f301h mod 2^16 (08b2h -> 0feb2h)
+ WORD 022bh, 060eh, 0fdd5h, 0f9f2h, 034bh, 0069h, 0fcb5h, 0ff97h
+ WORD 0feb2h, 821eh, 014eh, 7de2h, 2baeh, 0c867h, 0d452h, 3799h
+ WORD 0d32bh, 500eh, 2cd5h, 0aff2h, 344bh, 0ab69h, 0cbb5h, 5497h
+ WORD 01a6h, 0bdeh, 0fe5ah, 0f422h, 024bh, 0b35h, 0fdb5h, 0f4cbh
+ WORD 00b1h, 0626h, 0ff4fh, 0f9dah, 0c16h, 0675h, 0f3eah, 0f98bh
+ WORD 93a6h, 0c5deh, 6c5ah, 3a22h, 334bh, 5a35h, 0ccb5h, 0a5cbh
+ WORD 03b1h, 1826h, 0fc4fh, 0e7dah, 0ee16h, 1575h, 11eah, 0ea8bh
+ WORD 0c0bh, 09f8h, 0f3f5h, 0f608h, 030ah, 05cbh, 0fcf6h, 0fa35h
+ WORD 0487h, 0aa7h, 0fb79h, 0f559h, 0c6eh, 045fh, 0f392h, 0fba1h
+ WORD 7d0bh, 71f8h, 82f5h, 8e08h, 810ah, 0b6cbh, 7ef6h, 4935h
+ WORD 2987h, 8fa7h, 0d679h, 7059h, 766eh, 315fh, 8992h, 0cea1h
+ WORD 06cbh, 01a2h, 0f935h, 0fe5eh, 0284h, 0149h, 0fd7ch, 0feb7h
+ WORD 0999h, 0c65h, 0f667h, 0f39bh, 015dh, 0cb6h, 0fea3h, 0f34ah
+ WORD 0b7cbh, 0c7a2h, 4835h, 385eh, 4e84h, 4c49h, 0b17ch, 0b3b7h
+ WORD 4499h, 0eb65h, 0bb67h, 149bh, 485dh, 0ceb6h, 0b7a3h, 314ah
+ WORD 0331h, 052ah, 0fccfh, 0fad6h, 0449h, 07fch, 0fbb7h, 0f804h
+ WORD 025bh, 0748h, 0fda5h, 0f8b8h, 0262h, 0180h, 0fd9eh, 0fe80h
+ WORD 8631h, 0e32ah, 79cfh, 1cd6h, 4f49h, 3bfch, 0b0b7h, 0c404h
+ WORD 635bh, 5f48h, 9ca5h, 0a0b8h, 0862h, 8180h, 0f79eh, 7e80h
+ WORD 0842h, 0997h, 0f7beh, 0f669h, 0c79h, 00dch, 0f387h, 0ff24h
+ WORD 04c2h, 085eh, 0fb3eh, 0f7a2h, 07cah, 0686h, 0f836h, 0f97ah
+ WORD 0ae42h, 5e97h, 51beh, 0a169h, 0e779h, 0d4dch, 1887h, 2b24h
+ WORD 2ac2h, 425eh, 0d53eh, 0bda2h, 0c5cah, 3886h, 3a36h, 0c77ah
+ WORD 0860h, 071bh, 0f7a0h, 0f8e5h, 0707h, 09abh, 0f8f9h, 0f655h
+ WORD 0803h, 099bh, 0f7fdh, 0f665h, 031ah, 01deh, 0fce6h, 0fe22h
+ WORD 2860h, 0a81bh, 0d7a0h, 57e5h, 0ac07h, 5aabh, 53f9h, 0a555h
+ WORD 0e103h, 2a9bh, 1efdh, 0d565h, 0b11ah, 0bbdeh, 4ee6h, 4422h
+ WORD 0c95h, 03beh, 0f36bh, 0fc42h, 0bcdh, 074dh, 0f433h, 0f8b3h
+ WORD 03e4h, 05f2h, 0fc1ch, 0fa0eh, 03dfh, 065ch, 0fc21h, 0f9a4h
+ WORD 7b95h, 5dbeh, 846bh, 0a242h, 0a2cdh, 1e4dh, 5d33h, 0e1b3h
+ WORD 6fe4h, 0bbf2h, 901ch, 440eh, 0b0dfh, 5a5ch, 4f21h, 0a5a4h
+ptr_L_mlkem_avx2_zetas_basemul QWORD L_mlkem_avx2_zetas_basemul  ; 64-bit address of the table above
+_DATA ENDS
+_DATA SEGMENT ; holds L_mlkem_avx2_zetas_inv (160 rows x 8 WORDs = 2560 bytes) and a QWORD with its address
+ALIGN 16 ; start the table on a 16-byte boundary
+L_mlkem_avx2_zetas_inv WORD 06a5h, 06a5h, 05b4h, 05b4h, 070fh, 070fh, 0943h, 0943h ; layout: 32-byte groups alternate - 16 constants, then 16 companion words sharing their low bytes
+ WORD 0922h, 0922h, 0134h, 0134h, 091dh, 091dh, 006ch, 006ch ; NOTE(review): presumably inverse-NTT twiddle factors, going by the label - confirm against the generator
+ WORD 0a5a5h, 0a5a5h, 0e1b4h, 0e1b4h, 440fh, 440fh, 0a243h, 0a243h ; NOTE(review): companions look like constant * q^-1 mod 2^16 for q = 3329 (Montgomery multiply helper) - confirm
+ WORD 4f22h, 4f22h, 5d34h, 5d34h, 901dh, 901dh, 846ch, 846ch
+ WORD 0b23h, 0b23h, 0356h, 0356h, 0366h, 0366h, 05e6h, 05e6h
+ WORD 09e7h, 09e7h, 05fah, 05fah, 04feh, 04feh, 04a1h, 04a1h
+ WORD 4423h, 4423h, 0a556h, 0a556h, 0d566h, 0d566h, 57e6h, 57e6h
+ WORD 4ee7h, 4ee7h, 53fah, 53fah, 1efeh, 1efeh, 0d7a1h, 0d7a1h
+ WORD 04fbh, 04fbh, 04fbh, 04fbh, 0a5ch, 0a5ch, 0a5ch, 0a5ch
+ WORD 0429h, 0429h, 0429h, 0429h, 0b41h, 0b41h, 0b41h, 0b41h
+ WORD 45fbh, 45fbh, 45fbh, 45fbh, 5e5ch, 5e5ch, 5e5ch, 5e5ch
+ WORD 0ef29h, 0ef29h, 0ef29h, 0ef29h, 0be41h, 0be41h, 0be41h, 0be41h
+ WORD 02d5h, 02d5h, 02d5h, 02d5h, 05e4h, 05e4h, 05e4h, 05e4h
+ WORD 0940h, 0940h, 0940h, 0940h, 018eh, 018eh, 018eh, 018eh
+ WORD 31d5h, 31d5h, 31d5h, 31d5h, 71e4h, 71e4h, 71e4h, 71e4h
+ WORD 0c940h, 0c940h, 0c940h, 0c940h, 0cb8eh, 0cb8eh, 0cb8eh, 0cb8eh
+ WORD 0623h, 0623h, 0623h, 0623h, 0623h, 0623h, 0623h, 0623h
+ WORD 00cdh, 00cdh, 00cdh, 00cdh, 00cdh, 00cdh, 00cdh, 00cdh
+ WORD 3f23h, 3f23h, 3f23h, 3f23h, 3f23h, 3f23h, 3f23h, 3f23h
+ WORD 97cdh, 97cdh, 97cdh, 97cdh, 97cdh, 97cdh, 97cdh, 97cdh
+ WORD 0b66h, 0b66h, 0b66h, 0b66h, 0b66h, 0b66h, 0b66h, 0b66h
+ WORD 0606h, 0606h, 0606h, 0606h, 0606h, 0606h, 0606h, 0606h
+ WORD 0dd66h, 0dd66h, 0dd66h, 0dd66h, 0dd66h, 0dd66h, 0dd66h, 0dd66h
+ WORD 0b806h, 0b806h, 0b806h, 0b806h, 0b806h, 0b806h, 0b806h, 0b806h
+ WORD 0745h, 0745h, 0745h, 0745h, 0745h, 0745h, 0745h, 0745h
+ WORD 0745h, 0745h, 0745h, 0745h, 0745h, 0745h, 0745h, 0745h
+ WORD 8645h, 8645h, 8645h, 8645h, 8645h, 8645h, 8645h, 8645h
+ WORD 8645h, 8645h, 8645h, 8645h, 8645h, 8645h, 8645h, 8645h
+ WORD 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h
+ WORD 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h
+ WORD 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h
+ WORD 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h
+ WORD 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h
+ WORD 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h
+ WORD 4137h, 4137h, 4137h, 4137h, 4137h, 4137h, 4137h, 4137h
+ WORD 4137h, 4137h, 4137h, 4137h, 4137h, 4137h, 4137h, 4137h
+ WORD 067bh, 067bh, 0c25h, 0c25h, 04a3h, 04a3h, 036ah, 036ah
+ WORD 0537h, 0537h, 0088h, 0088h, 083fh, 083fh, 04bfh, 04bfh
+ WORD 0c77bh, 0c77bh, 2b25h, 2b25h, 0bda3h, 0bda3h, 0a16ah, 0a16ah
+ WORD 3a37h, 3a37h, 1888h, 1888h, 0d53fh, 0d53fh, 51bfh, 51bfh
+ WORD 0b81h, 0b81h, 0505h, 0505h, 05b9h, 05b9h, 07d7h, 07d7h
+ WORD 0a9fh, 0a9fh, 08b8h, 08b8h, 0aa6h, 0aa6h, 09d0h, 09d0h
+ WORD 7e81h, 7e81h, 0c405h, 0c405h, 0a0b9h, 0a0b9h, 1cd7h, 1cd7h
+ WORD 0f79fh, 0f79fh, 0b0b8h, 0b0b8h, 9ca6h, 9ca6h, 79d0h, 79d0h
+ WORD 03b7h, 03b7h, 03b7h, 03b7h, 00f7h, 00f7h, 00f7h, 00f7h
+ WORD 058dh, 058dh, 058dh, 058dh, 0c96h, 0c96h, 0c96h, 0c96h
+ WORD 0b8b7h, 0b8b7h, 0b8b7h, 0b8b7h, 75f7h, 75f7h, 75f7h, 75f7h
+ WORD 0dc8dh, 0dc8dh, 0dc8dh, 0dc8dh, 6e96h, 6e96h, 6e96h, 6e96h
+ WORD 09c3h, 09c3h, 09c3h, 09c3h, 010fh, 010fh, 010fh, 010fh
+ WORD 005ah, 005ah, 005ah, 005ah, 0355h, 0355h, 0355h, 0355h
+ WORD 22c3h, 22c3h, 22c3h, 22c3h, 3e0fh, 3e0fh, 3e0fh, 3e0fh
+ WORD 6e5ah, 6e5ah, 6e5ah, 6e5ah, 0b255h, 0b255h, 0b255h, 0b255h
+ WORD 0aa1h, 0aa1h, 0aa1h, 0aa1h, 0aa1h, 0aa1h, 0aa1h, 0aa1h
+ WORD 0a25h, 0a25h, 0a25h, 0a25h, 0a25h, 0a25h, 0a25h, 0a25h
+ WORD 0dda1h, 0dda1h, 0dda1h, 0dda1h, 0dda1h, 0dda1h, 0dda1h, 0dda1h
+ WORD 2925h, 2925h, 2925h, 2925h, 2925h, 2925h, 2925h, 2925h
+ WORD 0908h, 0908h, 0908h, 0908h, 0908h, 0908h, 0908h, 0908h
+ WORD 02a9h, 02a9h, 02a9h, 02a9h, 02a9h, 02a9h, 02a9h, 02a9h
+ WORD 0a108h, 0a108h, 0a108h, 0a108h, 0a108h, 0a108h, 0a108h, 0a108h
+ WORD 6da9h, 6da9h, 6da9h, 6da9h, 6da9h, 6da9h, 6da9h, 6da9h
+ WORD 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h
+ WORD 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h
+ WORD 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h
+ WORD 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h
+ WORD 093fh, 093fh, 093fh, 093fh, 093fh, 093fh, 093fh, 093fh
+ WORD 093fh, 093fh, 093fh, 093fh, 093fh, 093fh, 093fh, 093fh
+ WORD 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh
+ WORD 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh
+ WORD 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h
+ WORD 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h
+ WORD 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h
+ WORD 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h
+ WORD 05edh, 05edh, 05edh, 05edh, 05edh, 05edh, 05edh, 05edh
+ WORD 05edh, 05edh, 05edh, 05edh, 05edh, 05edh, 05edh, 05edh
+ WORD 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh
+ WORD 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh
+ WORD 004bh, 004bh, 0bb8h, 0bb8h, 009ch, 009ch, 0b5fh, 0b5fh
+ WORD 0ba4h, 0ba4h, 0a7dh, 0a7dh, 0368h, 0368h, 0636h, 0636h
+ WORD 314bh, 314bh, 0b3b8h, 0b3b8h, 149ch, 149ch, 385fh, 385fh
+ WORD 0b7a4h, 0b7a4h, 0b17dh, 0b17dh, 0bb68h, 0bb68h, 4836h, 4836h
+ WORD 08a2h, 08a2h, 0736h, 0736h, 025ah, 025ah, 0309h, 0309h
+ WORD 0093h, 0093h, 09f7h, 09f7h, 087ah, 087ah, 00f6h, 00f6h
+ WORD 0cea2h, 0cea2h, 4936h, 4936h, 705ah, 705ah, 8e09h, 8e09h
+ WORD 8993h, 8993h, 7ef7h, 7ef7h, 0d67ah, 0d67ah, 82f6h, 82f6h
+ WORD 0744h, 0744h, 0744h, 0744h, 0c83h, 0c83h, 0c83h, 0c83h
+ WORD 048ah, 048ah, 048ah, 048ah, 0652h, 0652h, 0652h, 0652h
+ WORD 9344h, 9344h, 9344h, 9344h, 6583h, 6583h, 6583h, 6583h
+ WORD 028ah, 028ah, 028ah, 028ah, 0dc52h, 0dc52h, 0dc52h, 0dc52h
+ WORD 029ah, 029ah, 029ah, 029ah, 0140h, 0140h, 0140h, 0140h
+ WORD 0008h, 0008h, 0008h, 0008h, 0afdh, 0afdh, 0afdh, 0afdh
+ WORD 309ah, 309ah, 309ah, 309ah, 0c140h, 0c140h, 0c140h, 0c140h
+ WORD 9808h, 9808h, 9808h, 9808h, 31fdh, 31fdh, 31fdh, 31fdh
+ WORD 0082h, 0082h, 0082h, 0082h, 0082h, 0082h, 0082h, 0082h
+ WORD 0642h, 0642h, 0642h, 0642h, 0642h, 0642h, 0642h, 0642h
+ WORD 6682h, 6682h, 6682h, 6682h, 6682h, 6682h, 6682h, 6682h
+ WORD 0ac42h, 0ac42h, 0ac42h, 0ac42h, 0ac42h, 0ac42h, 0ac42h, 0ac42h
+ WORD 074fh, 074fh, 074fh, 074fh, 074fh, 074fh, 074fh, 074fh
+ WORD 033dh, 033dh, 033dh, 033dh, 033dh, 033dh, 033dh, 033dh
+ WORD 044fh, 044fh, 044fh, 044fh, 044fh, 044fh, 044fh, 044fh
+ WORD 0ea3dh, 0ea3dh, 0ea3dh, 0ea3dh, 0ea3dh, 0ea3dh, 0ea3dh, 0ea3dh
+ WORD 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh
+ WORD 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh
+ WORD 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh
+ WORD 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh
+ WORD 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h
+ WORD 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h
+ WORD 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h
+ WORD 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h
+ WORD 0773h, 0773h, 0773h, 0773h, 0773h, 0773h, 0773h, 0773h
+ WORD 0773h, 0773h, 0773h, 0773h, 0773h, 0773h, 0773h, 0773h
+ WORD 3073h, 3073h, 3073h, 3073h, 3073h, 3073h, 3073h, 3073h
+ WORD 3073h, 3073h, 3073h, 3073h, 3073h, 3073h, 3073h, 3073h
+ WORD 068ch, 068ch, 01cch, 01cch, 06dbh, 06dbh, 0123h, 0123h
+ WORD 00ebh, 00ebh, 0ab6h, 0ab6h, 0c50h, 0c50h, 0b5bh, 0b5bh
+ WORD 0ea8ch, 0ea8ch, 0a5cch, 0a5cch, 0e7dbh, 0e7dbh, 3a23h, 3a23h
+ WORD 11ebh, 11ebh, 0ccb6h, 0ccb6h, 0fc50h, 0fc50h, 6c5bh, 6c5bh
+ WORD 0c98h, 0c98h, 099ah, 099ah, 06f3h, 06f3h, 04e3h, 04e3h
+ WORD 09b6h, 09b6h, 0b53h, 0b53h, 0ad6h, 0ad6h, 044fh, 044fh
+ WORD 5498h, 5498h, 379ah, 379ah, 0aff3h, 0aff3h, 7de3h, 7de3h
+ WORD 0cbb6h, 0cbb6h, 0d453h, 0d453h, 2cd6h, 2cd6h, 014fh, 014fh
+ WORD 0608h, 0608h, 0608h, 0608h, 011ah, 011ah, 011ah, 011ah
+ WORD 072eh, 072eh, 072eh, 072eh, 050dh, 050dh, 050dh, 050dh
+ WORD 9e08h, 9e08h, 9e08h, 9e08h, 0af1ah, 0af1ah, 0af1ah, 0af1ah
+ WORD 0b12eh, 0b12eh, 0b12eh, 0b12eh, 5c0dh, 5c0dh, 5c0dh, 5c0dh
+ WORD 090ah, 090ah, 090ah, 090ah, 0228h, 0228h, 0228h, 0228h
+ WORD 0a75h, 0a75h, 0a75h, 0a75h, 083ah, 083ah, 083ah, 083ah
+ WORD 870ah, 870ah, 870ah, 870ah, 0fa28h, 0fa28h, 0fa28h, 0fa28h
+ WORD 1975h, 1975h, 1975h, 1975h, 163ah, 163ah, 163ah, 163ah
+ WORD 0b82h, 0b82h, 0b82h, 0b82h, 0b82h, 0b82h, 0b82h, 0b82h
+ WORD 0bf9h, 0bf9h, 0bf9h, 0bf9h, 0bf9h, 0bf9h, 0bf9h, 0bf9h
+ WORD 7182h, 7182h, 7182h, 7182h, 7182h, 7182h, 7182h, 7182h
+ WORD 66f9h, 66f9h, 66f9h, 66f9h, 66f9h, 66f9h, 66f9h, 66f9h
+ WORD 052dh, 052dh, 052dh, 052dh, 052dh, 052dh, 052dh, 052dh
+ WORD 0ac4h, 0ac4h, 0ac4h, 0ac4h, 0ac4h, 0ac4h, 0ac4h, 0ac4h
+ WORD 0bc2dh, 0bc2dh, 0bc2dh, 0bc2dh, 0bc2dh, 0bc2dh, 0bc2dh, 0bc2dh
+ WORD 16c4h, 16c4h, 16c4h, 16c4h, 16c4h, 16c4h, 16c4h, 16c4h
+ WORD 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h
+ WORD 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h
+ WORD 9393h, 9393h, 9393h, 9393h, 9393h, 9393h, 9393h, 9393h
+ WORD 9393h, 9393h, 9393h, 9393h, 9393h, 9393h, 9393h, 9393h
+ WORD 00abh, 00abh, 00abh, 00abh, 00abh, 00abh, 00abh, 00abh
+ WORD 00abh, 00abh, 00abh, 00abh, 00abh, 00abh, 00abh, 00abh
+ WORD 51abh, 51abh, 51abh, 51abh, 51abh, 51abh, 51abh, 51abh
+ WORD 51abh, 51abh, 51abh, 51abh, 51abh, 51abh, 51abh, 51abh
+ WORD 072ch, 072ch, 072ch, 072ch, 072ch, 072ch, 072ch, 072ch
+ WORD 072ch, 072ch, 072ch, 072ch, 072ch, 072ch, 072ch, 072ch
+ WORD 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch
+ WORD 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch
+ WORD 0167h, 0167h, 0167h, 0167h, 0167h, 0167h, 0167h, 0167h
+ WORD 0167h, 0167h, 0167h, 0167h, 0167h, 0167h, 0167h, 0167h
+ WORD 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h
+ WORD 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h
+ WORD 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h
+ WORD 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h
+ WORD 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h
+ WORD 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h
+ WORD 05a1h, 05a1h, 05a1h, 05a1h, 05a1h, 05a1h, 05a1h, 05a1h
+ WORD 05a1h, 05a1h, 05a1h, 05a1h, 05a1h, 05a1h, 05a1h, 05a1h
+ WORD 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h
+ WORD 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h
+ptr_L_mlkem_avx2_zetas_inv QWORD L_mlkem_avx2_zetas_inv ; QWORD initialized with the address of the table above, so code can load the table base from memory
+_DATA ENDS ; end of the zetas_inv data segment
+_TEXT SEGMENT READONLY PARA
+mlkem_keygen_avx2 PROC
+ push r12
+ push r13
+ push r14
+ mov rax, QWORD PTR [rsp+64]
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vmovdqu ymm14, YMMWORD PTR mlkem_q
+ vmovdqu ymm15, YMMWORD PTR mlkem_v
+ mov r13, rcx
+ movsxd r11, eax
+ mov r12, rcx
+L_mlkem_keygen_avx2_priv:
+ ; ntt
+ mov r14, QWORD PTR [ptr_L_mlkem_avx2_zetas]
+ vmovdqu ymm10, YMMWORD PTR [r14]
+ vmovdqu ymm12, YMMWORD PTR [r14+32]
+ vmovdqu ymm0, YMMWORD PTR [r12+128]
+ vmovdqu ymm1, YMMWORD PTR [r12+160]
+ vmovdqu ymm2, YMMWORD PTR [r12+192]
+ vmovdqu ymm3, YMMWORD PTR [r12+224]
+ vmovdqu ymm4, YMMWORD PTR [r12+384]
+ vmovdqu ymm5, YMMWORD PTR [r12+416]
+ vmovdqu ymm6, YMMWORD PTR [r12+448]
+ vmovdqu ymm7, YMMWORD PTR [r12+480]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ vmovdqu YMMWORD PTR [r12+128], ymm0
+ vmovdqu YMMWORD PTR [r12+160], ymm1
+ vmovdqu YMMWORD PTR [r12+192], ymm2
+ vmovdqu YMMWORD PTR [r12+224], ymm3
+ vmovdqu YMMWORD PTR [r12+384], ymm4
+ vmovdqu YMMWORD PTR [r12+416], ymm5
+ vmovdqu YMMWORD PTR [r12+448], ymm6
+ vmovdqu YMMWORD PTR [r12+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [r12]
+ vmovdqu ymm1, YMMWORD PTR [r12+32]
+ vmovdqu ymm2, YMMWORD PTR [r12+64]
+ vmovdqu ymm3, YMMWORD PTR [r12+96]
+ vmovdqu ymm4, YMMWORD PTR [r12+256]
+ vmovdqu ymm5, YMMWORD PTR [r12+288]
+ vmovdqu ymm6, YMMWORD PTR [r12+320]
+ vmovdqu ymm7, YMMWORD PTR [r12+352]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ vmovdqu YMMWORD PTR [r12+256], ymm4
+ vmovdqu YMMWORD PTR [r12+288], ymm5
+ vmovdqu YMMWORD PTR [r12+320], ymm6
+ vmovdqu YMMWORD PTR [r12+352], ymm7
+ vmovdqu ymm4, YMMWORD PTR [r12+128]
+ vmovdqu ymm5, YMMWORD PTR [r12+160]
+ vmovdqu ymm6, YMMWORD PTR [r12+192]
+ vmovdqu ymm7, YMMWORD PTR [r12+224]
+ ; 64: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+64]
+ vmovdqu ymm12, YMMWORD PTR [r14+96]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ ; 32: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+128]
+ vmovdqu ymm12, YMMWORD PTR [r14+160]
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm2, ymm0, ymm8
+ vpsubw ymm3, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ ; 32: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+192]
+ vmovdqu ymm12, YMMWORD PTR [r14+224]
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm4, ymm8
+ vpsubw ymm7, ymm5, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ ; 16: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+256]
+ vmovdqu ymm12, YMMWORD PTR [r14+288]
+ vmovdqu ymm11, YMMWORD PTR [r14+320]
+ vmovdqu ymm13, YMMWORD PTR [r14+352]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 16: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+384]
+ vmovdqu ymm12, YMMWORD PTR [r14+416]
+ vmovdqu ymm11, YMMWORD PTR [r14+448]
+ vmovdqu ymm13, YMMWORD PTR [r14+480]
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 8: 0/3
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [r14+512]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [r14+544]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [r14+576]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [r14+608]
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm0, ymm1, ymm0
+ vpsubw ymm2, ymm3, ymm2
+ vpsubw ymm1, ymm8, ymm0
+ vpsubw ymm3, ymm9, ymm2
+ vpaddw ymm8, ymm8, ymm0
+ vpaddw ymm9, ymm9, ymm2
+ ; 4: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+640]
+ vmovdqu ymm12, YMMWORD PTR [r14+672]
+ vmovdqu ymm11, YMMWORD PTR [r14+704]
+ vmovdqu ymm13, YMMWORD PTR [r14+736]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 8: 0/3
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [r14+768]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [r14+800]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [r14+832]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [r14+864]
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm4, ymm5, ymm4
+ vpsubw ymm6, ymm7, ymm6
+ vpsubw ymm5, ymm8, ymm4
+ vpsubw ymm7, ymm9, ymm6
+ vpaddw ymm8, ymm8, ymm4
+ vpaddw ymm9, ymm9, ymm6
+ ; 4: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+896]
+ vmovdqu ymm12, YMMWORD PTR [r14+928]
+ vmovdqu ymm11, YMMWORD PTR [r14+960]
+ vmovdqu ymm13, YMMWORD PTR [r14+992]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 2: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1024]
+ vmovdqu ymm12, YMMWORD PTR [r14+1056]
+ vmovdqu ymm11, YMMWORD PTR [r14+1088]
+ vmovdqu ymm13, YMMWORD PTR [r14+1120]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 2: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1152]
+ vmovdqu ymm12, YMMWORD PTR [r14+1184]
+ vmovdqu ymm11, YMMWORD PTR [r14+1216]
+ vmovdqu ymm13, YMMWORD PTR [r14+1248]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vpmulhw ymm8, ymm0, ymm15
+ vpmulhw ymm9, ymm1, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm0, ymm8
+ vpsubw ymm9, ymm1, ymm9
+ vmovdqu YMMWORD PTR [r12], ymm8
+ vmovdqu YMMWORD PTR [r12+32], ymm9
+ vpmulhw ymm8, ymm2, ymm15
+ vpmulhw ymm9, ymm3, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vmovdqu YMMWORD PTR [r12+64], ymm8
+ vmovdqu YMMWORD PTR [r12+96], ymm9
+ vpmulhw ymm8, ymm4, ymm15
+ vpmulhw ymm9, ymm5, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vmovdqu YMMWORD PTR [r12+128], ymm8
+ vmovdqu YMMWORD PTR [r12+160], ymm9
+ vpmulhw ymm8, ymm6, ymm15
+ vpmulhw ymm9, ymm7, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vmovdqu YMMWORD PTR [r12+192], ymm8
+ vmovdqu YMMWORD PTR [r12+224], ymm9
+ vmovdqu ymm0, YMMWORD PTR [r12+256]
+ vmovdqu ymm1, YMMWORD PTR [r12+288]
+ vmovdqu ymm2, YMMWORD PTR [r12+320]
+ vmovdqu ymm3, YMMWORD PTR [r12+352]
+ vmovdqu ymm4, YMMWORD PTR [r12+384]
+ vmovdqu ymm5, YMMWORD PTR [r12+416]
+ vmovdqu ymm6, YMMWORD PTR [r12+448]
+ vmovdqu ymm7, YMMWORD PTR [r12+480]
+ ; 64: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1280]
+ vmovdqu ymm12, YMMWORD PTR [r14+1312]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ ; 32: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1344]
+ vmovdqu ymm12, YMMWORD PTR [r14+1376]
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm2, ymm0, ymm8
+ vpsubw ymm3, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ ; 32: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1408]
+ vmovdqu ymm12, YMMWORD PTR [r14+1440]
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm4, ymm8
+ vpsubw ymm7, ymm5, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ ; 16: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1472]
+ vmovdqu ymm12, YMMWORD PTR [r14+1504]
+ vmovdqu ymm11, YMMWORD PTR [r14+1536]
+ vmovdqu ymm13, YMMWORD PTR [r14+1568]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 16: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1600]
+ vmovdqu ymm12, YMMWORD PTR [r14+1632]
+ vmovdqu ymm11, YMMWORD PTR [r14+1664]
+ vmovdqu ymm13, YMMWORD PTR [r14+1696]
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 8: 1/3
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [r14+1728]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [r14+1760]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [r14+1792]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [r14+1824]
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm0, ymm1, ymm0
+ vpsubw ymm2, ymm3, ymm2
+ vpsubw ymm1, ymm8, ymm0
+ vpsubw ymm3, ymm9, ymm2
+ vpaddw ymm8, ymm8, ymm0
+ vpaddw ymm9, ymm9, ymm2
+ ; 4: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1856]
+ vmovdqu ymm12, YMMWORD PTR [r14+1888]
+ vmovdqu ymm11, YMMWORD PTR [r14+1920]
+ vmovdqu ymm13, YMMWORD PTR [r14+1952]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 8: 1/3
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [r14+1984]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [r14+2016]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [r14+2048]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [r14+2080]
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm4, ymm5, ymm4
+ vpsubw ymm6, ymm7, ymm6
+ vpsubw ymm5, ymm8, ymm4
+ vpsubw ymm7, ymm9, ymm6
+ vpaddw ymm8, ymm8, ymm4
+ vpaddw ymm9, ymm9, ymm6
+ ; 4: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+2112]
+ vmovdqu ymm12, YMMWORD PTR [r14+2144]
+ vmovdqu ymm11, YMMWORD PTR [r14+2176]
+ vmovdqu ymm13, YMMWORD PTR [r14+2208]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 2: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+2240]
+ vmovdqu ymm12, YMMWORD PTR [r14+2272]
+ vmovdqu ymm11, YMMWORD PTR [r14+2304]
+ vmovdqu ymm13, YMMWORD PTR [r14+2336]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 2: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+2368]
+ vmovdqu ymm12, YMMWORD PTR [r14+2400]
+ vmovdqu ymm11, YMMWORD PTR [r14+2432]
+ vmovdqu ymm13, YMMWORD PTR [r14+2464]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vpmulhw ymm8, ymm0, ymm15
+ vpmulhw ymm9, ymm1, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm0, ymm8
+ vpsubw ymm9, ymm1, ymm9
+ vmovdqu YMMWORD PTR [r12+256], ymm8
+ vmovdqu YMMWORD PTR [r12+288], ymm9
+ vpmulhw ymm8, ymm2, ymm15
+ vpmulhw ymm9, ymm3, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vmovdqu YMMWORD PTR [r12+320], ymm8
+ vmovdqu YMMWORD PTR [r12+352], ymm9
+ vpmulhw ymm8, ymm4, ymm15
+ vpmulhw ymm9, ymm5, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vmovdqu YMMWORD PTR [r12+384], ymm8
+ vmovdqu YMMWORD PTR [r12+416], ymm9
+ vpmulhw ymm8, ymm6, ymm15
+ vpmulhw ymm9, ymm7, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vmovdqu YMMWORD PTR [r12+448], ymm8
+ vmovdqu YMMWORD PTR [r12+480], ymm9
+ add r12, 512
+ sub r11, 1
+ jg L_mlkem_keygen_avx2_priv
+ vmovdqu ymm13, YMMWORD PTR mlkem_qinv
+ movsxd r10, eax
+ mov r12, rdx
+L_mlkem_keygen_avx2_acc:
+ ; Pointwise acc mont
+ movsxd r11, eax
+ ; Base mul mont
+ mov r14, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [r9]
+ vmovdqu ymm3, YMMWORD PTR [r9+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx]
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14]
+ vmovdqu ymm11, YMMWORD PTR [r14+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r12], ymm0
+ vmovdqu YMMWORD PTR [r12+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+64]
+ vmovdqu ymm3, YMMWORD PTR [r9+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+64]
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+64]
+ vmovdqu ymm11, YMMWORD PTR [r14+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r12+64], ymm0
+ vmovdqu YMMWORD PTR [r12+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+128]
+ vmovdqu ymm3, YMMWORD PTR [r9+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+128]
+ vmovdqu ymm11, YMMWORD PTR [r14+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r12+128], ymm0
+ vmovdqu YMMWORD PTR [r12+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+192]
+ vmovdqu ymm3, YMMWORD PTR [r9+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+192]
+ vmovdqu ymm5, YMMWORD PTR [rcx+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+192]
+ vmovdqu ymm11, YMMWORD PTR [r14+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r12+192], ymm0
+ vmovdqu YMMWORD PTR [r12+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+256]
+ vmovdqu ymm3, YMMWORD PTR [r9+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+256]
+ vmovdqu ymm5, YMMWORD PTR [rcx+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+256]
+ vmovdqu ymm11, YMMWORD PTR [r14+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r12+256], ymm0
+ vmovdqu YMMWORD PTR [r12+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+320]
+ vmovdqu ymm3, YMMWORD PTR [r9+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+320]
+ vmovdqu ymm5, YMMWORD PTR [rcx+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+320]
+ vmovdqu ymm11, YMMWORD PTR [r14+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r12+320], ymm0
+ vmovdqu YMMWORD PTR [r12+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+384]
+ vmovdqu ymm3, YMMWORD PTR [r9+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+384]
+ vmovdqu ymm11, YMMWORD PTR [r14+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r12+384], ymm0
+ vmovdqu YMMWORD PTR [r12+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+448]
+ vmovdqu ymm3, YMMWORD PTR [r9+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+448]
+ vmovdqu ymm5, YMMWORD PTR [rcx+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+448]
+ vmovdqu ymm11, YMMWORD PTR [r14+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r12+448], ymm0
+ vmovdqu YMMWORD PTR [r12+480], ymm1
+ ; The first product has been stored to [r12]. Step both input pointers
+ ; past the 512 bytes just consumed.
+ add r9, 512
+ add rcx, 512
+ ; Subtract 2 from the inner counter: one for the product just stored
+ ; and one for the final product handled at
+ ; L_pointwise_acc_mont_end_keygen. If nothing remains, skip the middle
+ ; multiply-accumulate loop.
+ sub r11, 2
+ jz L_pointwise_acc_mont_end_keygen
+L_pointwise_acc_mont_start_keygen:
+ ; Base mul mont add
+ mov r14, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [r9]
+ vmovdqu ymm3, YMMWORD PTR [r9+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx]
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14]
+ vmovdqu ymm11, YMMWORD PTR [r14+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12]
+ vmovdqu ymm7, YMMWORD PTR [r12+32]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r12], ymm0
+ vmovdqu YMMWORD PTR [r12+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+64]
+ vmovdqu ymm3, YMMWORD PTR [r9+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+64]
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+64]
+ vmovdqu ymm11, YMMWORD PTR [r14+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+64]
+ vmovdqu ymm7, YMMWORD PTR [r12+96]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r12+64], ymm0
+ vmovdqu YMMWORD PTR [r12+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+128]
+ vmovdqu ymm3, YMMWORD PTR [r9+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+128]
+ vmovdqu ymm11, YMMWORD PTR [r14+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+128]
+ vmovdqu ymm7, YMMWORD PTR [r12+160]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r12+128], ymm0
+ vmovdqu YMMWORD PTR [r12+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+192]
+ vmovdqu ymm3, YMMWORD PTR [r9+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+192]
+ vmovdqu ymm5, YMMWORD PTR [rcx+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+192]
+ vmovdqu ymm11, YMMWORD PTR [r14+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+192]
+ vmovdqu ymm7, YMMWORD PTR [r12+224]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r12+192], ymm0
+ vmovdqu YMMWORD PTR [r12+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+256]
+ vmovdqu ymm3, YMMWORD PTR [r9+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+256]
+ vmovdqu ymm5, YMMWORD PTR [rcx+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+256]
+ vmovdqu ymm11, YMMWORD PTR [r14+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+256]
+ vmovdqu ymm7, YMMWORD PTR [r12+288]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r12+256], ymm0
+ vmovdqu YMMWORD PTR [r12+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+320]
+ vmovdqu ymm3, YMMWORD PTR [r9+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+320]
+ vmovdqu ymm5, YMMWORD PTR [rcx+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+320]
+ vmovdqu ymm11, YMMWORD PTR [r14+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+320]
+ vmovdqu ymm7, YMMWORD PTR [r12+352]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r12+320], ymm0
+ vmovdqu YMMWORD PTR [r12+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+384]
+ vmovdqu ymm3, YMMWORD PTR [r9+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+384]
+ vmovdqu ymm11, YMMWORD PTR [r14+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+384]
+ vmovdqu ymm7, YMMWORD PTR [r12+416]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r12+384], ymm0
+ vmovdqu YMMWORD PTR [r12+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+448]
+ vmovdqu ymm3, YMMWORD PTR [r9+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+448]
+ vmovdqu ymm5, YMMWORD PTR [rcx+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+448]
+ vmovdqu ymm11, YMMWORD PTR [r14+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+448]
+ vmovdqu ymm7, YMMWORD PTR [r12+480]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r12+448], ymm0
+ vmovdqu YMMWORD PTR [r12+480], ymm1
+ ; End of one middle multiply-accumulate pass: step both input pointers
+ ; past the 512 bytes just consumed.
+ add r9, 512
+ add rcx, 512
+ ; Repeat while middle products remain; otherwise fall through to the
+ ; final product below.
+ sub r11, 1
+ jg L_pointwise_acc_mont_start_keygen
+L_pointwise_acc_mont_end_keygen:
+ ; Base mul mont add
+ mov r14, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [r9]
+ vmovdqu ymm3, YMMWORD PTR [r9+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx]
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14]
+ vmovdqu ymm11, YMMWORD PTR [r14+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ ; Add the running sum already stored at [r12] to this last product.
+ vmovdqu ymm6, YMMWORD PTR [r12]
+ vmovdqu ymm7, YMMWORD PTR [r12+32]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ ; Final pass only: apply the same 16-bit word shuffle that was applied
+ ; to the inputs on load. Within each dword, ymm0 becomes
+ ; (ymm0.lo, ymm1.lo) and ymm1 becomes (ymm0.hi, ymm1.hi). The shuffle
+ ; is its own inverse, so this undoes the split used during the
+ ; multiplication.
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ ; Mask 170 (0AAh): odd words come from ymm6, even words stay.
+ vpblendw ymm0, ymm0, ymm6, 170
+ ; Mask 85 (55h): even words come from ymm7, odd words stay.
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r12], ymm0
+ vmovdqu YMMWORD PTR [r12+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+64]
+ vmovdqu ymm3, YMMWORD PTR [r9+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+64]
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+64]
+ vmovdqu ymm11, YMMWORD PTR [r14+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+64]
+ vmovdqu ymm7, YMMWORD PTR [r12+96]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r12+64], ymm0
+ vmovdqu YMMWORD PTR [r12+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+128]
+ vmovdqu ymm3, YMMWORD PTR [r9+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+128]
+ vmovdqu ymm11, YMMWORD PTR [r14+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+128]
+ vmovdqu ymm7, YMMWORD PTR [r12+160]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r12+128], ymm0
+ vmovdqu YMMWORD PTR [r12+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+192]
+ vmovdqu ymm3, YMMWORD PTR [r9+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+192]
+ vmovdqu ymm5, YMMWORD PTR [rcx+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+192]
+ vmovdqu ymm11, YMMWORD PTR [r14+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+192]
+ vmovdqu ymm7, YMMWORD PTR [r12+224]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r12+192], ymm0
+ vmovdqu YMMWORD PTR [r12+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+256]
+ vmovdqu ymm3, YMMWORD PTR [r9+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+256]
+ vmovdqu ymm5, YMMWORD PTR [rcx+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+256]
+ vmovdqu ymm11, YMMWORD PTR [r14+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+256]
+ vmovdqu ymm7, YMMWORD PTR [r12+288]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r12+256], ymm0
+ vmovdqu YMMWORD PTR [r12+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+320]
+ vmovdqu ymm3, YMMWORD PTR [r9+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+320]
+ vmovdqu ymm5, YMMWORD PTR [rcx+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+320]
+ vmovdqu ymm11, YMMWORD PTR [r14+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+320]
+ vmovdqu ymm7, YMMWORD PTR [r12+352]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r12+320], ymm0
+ vmovdqu YMMWORD PTR [r12+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+384]
+ vmovdqu ymm3, YMMWORD PTR [r9+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+384]
+ vmovdqu ymm11, YMMWORD PTR [r14+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+384]
+ vmovdqu ymm7, YMMWORD PTR [r12+416]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r12+384], ymm0
+ vmovdqu YMMWORD PTR [r12+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+448]
+ vmovdqu ymm3, YMMWORD PTR [r9+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rcx+448]
+ vmovdqu ymm5, YMMWORD PTR [rcx+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r14+448]
+ vmovdqu ymm11, YMMWORD PTR [r14+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm13
+ vpmullw ymm9, ymm2, ymm13
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r12+448]
+ vmovdqu ymm7, YMMWORD PTR [r12+480]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r12+448], ymm0
+ vmovdqu YMMWORD PTR [r12+480], ymm1
+ ; End of one outer iteration. r9 keeps advancing and is not reset.
+ add r9, 512
+ ; Reset rcx from r13 for the next outer iteration.
+ ; NOTE(review): r13 is set outside this view; presumably it holds the
+ ; base of the operand that rcx walks - confirm.
+ mov rcx, r13
+ ; Move to the next 512-byte output block.
+ add r12, 512
+ sub r10, 1
+ jg L_mlkem_keygen_avx2_acc
+ ; Set up the "To Mont" pass: reload the counter from eax, load the
+ ; 32-byte constants at mlkem_f and mlkem_f_qinv (ymm13 no longer holds
+ ; mlkem_qinv), and restart r12 at rdx.
+ movsxd r10, eax
+ vmovdqu ymm12, YMMWORD PTR mlkem_f
+ vmovdqu ymm13, YMMWORD PTR mlkem_f_qinv
+ ; NOTE(review): redundant - r10 was loaded from eax three instructions
+ ; above and neither register has been written since. Harmless.
+ movsxd r10, eax
+ mov r12, rdx
+L_mlkem_keygen_avx2_to_mont:
+ ; To Mont: multiply each 16-bit coefficient at r12 by mlkem_f (ymm12), using mlkem_f_qinv (ymm13)
+ vmovdqu ymm0, YMMWORD PTR [r12]
+ vmovdqu ymm1, YMMWORD PTR [r12+32]
+ vmovdqu ymm2, YMMWORD PTR [r12+64]
+ vmovdqu ymm3, YMMWORD PTR [r12+96]
+ vpmullw ymm4, ymm0, ymm13
+ vpmulhw ymm5, ymm0, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm0, ymm5, ymm4
+ vpmullw ymm4, ymm1, ymm13
+ vpmulhw ymm5, ymm1, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm1, ymm5, ymm4
+ vpmullw ymm4, ymm2, ymm13
+ vpmulhw ymm5, ymm2, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm2, ymm5, ymm4
+ vpmullw ymm4, ymm3, ymm13
+ vpmulhw ymm5, ymm3, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm3, ymm5, ymm4
+ vmovdqu YMMWORD PTR [r12], ymm0
+ vmovdqu YMMWORD PTR [r12+32], ymm1
+ vmovdqu YMMWORD PTR [r12+64], ymm2
+ vmovdqu YMMWORD PTR [r12+96], ymm3
+ vmovdqu ymm0, YMMWORD PTR [r12+128]
+ vmovdqu ymm1, YMMWORD PTR [r12+160]
+ vmovdqu ymm2, YMMWORD PTR [r12+192]
+ vmovdqu ymm3, YMMWORD PTR [r12+224]
+ vpmullw ymm4, ymm0, ymm13
+ vpmulhw ymm5, ymm0, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm0, ymm5, ymm4
+ vpmullw ymm4, ymm1, ymm13
+ vpmulhw ymm5, ymm1, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm1, ymm5, ymm4
+ vpmullw ymm4, ymm2, ymm13
+ vpmulhw ymm5, ymm2, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm2, ymm5, ymm4
+ vpmullw ymm4, ymm3, ymm13
+ vpmulhw ymm5, ymm3, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm3, ymm5, ymm4
+ vmovdqu YMMWORD PTR [r12+128], ymm0
+ vmovdqu YMMWORD PTR [r12+160], ymm1
+ vmovdqu YMMWORD PTR [r12+192], ymm2
+ vmovdqu YMMWORD PTR [r12+224], ymm3
+ vmovdqu ymm0, YMMWORD PTR [r12+256]
+ vmovdqu ymm1, YMMWORD PTR [r12+288]
+ vmovdqu ymm2, YMMWORD PTR [r12+320]
+ vmovdqu ymm3, YMMWORD PTR [r12+352]
+ vpmullw ymm4, ymm0, ymm13
+ vpmulhw ymm5, ymm0, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm0, ymm5, ymm4
+ vpmullw ymm4, ymm1, ymm13
+ vpmulhw ymm5, ymm1, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm1, ymm5, ymm4
+ vpmullw ymm4, ymm2, ymm13
+ vpmulhw ymm5, ymm2, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm2, ymm5, ymm4
+ vpmullw ymm4, ymm3, ymm13
+ vpmulhw ymm5, ymm3, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm3, ymm5, ymm4
+ vmovdqu YMMWORD PTR [r12+256], ymm0
+ vmovdqu YMMWORD PTR [r12+288], ymm1
+ vmovdqu YMMWORD PTR [r12+320], ymm2
+ vmovdqu YMMWORD PTR [r12+352], ymm3
+ vmovdqu ymm0, YMMWORD PTR [r12+384]
+ vmovdqu ymm1, YMMWORD PTR [r12+416]
+ vmovdqu ymm2, YMMWORD PTR [r12+448]
+ vmovdqu ymm3, YMMWORD PTR [r12+480]
+ vpmullw ymm4, ymm0, ymm13
+ vpmulhw ymm5, ymm0, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm0, ymm5, ymm4
+ vpmullw ymm4, ymm1, ymm13
+ vpmulhw ymm5, ymm1, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm1, ymm5, ymm4
+ vpmullw ymm4, ymm2, ymm13
+ vpmulhw ymm5, ymm2, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm2, ymm5, ymm4
+ vpmullw ymm4, ymm3, ymm13
+ vpmulhw ymm5, ymm3, ymm12
+ vpmulhw ymm4, ymm4, ymm14
+ vpsubw ymm3, ymm5, ymm4
+ vmovdqu YMMWORD PTR [r12+384], ymm0
+ vmovdqu YMMWORD PTR [r12+416], ymm1
+ vmovdqu YMMWORD PTR [r12+448], ymm2
+ vmovdqu YMMWORD PTR [r12+480], ymm3
+ add r12, 512
+ sub r10, 1
+ jg L_mlkem_keygen_avx2_to_mont
+ movsxd r10, eax
+L_mlkem_keygen_avx2_to_mont_ntt_err:
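+ ; ntt: transform the 512-byte polynomial at r8 using the zetas table (r14), add it to the polynomial at rdx, reduce, store at rdx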
+ mov r14, QWORD PTR [ptr_L_mlkem_avx2_zetas]
+ vmovdqu ymm10, YMMWORD PTR [r14]
+ vmovdqu ymm12, YMMWORD PTR [r14+32]
+ vmovdqu ymm0, YMMWORD PTR [r8+128]
+ vmovdqu ymm1, YMMWORD PTR [r8+160]
+ vmovdqu ymm2, YMMWORD PTR [r8+192]
+ vmovdqu ymm3, YMMWORD PTR [r8+224]
+ vmovdqu ymm4, YMMWORD PTR [r8+384]
+ vmovdqu ymm5, YMMWORD PTR [r8+416]
+ vmovdqu ymm6, YMMWORD PTR [r8+448]
+ vmovdqu ymm7, YMMWORD PTR [r8+480]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r8+160], ymm1
+ vmovdqu YMMWORD PTR [r8+192], ymm2
+ vmovdqu YMMWORD PTR [r8+224], ymm3
+ vmovdqu YMMWORD PTR [r8+384], ymm4
+ vmovdqu YMMWORD PTR [r8+416], ymm5
+ vmovdqu YMMWORD PTR [r8+448], ymm6
+ vmovdqu YMMWORD PTR [r8+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [r8]
+ vmovdqu ymm1, YMMWORD PTR [r8+32]
+ vmovdqu ymm2, YMMWORD PTR [r8+64]
+ vmovdqu ymm3, YMMWORD PTR [r8+96]
+ vmovdqu ymm4, YMMWORD PTR [r8+256]
+ vmovdqu ymm5, YMMWORD PTR [r8+288]
+ vmovdqu ymm6, YMMWORD PTR [r8+320]
+ vmovdqu ymm7, YMMWORD PTR [r8+352]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ vmovdqu YMMWORD PTR [r8+256], ymm4
+ vmovdqu YMMWORD PTR [r8+288], ymm5
+ vmovdqu YMMWORD PTR [r8+320], ymm6
+ vmovdqu YMMWORD PTR [r8+352], ymm7
+ vmovdqu ymm4, YMMWORD PTR [r8+128]
+ vmovdqu ymm5, YMMWORD PTR [r8+160]
+ vmovdqu ymm6, YMMWORD PTR [r8+192]
+ vmovdqu ymm7, YMMWORD PTR [r8+224]
+ ; 64: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+64]
+ vmovdqu ymm12, YMMWORD PTR [r14+96]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ ; 32: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+128]
+ vmovdqu ymm12, YMMWORD PTR [r14+160]
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm2, ymm0, ymm8
+ vpsubw ymm3, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ ; 32: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+192]
+ vmovdqu ymm12, YMMWORD PTR [r14+224]
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm4, ymm8
+ vpsubw ymm7, ymm5, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ ; 16: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+256]
+ vmovdqu ymm12, YMMWORD PTR [r14+288]
+ vmovdqu ymm11, YMMWORD PTR [r14+320]
+ vmovdqu ymm13, YMMWORD PTR [r14+352]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 16: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+384]
+ vmovdqu ymm12, YMMWORD PTR [r14+416]
+ vmovdqu ymm11, YMMWORD PTR [r14+448]
+ vmovdqu ymm13, YMMWORD PTR [r14+480]
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 8: 0/3
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [r14+512]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [r14+544]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [r14+576]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [r14+608]
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm0, ymm1, ymm0
+ vpsubw ymm2, ymm3, ymm2
+ vpsubw ymm1, ymm8, ymm0
+ vpsubw ymm3, ymm9, ymm2
+ vpaddw ymm8, ymm8, ymm0
+ vpaddw ymm9, ymm9, ymm2
+ ; 4: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+640]
+ vmovdqu ymm12, YMMWORD PTR [r14+672]
+ vmovdqu ymm11, YMMWORD PTR [r14+704]
+ vmovdqu ymm13, YMMWORD PTR [r14+736]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 8: 0/3
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [r14+768]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [r14+800]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [r14+832]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [r14+864]
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm4, ymm5, ymm4
+ vpsubw ymm6, ymm7, ymm6
+ vpsubw ymm5, ymm8, ymm4
+ vpsubw ymm7, ymm9, ymm6
+ vpaddw ymm8, ymm8, ymm4
+ vpaddw ymm9, ymm9, ymm6
+ ; 4: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+896]
+ vmovdqu ymm12, YMMWORD PTR [r14+928]
+ vmovdqu ymm11, YMMWORD PTR [r14+960]
+ vmovdqu ymm13, YMMWORD PTR [r14+992]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 2: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1024]
+ vmovdqu ymm12, YMMWORD PTR [r14+1056]
+ vmovdqu ymm11, YMMWORD PTR [r14+1088]
+ vmovdqu ymm13, YMMWORD PTR [r14+1120]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 2: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1152]
+ vmovdqu ymm12, YMMWORD PTR [r14+1184]
+ vmovdqu ymm11, YMMWORD PTR [r14+1216]
+ vmovdqu ymm13, YMMWORD PTR [r14+1248]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vmovdqu ymm8, YMMWORD PTR [rdx]
+ vmovdqu ymm9, YMMWORD PTR [rdx+32]
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmulhw ymm8, ymm0, ymm15
+ vpmulhw ymm9, ymm1, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm0, ymm8
+ vpsubw ymm9, ymm1, ymm9
+ vmovdqu YMMWORD PTR [rdx], ymm8
+ vmovdqu YMMWORD PTR [rdx+32], ymm9
+ vmovdqu ymm8, YMMWORD PTR [rdx+64]
+ vmovdqu ymm9, YMMWORD PTR [rdx+96]
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ vpmulhw ymm8, ymm2, ymm15
+ vpmulhw ymm9, ymm3, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vmovdqu YMMWORD PTR [rdx+64], ymm8
+ vmovdqu YMMWORD PTR [rdx+96], ymm9
+ vmovdqu ymm8, YMMWORD PTR [rdx+128]
+ vmovdqu ymm9, YMMWORD PTR [rdx+160]
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ vpmulhw ymm8, ymm4, ymm15
+ vpmulhw ymm9, ymm5, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vmovdqu YMMWORD PTR [rdx+128], ymm8
+ vmovdqu YMMWORD PTR [rdx+160], ymm9
+ vmovdqu ymm8, YMMWORD PTR [rdx+192]
+ vmovdqu ymm9, YMMWORD PTR [rdx+224]
+ vpaddw ymm6, ymm6, ymm8
+ vpaddw ymm7, ymm7, ymm9
+ vpmulhw ymm8, ymm6, ymm15
+ vpmulhw ymm9, ymm7, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vmovdqu YMMWORD PTR [rdx+192], ymm8
+ vmovdqu YMMWORD PTR [rdx+224], ymm9
+ vmovdqu ymm0, YMMWORD PTR [r8+256]
+ vmovdqu ymm1, YMMWORD PTR [r8+288]
+ vmovdqu ymm2, YMMWORD PTR [r8+320]
+ vmovdqu ymm3, YMMWORD PTR [r8+352]
+ vmovdqu ymm4, YMMWORD PTR [r8+384]
+ vmovdqu ymm5, YMMWORD PTR [r8+416]
+ vmovdqu ymm6, YMMWORD PTR [r8+448]
+ vmovdqu ymm7, YMMWORD PTR [r8+480]
+ ; 64: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1280]
+ vmovdqu ymm12, YMMWORD PTR [r14+1312]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ ; 32: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1344]
+ vmovdqu ymm12, YMMWORD PTR [r14+1376]
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm2, ymm0, ymm8
+ vpsubw ymm3, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ ; 32: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1408]
+ vmovdqu ymm12, YMMWORD PTR [r14+1440]
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm4, ymm8
+ vpsubw ymm7, ymm5, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ ; 16: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1472]
+ vmovdqu ymm12, YMMWORD PTR [r14+1504]
+ vmovdqu ymm11, YMMWORD PTR [r14+1536]
+ vmovdqu ymm13, YMMWORD PTR [r14+1568]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 16: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1600]
+ vmovdqu ymm12, YMMWORD PTR [r14+1632]
+ vmovdqu ymm11, YMMWORD PTR [r14+1664]
+ vmovdqu ymm13, YMMWORD PTR [r14+1696]
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 8: 1/3
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [r14+1728]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [r14+1760]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [r14+1792]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [r14+1824]
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm0, ymm1, ymm0
+ vpsubw ymm2, ymm3, ymm2
+ vpsubw ymm1, ymm8, ymm0
+ vpsubw ymm3, ymm9, ymm2
+ vpaddw ymm8, ymm8, ymm0
+ vpaddw ymm9, ymm9, ymm2
+ ; 4: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+1856]
+ vmovdqu ymm12, YMMWORD PTR [r14+1888]
+ vmovdqu ymm11, YMMWORD PTR [r14+1920]
+ vmovdqu ymm13, YMMWORD PTR [r14+1952]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 8: 1/3
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [r14+1984]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [r14+2016]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [r14+2048]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [r14+2080]
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm4, ymm5, ymm4
+ vpsubw ymm6, ymm7, ymm6
+ vpsubw ymm5, ymm8, ymm4
+ vpsubw ymm7, ymm9, ymm6
+ vpaddw ymm8, ymm8, ymm4
+ vpaddw ymm9, ymm9, ymm6
+ ; 4: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+2112]
+ vmovdqu ymm12, YMMWORD PTR [r14+2144]
+ vmovdqu ymm11, YMMWORD PTR [r14+2176]
+ vmovdqu ymm13, YMMWORD PTR [r14+2208]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 2: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+2240]
+ vmovdqu ymm12, YMMWORD PTR [r14+2272]
+ vmovdqu ymm11, YMMWORD PTR [r14+2304]
+ vmovdqu ymm13, YMMWORD PTR [r14+2336]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 2: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r14+2368]
+ vmovdqu ymm12, YMMWORD PTR [r14+2400]
+ vmovdqu ymm11, YMMWORD PTR [r14+2432]
+ vmovdqu ymm13, YMMWORD PTR [r14+2464]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vmovdqu ymm8, YMMWORD PTR [rdx+256]
+ vmovdqu ymm9, YMMWORD PTR [rdx+288]
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmulhw ymm8, ymm0, ymm15
+ vpmulhw ymm9, ymm1, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm0, ymm8
+ vpsubw ymm9, ymm1, ymm9
+ vmovdqu YMMWORD PTR [rdx+256], ymm8
+ vmovdqu YMMWORD PTR [rdx+288], ymm9
+ vmovdqu ymm8, YMMWORD PTR [rdx+320]
+ vmovdqu ymm9, YMMWORD PTR [rdx+352]
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ vpmulhw ymm8, ymm2, ymm15
+ vpmulhw ymm9, ymm3, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vmovdqu YMMWORD PTR [rdx+320], ymm8
+ vmovdqu YMMWORD PTR [rdx+352], ymm9
+ vmovdqu ymm8, YMMWORD PTR [rdx+384]
+ vmovdqu ymm9, YMMWORD PTR [rdx+416]
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ vpmulhw ymm8, ymm4, ymm15
+ vpmulhw ymm9, ymm5, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vmovdqu YMMWORD PTR [rdx+384], ymm8
+ vmovdqu YMMWORD PTR [rdx+416], ymm9
+ vmovdqu ymm8, YMMWORD PTR [rdx+448]
+ vmovdqu ymm9, YMMWORD PTR [rdx+480]
+ vpaddw ymm6, ymm6, ymm8
+ vpaddw ymm7, ymm7, ymm9
+ vpmulhw ymm8, ymm6, ymm15
+ vpmulhw ymm9, ymm7, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vmovdqu YMMWORD PTR [rdx+448], ymm8
+ vmovdqu YMMWORD PTR [rdx+480], ymm9
+ add r8, 512
+ add rdx, 512
+ sub r10, 1
+ jg L_mlkem_keygen_avx2_to_mont_ntt_err
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ pop r14
+ pop r13
+ pop r12
+ ret
+mlkem_keygen_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_encapsulate_avx2 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov rax, QWORD PTR [rsp+96]
+ mov r10, QWORD PTR [rsp+104]
+ mov r11, QWORD PTR [rsp+112]
+ mov r12, QWORD PTR [rsp+120]
+ mov r13, QWORD PTR [rsp+128]
+ sub rsp, 208
+ vmovdqu OWORD PTR [rsp+48], xmm6
+ vmovdqu OWORD PTR [rsp+64], xmm7
+ vmovdqu OWORD PTR [rsp+80], xmm8
+ vmovdqu OWORD PTR [rsp+96], xmm9
+ vmovdqu OWORD PTR [rsp+112], xmm10
+ vmovdqu OWORD PTR [rsp+128], xmm11
+ vmovdqu OWORD PTR [rsp+144], xmm12
+ vmovdqu OWORD PTR [rsp+160], xmm13
+ vmovdqu OWORD PTR [rsp+176], xmm14
+ vmovdqu OWORD PTR [rsp+192], xmm15
+ vmovdqu ymm14, YMMWORD PTR mlkem_q
+ vmovdqu ymm15, YMMWORD PTR mlkem_v
+ mov rsi, rax
+ movsxd r15, r13d
+ mov rdi, rax
+L_mlkem_encapsulate_avx2_trans:
+ ; ntt
+ mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas]
+ vmovdqu ymm10, YMMWORD PTR [rbx]
+ vmovdqu ymm12, YMMWORD PTR [rbx+32]
+ vmovdqu ymm0, YMMWORD PTR [rdi+128]
+ vmovdqu ymm1, YMMWORD PTR [rdi+160]
+ vmovdqu ymm2, YMMWORD PTR [rdi+192]
+ vmovdqu ymm3, YMMWORD PTR [rdi+224]
+ vmovdqu ymm4, YMMWORD PTR [rdi+384]
+ vmovdqu ymm5, YMMWORD PTR [rdi+416]
+ vmovdqu ymm6, YMMWORD PTR [rdi+448]
+ vmovdqu ymm7, YMMWORD PTR [rdi+480]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ vmovdqu YMMWORD PTR [rdi+128], ymm0
+ vmovdqu YMMWORD PTR [rdi+160], ymm1
+ vmovdqu YMMWORD PTR [rdi+192], ymm2
+ vmovdqu YMMWORD PTR [rdi+224], ymm3
+ vmovdqu YMMWORD PTR [rdi+384], ymm4
+ vmovdqu YMMWORD PTR [rdi+416], ymm5
+ vmovdqu YMMWORD PTR [rdi+448], ymm6
+ vmovdqu YMMWORD PTR [rdi+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rdi]
+ vmovdqu ymm1, YMMWORD PTR [rdi+32]
+ vmovdqu ymm2, YMMWORD PTR [rdi+64]
+ vmovdqu ymm3, YMMWORD PTR [rdi+96]
+ vmovdqu ymm4, YMMWORD PTR [rdi+256]
+ vmovdqu ymm5, YMMWORD PTR [rdi+288]
+ vmovdqu ymm6, YMMWORD PTR [rdi+320]
+ vmovdqu ymm7, YMMWORD PTR [rdi+352]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ vmovdqu YMMWORD PTR [rdi+256], ymm4
+ vmovdqu YMMWORD PTR [rdi+288], ymm5
+ vmovdqu YMMWORD PTR [rdi+320], ymm6
+ vmovdqu YMMWORD PTR [rdi+352], ymm7
+ vmovdqu ymm4, YMMWORD PTR [rdi+128]
+ vmovdqu ymm5, YMMWORD PTR [rdi+160]
+ vmovdqu ymm6, YMMWORD PTR [rdi+192]
+ vmovdqu ymm7, YMMWORD PTR [rdi+224]
+ ; 64: 0/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+64]
+ vmovdqu ymm12, YMMWORD PTR [rbx+96]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ ; 32: 0/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+128]
+ vmovdqu ymm12, YMMWORD PTR [rbx+160]
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm2, ymm0, ymm8
+ vpsubw ymm3, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ ; 32: 0/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+192]
+ vmovdqu ymm12, YMMWORD PTR [rbx+224]
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm4, ymm8
+ vpsubw ymm7, ymm5, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ ; 16: 0/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+256]
+ vmovdqu ymm12, YMMWORD PTR [rbx+288]
+ vmovdqu ymm11, YMMWORD PTR [rbx+320]
+ vmovdqu ymm13, YMMWORD PTR [rbx+352]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 16: 0/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+384]
+ vmovdqu ymm12, YMMWORD PTR [rbx+416]
+ vmovdqu ymm11, YMMWORD PTR [rbx+448]
+ vmovdqu ymm13, YMMWORD PTR [rbx+480]
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 8: 0/3
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+512]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+544]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+576]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+608]
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm0, ymm1, ymm0
+ vpsubw ymm2, ymm3, ymm2
+ vpsubw ymm1, ymm8, ymm0
+ vpsubw ymm3, ymm9, ymm2
+ vpaddw ymm8, ymm8, ymm0
+ vpaddw ymm9, ymm9, ymm2
+ ; 4: 0/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+640]
+ vmovdqu ymm12, YMMWORD PTR [rbx+672]
+ vmovdqu ymm11, YMMWORD PTR [rbx+704]
+ vmovdqu ymm13, YMMWORD PTR [rbx+736]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 8: 0/3
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+768]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+800]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+832]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+864]
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm4, ymm5, ymm4
+ vpsubw ymm6, ymm7, ymm6
+ vpsubw ymm5, ymm8, ymm4
+ vpsubw ymm7, ymm9, ymm6
+ vpaddw ymm8, ymm8, ymm4
+ vpaddw ymm9, ymm9, ymm6
+ ; 4: 0/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+896]
+ vmovdqu ymm12, YMMWORD PTR [rbx+928]
+ vmovdqu ymm11, YMMWORD PTR [rbx+960]
+ vmovdqu ymm13, YMMWORD PTR [rbx+992]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 2: 0/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+1024]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1056]
+ vmovdqu ymm11, YMMWORD PTR [rbx+1088]
+ vmovdqu ymm13, YMMWORD PTR [rbx+1120]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 2: 0/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+1152]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1184]
+ vmovdqu ymm11, YMMWORD PTR [rbx+1216]
+ vmovdqu ymm13, YMMWORD PTR [rbx+1248]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vmovdqu YMMWORD PTR [rdi], ymm0
+ vmovdqu YMMWORD PTR [rdi+32], ymm1
+ vmovdqu YMMWORD PTR [rdi+64], ymm2
+ vmovdqu YMMWORD PTR [rdi+96], ymm3
+ vmovdqu YMMWORD PTR [rdi+128], ymm4
+ vmovdqu YMMWORD PTR [rdi+160], ymm5
+ vmovdqu YMMWORD PTR [rdi+192], ymm6
+ vmovdqu YMMWORD PTR [rdi+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rdi+256]
+ vmovdqu ymm1, YMMWORD PTR [rdi+288]
+ vmovdqu ymm2, YMMWORD PTR [rdi+320]
+ vmovdqu ymm3, YMMWORD PTR [rdi+352]
+ vmovdqu ymm4, YMMWORD PTR [rdi+384]
+ vmovdqu ymm5, YMMWORD PTR [rdi+416]
+ vmovdqu ymm6, YMMWORD PTR [rdi+448]
+ vmovdqu ymm7, YMMWORD PTR [rdi+480]
+ ; 64: 1/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+1280]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1312]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ ; 32: 1/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+1344]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1376]
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm2, ymm0, ymm8
+ vpsubw ymm3, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ ; 32: 1/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+1408]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1440]
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm4, ymm8
+ vpsubw ymm7, ymm5, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ ; 16: 1/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+1472]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1504]
+ vmovdqu ymm11, YMMWORD PTR [rbx+1536]
+ vmovdqu ymm13, YMMWORD PTR [rbx+1568]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 16: 1/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+1600]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1632]
+ vmovdqu ymm11, YMMWORD PTR [rbx+1664]
+ vmovdqu ymm13, YMMWORD PTR [rbx+1696]
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 8: 1/3
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+1728]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+1760]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+1792]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+1824]
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm0, ymm1, ymm0
+ vpsubw ymm2, ymm3, ymm2
+ vpsubw ymm1, ymm8, ymm0
+ vpsubw ymm3, ymm9, ymm2
+ vpaddw ymm8, ymm8, ymm0
+ vpaddw ymm9, ymm9, ymm2
+ ; 4: 1/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+1856]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1888]
+ vmovdqu ymm11, YMMWORD PTR [rbx+1920]
+ vmovdqu ymm13, YMMWORD PTR [rbx+1952]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 8: 1/3
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+1984]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+2016]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+2048]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+2080]
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm4, ymm5, ymm4
+ vpsubw ymm6, ymm7, ymm6
+ vpsubw ymm5, ymm8, ymm4
+ vpsubw ymm7, ymm9, ymm6
+ vpaddw ymm8, ymm8, ymm4
+ vpaddw ymm9, ymm9, ymm6
+ ; 4: 1/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+2112]
+ vmovdqu ymm12, YMMWORD PTR [rbx+2144]
+ vmovdqu ymm11, YMMWORD PTR [rbx+2176]
+ vmovdqu ymm13, YMMWORD PTR [rbx+2208]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 2: 1/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+2240]
+ vmovdqu ymm12, YMMWORD PTR [rbx+2272]
+ vmovdqu ymm11, YMMWORD PTR [rbx+2304]
+ vmovdqu ymm13, YMMWORD PTR [rbx+2336]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 2: 1/3
+ vmovdqu ymm10, YMMWORD PTR [rbx+2368]
+ vmovdqu ymm12, YMMWORD PTR [rbx+2400]
+ vmovdqu ymm11, YMMWORD PTR [rbx+2432]
+ vmovdqu ymm13, YMMWORD PTR [rbx+2464]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vmovdqu YMMWORD PTR [rdi+256], ymm0
+ vmovdqu YMMWORD PTR [rdi+288], ymm1
+ vmovdqu YMMWORD PTR [rdi+320], ymm2
+ vmovdqu YMMWORD PTR [rdi+352], ymm3
+ vmovdqu YMMWORD PTR [rdi+384], ymm4
+ vmovdqu YMMWORD PTR [rdi+416], ymm5
+ vmovdqu YMMWORD PTR [rdi+448], ymm6
+ vmovdqu YMMWORD PTR [rdi+480], ymm7
+ add rdi, 512
+ sub r15, 1
+ jg L_mlkem_encapsulate_avx2_trans
+ movsxd r14, r13d
+L_mlkem_encapsulate_avx2_calc:
+ vmovdqu ymm12, YMMWORD PTR mlkem_qinv
+ ; Pointwise acc mont: r15 = sign-extended r13d, the counter consumed by the "sub r15" / branch pairs below
+ movsxd r15, r13d
+ ; Base mul mont: first 512-byte block of [r9] x [rax] in eight 64-byte steps; results are stored to [rdx] without accumulating
+ mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [r9]
+ vmovdqu ymm3, YMMWORD PTR [r9+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax]
+ vmovdqu ymm5, YMMWORD PTR [rax+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx]
+ vmovdqu ymm11, YMMWORD PTR [rbx+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +0): r = hi - mulhi(lo * ymm12, ymm14), ymm12 = mlkem_qinv, ymm14 set before this chunk (presumably q - confirm); ymm0 is then multiplied by ymm10/ymm11 from [rbx], reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1, the second 32-byte result
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+64]
+ vmovdqu ymm3, YMMWORD PTR [r9+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+64]
+ vmovdqu ymm5, YMMWORD PTR [rax+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+64]
+ vmovdqu ymm11, YMMWORD PTR [rbx+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +64): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+64], ymm0
+ vmovdqu YMMWORD PTR [rdx+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+128]
+ vmovdqu ymm3, YMMWORD PTR [r9+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+128]
+ vmovdqu ymm5, YMMWORD PTR [rax+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+128]
+ vmovdqu ymm11, YMMWORD PTR [rbx+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +128): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+128], ymm0
+ vmovdqu YMMWORD PTR [rdx+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+192]
+ vmovdqu ymm3, YMMWORD PTR [r9+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+192]
+ vmovdqu ymm5, YMMWORD PTR [rax+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+192]
+ vmovdqu ymm11, YMMWORD PTR [rbx+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +192): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+192], ymm0
+ vmovdqu YMMWORD PTR [rdx+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+256]
+ vmovdqu ymm3, YMMWORD PTR [r9+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+256]
+ vmovdqu ymm5, YMMWORD PTR [rax+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+256]
+ vmovdqu ymm11, YMMWORD PTR [rbx+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +256): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+256], ymm0
+ vmovdqu YMMWORD PTR [rdx+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+320]
+ vmovdqu ymm3, YMMWORD PTR [r9+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+320]
+ vmovdqu ymm5, YMMWORD PTR [rax+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+320]
+ vmovdqu ymm11, YMMWORD PTR [rbx+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +320): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+320], ymm0
+ vmovdqu YMMWORD PTR [rdx+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+384]
+ vmovdqu ymm3, YMMWORD PTR [r9+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+384]
+ vmovdqu ymm5, YMMWORD PTR [rax+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+384]
+ vmovdqu ymm11, YMMWORD PTR [rbx+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +384): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+384], ymm0
+ vmovdqu YMMWORD PTR [rdx+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+448]
+ vmovdqu ymm3, YMMWORD PTR [r9+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+448]
+ vmovdqu ymm5, YMMWORD PTR [rax+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+448]
+ vmovdqu ymm11, YMMWORD PTR [rbx+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +448): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+448], ymm0
+ vmovdqu YMMWORD PTR [rdx+480], ymm1
+ add r9, 512
+ add rax, 512
+ sub r15, 2
+ jz L_pointwise_acc_mont_end_encap_bp
+L_pointwise_acc_mont_start_encap_bp:
+ ; Base mul mont add: loop body - multiplies 512 bytes at [r9] by 512 bytes at [rax] in eight 64-byte steps and adds each result into [rdx]; repeats while r15 is still > 0 after the decrement at the bottom
+ mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [r9]
+ vmovdqu ymm3, YMMWORD PTR [r9+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax]
+ vmovdqu ymm5, YMMWORD PTR [rax+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx]
+ vmovdqu ymm11, YMMWORD PTR [rbx+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +0): r = hi - mulhi(lo * ymm12, ymm14), ymm12 holds mlkem_qinv (loaded before this loop), ymm14 is set before this chunk (presumably q - confirm); ymm0 is then multiplied by ymm10/ymm11 from [rbx], reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1, then both results are added to the values already at [rdx]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx]
+ vmovdqu ymm7, YMMWORD PTR [rdx+32]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+64]
+ vmovdqu ymm3, YMMWORD PTR [r9+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+64]
+ vmovdqu ymm5, YMMWORD PTR [rax+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+64]
+ vmovdqu ymm11, YMMWORD PTR [rbx+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +64): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1, then both results are added into [rdx+64..]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+64]
+ vmovdqu ymm7, YMMWORD PTR [rdx+96]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+64], ymm0
+ vmovdqu YMMWORD PTR [rdx+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+128]
+ vmovdqu ymm3, YMMWORD PTR [r9+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+128]
+ vmovdqu ymm5, YMMWORD PTR [rax+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+128]
+ vmovdqu ymm11, YMMWORD PTR [rbx+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +128): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1, then both results are added into [rdx+128..]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+128]
+ vmovdqu ymm7, YMMWORD PTR [rdx+160]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+128], ymm0
+ vmovdqu YMMWORD PTR [rdx+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+192]
+ vmovdqu ymm3, YMMWORD PTR [r9+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+192]
+ vmovdqu ymm5, YMMWORD PTR [rax+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+192]
+ vmovdqu ymm11, YMMWORD PTR [rbx+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +192): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1, then both results are added into [rdx+192..]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+192]
+ vmovdqu ymm7, YMMWORD PTR [rdx+224]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+192], ymm0
+ vmovdqu YMMWORD PTR [rdx+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+256]
+ vmovdqu ymm3, YMMWORD PTR [r9+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+256]
+ vmovdqu ymm5, YMMWORD PTR [rax+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+256]
+ vmovdqu ymm11, YMMWORD PTR [rbx+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +256): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1, then both results are added into [rdx+256..]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+256]
+ vmovdqu ymm7, YMMWORD PTR [rdx+288]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+256], ymm0
+ vmovdqu YMMWORD PTR [rdx+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+320]
+ vmovdqu ymm3, YMMWORD PTR [r9+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+320]
+ vmovdqu ymm5, YMMWORD PTR [rax+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+320]
+ vmovdqu ymm11, YMMWORD PTR [rbx+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +320): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1, then both results are added into [rdx+320..]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+320]
+ vmovdqu ymm7, YMMWORD PTR [rdx+352]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+320], ymm0
+ vmovdqu YMMWORD PTR [rdx+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+384]
+ vmovdqu ymm3, YMMWORD PTR [r9+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+384]
+ vmovdqu ymm5, YMMWORD PTR [rax+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+384]
+ vmovdqu ymm11, YMMWORD PTR [rbx+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +384): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1, then both results are added into [rdx+384..]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+384]
+ vmovdqu ymm7, YMMWORD PTR [rdx+416]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+384], ymm0
+ vmovdqu YMMWORD PTR [rdx+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+448]
+ vmovdqu ymm3, YMMWORD PTR [r9+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+448]
+ vmovdqu ymm5, YMMWORD PTR [rax+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+448]
+ vmovdqu ymm11, YMMWORD PTR [rbx+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce (step at +448): same reduction as the step at +0; ymm0 is then multiplied by ymm10/ymm11, reduced again and added to ymm1
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce the cross products (ymm2*ymm5 and ymm3*ymm4); their sum becomes ymm1, then both results are added into [rdx+448..]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+448]
+ vmovdqu ymm7, YMMWORD PTR [rdx+480]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+448], ymm0
+ vmovdqu YMMWORD PTR [rdx+480], ymm1
+ add r9, 512
+ add rax, 512
+ sub r15, 1
+ jg L_pointwise_acc_mont_start_encap_bp
+L_pointwise_acc_mont_end_encap_bp:
+ ; Base mul mont add
+ mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [r9]
+ vmovdqu ymm3, YMMWORD PTR [r9+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax]
+ vmovdqu ymm5, YMMWORD PTR [rax+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx]
+ vmovdqu ymm11, YMMWORD PTR [rbx+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx]
+ vmovdqu ymm7, YMMWORD PTR [rdx+32]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+64]
+ vmovdqu ymm3, YMMWORD PTR [r9+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+64]
+ vmovdqu ymm5, YMMWORD PTR [rax+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+64]
+ vmovdqu ymm11, YMMWORD PTR [rbx+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+64]
+ vmovdqu ymm7, YMMWORD PTR [rdx+96]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+64], ymm0
+ vmovdqu YMMWORD PTR [rdx+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+128]
+ vmovdqu ymm3, YMMWORD PTR [r9+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+128]
+ vmovdqu ymm5, YMMWORD PTR [rax+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+128]
+ vmovdqu ymm11, YMMWORD PTR [rbx+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+128]
+ vmovdqu ymm7, YMMWORD PTR [rdx+160]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+128], ymm0
+ vmovdqu YMMWORD PTR [rdx+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+192]
+ vmovdqu ymm3, YMMWORD PTR [r9+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+192]
+ vmovdqu ymm5, YMMWORD PTR [rax+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+192]
+ vmovdqu ymm11, YMMWORD PTR [rbx+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+192]
+ vmovdqu ymm7, YMMWORD PTR [rdx+224]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+192], ymm0
+ vmovdqu YMMWORD PTR [rdx+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+256]
+ vmovdqu ymm3, YMMWORD PTR [r9+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+256]
+ vmovdqu ymm5, YMMWORD PTR [rax+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+256]
+ vmovdqu ymm11, YMMWORD PTR [rbx+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+256]
+ vmovdqu ymm7, YMMWORD PTR [rdx+288]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+256], ymm0
+ vmovdqu YMMWORD PTR [rdx+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+320]
+ vmovdqu ymm3, YMMWORD PTR [r9+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+320]
+ vmovdqu ymm5, YMMWORD PTR [rax+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+320]
+ vmovdqu ymm11, YMMWORD PTR [rbx+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+320]
+ vmovdqu ymm7, YMMWORD PTR [rdx+352]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+320], ymm0
+ vmovdqu YMMWORD PTR [rdx+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+384]
+ vmovdqu ymm3, YMMWORD PTR [r9+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+384]
+ vmovdqu ymm5, YMMWORD PTR [rax+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+384]
+ vmovdqu ymm11, YMMWORD PTR [rbx+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+384]
+ vmovdqu ymm7, YMMWORD PTR [rdx+416]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+384], ymm0
+ vmovdqu YMMWORD PTR [rdx+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [r9+448]
+ vmovdqu ymm3, YMMWORD PTR [r9+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+448]
+ vmovdqu ymm5, YMMWORD PTR [rax+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+448]
+ vmovdqu ymm11, YMMWORD PTR [rbx+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+448]
+ vmovdqu ymm7, YMMWORD PTR [rdx+480]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+448], ymm0
+ vmovdqu YMMWORD PTR [rdx+480], ymm1
+ add r9, 512
+ mov rax, rsi
+ ; invntt
+ mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_inv]
+ vmovdqu ymm0, YMMWORD PTR [rdx]
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vmovdqu ymm2, YMMWORD PTR [rdx+64]
+ vmovdqu ymm3, YMMWORD PTR [rdx+96]
+ vmovdqu ymm4, YMMWORD PTR [rdx+128]
+ vmovdqu ymm5, YMMWORD PTR [rdx+160]
+ vmovdqu ymm6, YMMWORD PTR [rdx+192]
+ vmovdqu ymm7, YMMWORD PTR [rdx+224]
+ ; 2: 1/2
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx]
+ vperm2i128 ymm9, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+32]
+ vpsllq ymm0, ymm9, 32
+ vpsrlq ymm1, ymm8, 32
+ vpblendd ymm0, ymm8, ymm0, 170
+ vpblendd ymm1, ymm9, ymm1, 85
+ vperm2i128 ymm8, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+64]
+ vperm2i128 ymm9, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+96]
+ vpsllq ymm2, ymm9, 32
+ vpsrlq ymm3, ymm8, 32
+ vpblendd ymm2, ymm8, ymm2, 170
+ vpblendd ymm3, ymm9, ymm3, 85
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 4: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+128]
+ vmovdqu ymm12, YMMWORD PTR [rbx+160]
+ vmovdqu ymm11, YMMWORD PTR [rbx+192]
+ vmovdqu ymm13, YMMWORD PTR [rbx+224]
+ vpunpckldq ymm0, ymm8, ymm1
+ vpunpckhdq ymm1, ymm8, ymm1
+ vpunpckldq ymm2, ymm9, ymm3
+ vpunpckhdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 8: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+256]
+ vmovdqu ymm12, YMMWORD PTR [rbx+288]
+ vmovdqu ymm11, YMMWORD PTR [rbx+320]
+ vmovdqu ymm13, YMMWORD PTR [rbx+352]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 16: 1/2
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+384]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+416]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+448]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+480]
+ vpsubw ymm8, ymm0, ymm1
+ vpsubw ymm9, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm1
+ vpaddw ymm2, ymm2, ymm3
+ vpmullw ymm1, ymm8, ymm12
+ vpmullw ymm3, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm1, ymm1, ymm14
+ vpmulhw ymm3, ymm3, ymm14
+ vpsubw ymm1, ymm8, ymm1
+ vpsubw ymm3, ymm9, ymm3
+ ; 32: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+512]
+ vmovdqu ymm12, YMMWORD PTR [rbx+544]
+ vpaddw ymm8, ymm0, ymm2
+ vpaddw ymm9, ymm1, ymm3
+ vpsubw ymm2, ymm0, ymm2
+ vpsubw ymm3, ymm1, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm1, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ ; 2: 1/2
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+576]
+ vperm2i128 ymm9, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+608]
+ vpsllq ymm4, ymm9, 32
+ vpsrlq ymm5, ymm8, 32
+ vpblendd ymm4, ymm8, ymm4, 170
+ vpblendd ymm5, ymm9, ymm5, 85
+ vperm2i128 ymm8, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+640]
+ vperm2i128 ymm9, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+672]
+ vpsllq ymm6, ymm9, 32
+ vpsrlq ymm7, ymm8, 32
+ vpblendd ymm6, ymm8, ymm6, 170
+ vpblendd ymm7, ymm9, ymm7, 85
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 4: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+704]
+ vmovdqu ymm12, YMMWORD PTR [rbx+736]
+ vmovdqu ymm11, YMMWORD PTR [rbx+768]
+ vmovdqu ymm13, YMMWORD PTR [rbx+800]
+ vpunpckldq ymm4, ymm8, ymm5
+ vpunpckhdq ymm5, ymm8, ymm5
+ vpunpckldq ymm6, ymm9, ymm7
+ vpunpckhdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 8: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+832]
+ vmovdqu ymm12, YMMWORD PTR [rbx+864]
+ vmovdqu ymm11, YMMWORD PTR [rbx+896]
+ vmovdqu ymm13, YMMWORD PTR [rbx+928]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 16: 1/2
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+960]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+992]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+1024]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+1056]
+ vpsubw ymm8, ymm4, ymm5
+ vpsubw ymm9, ymm6, ymm7
+ vpaddw ymm4, ymm4, ymm5
+ vpaddw ymm6, ymm6, ymm7
+ vpmullw ymm5, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm5, ymm5, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm5, ymm8, ymm5
+ vpsubw ymm7, ymm9, ymm7
+ ; 32: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1088]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1120]
+ vpaddw ymm8, ymm4, ymm6
+ vpaddw ymm9, ymm5, ymm7
+ vpsubw ymm6, ymm4, ymm6
+ vpsubw ymm7, ymm5, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm5, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm5, ymm5, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ ; 64: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1152]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1184]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpsubw ymm8, ymm2, ymm6
+ vpsubw ymm9, ymm3, ymm7
+ vpaddw ymm2, ymm2, ymm6
+ vpaddw ymm3, ymm3, ymm7
+ vpmullw ymm6, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm6, ymm8, ymm6
+ vpsubw ymm7, ymm9, ymm7
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ vmovdqu YMMWORD PTR [rdx+128], ymm4
+ vmovdqu YMMWORD PTR [rdx+160], ymm5
+ vmovdqu YMMWORD PTR [rdx+192], ymm6
+ vmovdqu YMMWORD PTR [rdx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rdx+256]
+ vmovdqu ymm1, YMMWORD PTR [rdx+288]
+ vmovdqu ymm2, YMMWORD PTR [rdx+320]
+ vmovdqu ymm3, YMMWORD PTR [rdx+352]
+ vmovdqu ymm4, YMMWORD PTR [rdx+384]
+ vmovdqu ymm5, YMMWORD PTR [rdx+416]
+ vmovdqu ymm6, YMMWORD PTR [rdx+448]
+ vmovdqu ymm7, YMMWORD PTR [rdx+480]
+ ; 2: 2/2
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+1216]
+ vperm2i128 ymm9, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+1248]
+ vpsllq ymm0, ymm9, 32
+ vpsrlq ymm1, ymm8, 32
+ vpblendd ymm0, ymm8, ymm0, 170
+ vpblendd ymm1, ymm9, ymm1, 85
+ vperm2i128 ymm8, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+1280]
+ vperm2i128 ymm9, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+1312]
+ vpsllq ymm2, ymm9, 32
+ vpsrlq ymm3, ymm8, 32
+ vpblendd ymm2, ymm8, ymm2, 170
+ vpblendd ymm3, ymm9, ymm3, 85
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 4: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1344]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1376]
+ vmovdqu ymm11, YMMWORD PTR [rbx+1408]
+ vmovdqu ymm13, YMMWORD PTR [rbx+1440]
+ vpunpckldq ymm0, ymm8, ymm1
+ vpunpckhdq ymm1, ymm8, ymm1
+ vpunpckldq ymm2, ymm9, ymm3
+ vpunpckhdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 8: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1472]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1504]
+ vmovdqu ymm11, YMMWORD PTR [rbx+1536]
+ vmovdqu ymm13, YMMWORD PTR [rbx+1568]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 16: 2/2
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+1600]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+1632]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+1664]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+1696]
+ vpsubw ymm8, ymm0, ymm1
+ vpsubw ymm9, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm1
+ vpaddw ymm2, ymm2, ymm3
+ vpmullw ymm1, ymm8, ymm12
+ vpmullw ymm3, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm1, ymm1, ymm14
+ vpmulhw ymm3, ymm3, ymm14
+ vpsubw ymm1, ymm8, ymm1
+ vpsubw ymm3, ymm9, ymm3
+ ; 32: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1728]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1760]
+ vpaddw ymm8, ymm0, ymm2
+ vpaddw ymm9, ymm1, ymm3
+ vpsubw ymm2, ymm0, ymm2
+ vpsubw ymm3, ymm1, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm1, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ ; 2: 2/2
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+1792]
+ vperm2i128 ymm9, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+1824]
+ vpsllq ymm4, ymm9, 32
+ vpsrlq ymm5, ymm8, 32
+ vpblendd ymm4, ymm8, ymm4, 170
+ vpblendd ymm5, ymm9, ymm5, 85
+ vperm2i128 ymm8, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+1856]
+ vperm2i128 ymm9, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+1888]
+ vpsllq ymm6, ymm9, 32
+ vpsrlq ymm7, ymm8, 32
+ vpblendd ymm6, ymm8, ymm6, 170
+ vpblendd ymm7, ymm9, ymm7, 85
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 4: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1920]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1952]
+ vmovdqu ymm11, YMMWORD PTR [rbx+1984]
+ vmovdqu ymm13, YMMWORD PTR [rbx+2016]
+ vpunpckldq ymm4, ymm8, ymm5
+ vpunpckhdq ymm5, ymm8, ymm5
+ vpunpckldq ymm6, ymm9, ymm7
+ vpunpckhdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 8: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+2048]
+ vmovdqu ymm12, YMMWORD PTR [rbx+2080]
+ vmovdqu ymm11, YMMWORD PTR [rbx+2112]
+ vmovdqu ymm13, YMMWORD PTR [rbx+2144]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 16: 2/2
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+2176]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+2208]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+2240]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+2272]
+ vpsubw ymm8, ymm4, ymm5
+ vpsubw ymm9, ymm6, ymm7
+ vpaddw ymm4, ymm4, ymm5
+ vpaddw ymm6, ymm6, ymm7
+ vpmullw ymm5, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm5, ymm5, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm5, ymm8, ymm5
+ vpsubw ymm7, ymm9, ymm7
+ ; 32: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+2304]
+ vmovdqu ymm12, YMMWORD PTR [rbx+2336]
+ vpaddw ymm8, ymm4, ymm6
+ vpaddw ymm9, ymm5, ymm7
+ vpsubw ymm6, ymm4, ymm6
+ vpsubw ymm7, ymm5, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm5, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm5, ymm5, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ ; 64: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+2368]
+ vmovdqu ymm12, YMMWORD PTR [rbx+2400]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpsubw ymm8, ymm2, ymm6
+ vpsubw ymm9, ymm3, ymm7
+ vpaddw ymm2, ymm2, ymm6
+ vpaddw ymm3, ymm3, ymm7
+ vpmullw ymm6, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm6, ymm8, ymm6
+ vpsubw ymm7, ymm9, ymm7
+ vmovdqu YMMWORD PTR [rdx+256], ymm0
+ vmovdqu YMMWORD PTR [rdx+288], ymm1
+ vmovdqu YMMWORD PTR [rdx+320], ymm2
+ vmovdqu YMMWORD PTR [rdx+352], ymm3
+ ; 128
+ vmovdqu ymm10, YMMWORD PTR [rbx+2432]
+ vmovdqu ymm12, YMMWORD PTR [rbx+2464]
+ vmovdqu ymm11, YMMWORD PTR [rbx+2496]
+ vmovdqu ymm13, YMMWORD PTR [rbx+2528]
+ vmovdqu ymm0, YMMWORD PTR [rdx+128]
+ vmovdqu ymm1, YMMWORD PTR [rdx+160]
+ vmovdqu ymm2, YMMWORD PTR [rdx+192]
+ vmovdqu ymm3, YMMWORD PTR [rdx+224]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpaddw ymm8, ymm2, ymm6
+ vpaddw ymm9, ymm3, ymm7
+ vpsubw ymm6, ymm2, ymm6
+ vpsubw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm8, ymm15
+ vpmulhw ymm3, ymm9, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm8, ymm2
+ vpsubw ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm0, ymm0, ymm11
+ vpmulhw ymm1, ymm1, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm0, ymm8
+ vpsubw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm2, ymm13
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm2, ymm2, ymm11
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ vpmullw ymm8, ymm4, ymm13
+ vpmullw ymm9, ymm5, ymm13
+ vpmulhw ymm4, ymm4, ymm11
+ vpmulhw ymm5, ymm5, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm4, ymm4, ymm8
+ vpsubw ymm5, ymm5, ymm9
+ vpmullw ymm8, ymm6, ymm13
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm6, ymm6, ymm11
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vmovdqu YMMWORD PTR [rdx+128], ymm0
+ vmovdqu YMMWORD PTR [rdx+160], ymm1
+ vmovdqu YMMWORD PTR [rdx+192], ymm2
+ vmovdqu YMMWORD PTR [rdx+224], ymm3
+ vmovdqu YMMWORD PTR [rdx+384], ymm4
+ vmovdqu YMMWORD PTR [rdx+416], ymm5
+ vmovdqu YMMWORD PTR [rdx+448], ymm6
+ vmovdqu YMMWORD PTR [rdx+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rdx]
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vmovdqu ymm2, YMMWORD PTR [rdx+64]
+ vmovdqu ymm3, YMMWORD PTR [rdx+96]
+ vmovdqu ymm4, YMMWORD PTR [rdx+256]
+ vmovdqu ymm5, YMMWORD PTR [rdx+288]
+ vmovdqu ymm6, YMMWORD PTR [rdx+320]
+ vmovdqu ymm7, YMMWORD PTR [rdx+352]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpaddw ymm8, ymm2, ymm6
+ vpaddw ymm9, ymm3, ymm7
+ vpsubw ymm6, ymm2, ymm6
+ vpsubw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm8, ymm15
+ vpmulhw ymm3, ymm9, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm8, ymm2
+ vpsubw ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm0, ymm0, ymm11
+ vpmulhw ymm1, ymm1, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm0, ymm8
+ vpsubw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm2, ymm13
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm2, ymm2, ymm11
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ vpmullw ymm8, ymm4, ymm13
+ vpmullw ymm9, ymm5, ymm13
+ vpmulhw ymm4, ymm4, ymm11
+ vpmulhw ymm5, ymm5, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm4, ymm4, ymm8
+ vpsubw ymm5, ymm5, ymm9
+ vpmullw ymm8, ymm6, ymm13
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm6, ymm6, ymm11
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ vmovdqu YMMWORD PTR [rdx+256], ymm4
+ vmovdqu YMMWORD PTR [rdx+288], ymm5
+ vmovdqu YMMWORD PTR [rdx+320], ymm6
+ vmovdqu YMMWORD PTR [rdx+352], ymm7
+ ; Add Errors
+ vmovdqu ymm0, YMMWORD PTR [rdx]
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vmovdqu ymm2, YMMWORD PTR [rdx+64]
+ vmovdqu ymm3, YMMWORD PTR [rdx+96]
+ vmovdqu ymm4, YMMWORD PTR [r10]
+ vmovdqu ymm5, YMMWORD PTR [r10+32]
+ vmovdqu ymm6, YMMWORD PTR [r10+64]
+ vmovdqu ymm7, YMMWORD PTR [r10+96]
+ vpaddw ymm4, ymm0, ymm4
+ vpaddw ymm5, ymm1, ymm5
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpaddw ymm6, ymm2, ymm6
+ vpaddw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rdx+128]
+ vmovdqu ymm1, YMMWORD PTR [rdx+160]
+ vmovdqu ymm2, YMMWORD PTR [rdx+192]
+ vmovdqu ymm3, YMMWORD PTR [rdx+224]
+ vmovdqu ymm4, YMMWORD PTR [r10+128]
+ vmovdqu ymm5, YMMWORD PTR [r10+160]
+ vmovdqu ymm6, YMMWORD PTR [r10+192]
+ vmovdqu ymm7, YMMWORD PTR [r10+224]
+ vpaddw ymm4, ymm0, ymm4
+ vpaddw ymm5, ymm1, ymm5
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpaddw ymm6, ymm2, ymm6
+ vpaddw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [rdx+128], ymm0
+ vmovdqu YMMWORD PTR [rdx+160], ymm1
+ vmovdqu YMMWORD PTR [rdx+192], ymm2
+ vmovdqu YMMWORD PTR [rdx+224], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rdx+256]
+ vmovdqu ymm1, YMMWORD PTR [rdx+288]
+ vmovdqu ymm2, YMMWORD PTR [rdx+320]
+ vmovdqu ymm3, YMMWORD PTR [rdx+352]
+ vmovdqu ymm4, YMMWORD PTR [r10+256]
+ vmovdqu ymm5, YMMWORD PTR [r10+288]
+ vmovdqu ymm6, YMMWORD PTR [r10+320]
+ vmovdqu ymm7, YMMWORD PTR [r10+352]
+ vpaddw ymm4, ymm0, ymm4
+ vpaddw ymm5, ymm1, ymm5
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpaddw ymm6, ymm2, ymm6
+ vpaddw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [rdx+256], ymm0
+ vmovdqu YMMWORD PTR [rdx+288], ymm1
+ vmovdqu YMMWORD PTR [rdx+320], ymm2
+ vmovdqu YMMWORD PTR [rdx+352], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rdx+384]
+ vmovdqu ymm1, YMMWORD PTR [rdx+416]
+ vmovdqu ymm2, YMMWORD PTR [rdx+448]
+ vmovdqu ymm3, YMMWORD PTR [rdx+480]
+ vmovdqu ymm4, YMMWORD PTR [r10+384]
+ vmovdqu ymm5, YMMWORD PTR [r10+416]
+ vmovdqu ymm6, YMMWORD PTR [r10+448]
+ vmovdqu ymm7, YMMWORD PTR [r10+480]
+ vpaddw ymm4, ymm0, ymm4
+ vpaddw ymm5, ymm1, ymm5
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpaddw ymm6, ymm2, ymm6
+ vpaddw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [rdx+384], ymm0
+ vmovdqu YMMWORD PTR [rdx+416], ymm1
+ vmovdqu YMMWORD PTR [rdx+448], ymm2
+ vmovdqu YMMWORD PTR [rdx+480], ymm3
+ add r10, 512
+ add rdx, 512
+ sub r14, 1
+ jg L_mlkem_encapsulate_avx2_calc
+ vmovdqu ymm12, YMMWORD PTR mlkem_qinv
+ ; Pointwise acc mont
+ movsxd r15, r13d
+ ; Base mul mont
+ mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [rcx]
+ vmovdqu ymm3, YMMWORD PTR [rcx+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax]
+ vmovdqu ymm5, YMMWORD PTR [rax+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx]
+ vmovdqu ymm11, YMMWORD PTR [rbx+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+64]
+ vmovdqu ymm5, YMMWORD PTR [rax+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+64]
+ vmovdqu ymm11, YMMWORD PTR [rbx+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+128]
+ vmovdqu ymm3, YMMWORD PTR [rcx+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+128]
+ vmovdqu ymm5, YMMWORD PTR [rax+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+128]
+ vmovdqu ymm11, YMMWORD PTR [rbx+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r8+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+192]
+ vmovdqu ymm5, YMMWORD PTR [rax+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+192]
+ vmovdqu ymm11, YMMWORD PTR [rbx+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r8+192], ymm0
+ vmovdqu YMMWORD PTR [r8+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+256]
+ vmovdqu ymm3, YMMWORD PTR [rcx+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+256]
+ vmovdqu ymm5, YMMWORD PTR [rax+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+256]
+ vmovdqu ymm11, YMMWORD PTR [rbx+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r8+256], ymm0
+ vmovdqu YMMWORD PTR [r8+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+320]
+ vmovdqu ymm5, YMMWORD PTR [rax+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+320]
+ vmovdqu ymm11, YMMWORD PTR [rbx+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r8+320], ymm0
+ vmovdqu YMMWORD PTR [r8+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+384]
+ vmovdqu ymm3, YMMWORD PTR [rcx+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+384]
+ vmovdqu ymm5, YMMWORD PTR [rax+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+384]
+ vmovdqu ymm11, YMMWORD PTR [rbx+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r8+384], ymm0
+ vmovdqu YMMWORD PTR [r8+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+448]
+ vmovdqu ymm5, YMMWORD PTR [rax+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+448]
+ vmovdqu ymm11, YMMWORD PTR [rbx+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [r8+448], ymm0
+ vmovdqu YMMWORD PTR [r8+480], ymm1
+ add rcx, 512
+ add rax, 512
+ sub r15, 2
+ jz L_pointwise_acc_mont_end_encap_v
+L_pointwise_acc_mont_start_encap_v:
+ ; Base mul mont add
+ mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [rcx]
+ vmovdqu ymm3, YMMWORD PTR [rcx+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax]
+ vmovdqu ymm5, YMMWORD PTR [rax+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx]
+ vmovdqu ymm11, YMMWORD PTR [rbx+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8]
+ vmovdqu ymm7, YMMWORD PTR [r8+32]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+64]
+ vmovdqu ymm5, YMMWORD PTR [rax+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+64]
+ vmovdqu ymm11, YMMWORD PTR [rbx+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+64]
+ vmovdqu ymm7, YMMWORD PTR [r8+96]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+128]
+ vmovdqu ymm3, YMMWORD PTR [rcx+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+128]
+ vmovdqu ymm5, YMMWORD PTR [rax+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+128]
+ vmovdqu ymm11, YMMWORD PTR [rbx+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+128]
+ vmovdqu ymm7, YMMWORD PTR [r8+160]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r8+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+192]
+ vmovdqu ymm5, YMMWORD PTR [rax+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+192]
+ vmovdqu ymm11, YMMWORD PTR [rbx+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+192]
+ vmovdqu ymm7, YMMWORD PTR [r8+224]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r8+192], ymm0
+ vmovdqu YMMWORD PTR [r8+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+256]
+ vmovdqu ymm3, YMMWORD PTR [rcx+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+256]
+ vmovdqu ymm5, YMMWORD PTR [rax+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+256]
+ vmovdqu ymm11, YMMWORD PTR [rbx+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+256]
+ vmovdqu ymm7, YMMWORD PTR [r8+288]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r8+256], ymm0
+ vmovdqu YMMWORD PTR [r8+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+320]
+ vmovdqu ymm5, YMMWORD PTR [rax+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+320]
+ vmovdqu ymm11, YMMWORD PTR [rbx+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+320]
+ vmovdqu ymm7, YMMWORD PTR [r8+352]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r8+320], ymm0
+ vmovdqu YMMWORD PTR [r8+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+384]
+ vmovdqu ymm3, YMMWORD PTR [rcx+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+384]
+ vmovdqu ymm5, YMMWORD PTR [rax+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+384]
+ vmovdqu ymm11, YMMWORD PTR [rbx+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+384]
+ vmovdqu ymm7, YMMWORD PTR [r8+416]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r8+384], ymm0
+ vmovdqu YMMWORD PTR [r8+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+448]
+ vmovdqu ymm5, YMMWORD PTR [rax+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+448]
+ vmovdqu ymm11, YMMWORD PTR [rbx+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+448]
+ vmovdqu ymm7, YMMWORD PTR [r8+480]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [r8+448], ymm0
+ vmovdqu YMMWORD PTR [r8+480], ymm1
+ add rcx, 512
+ add rax, 512
+ sub r15, 1
+ jg L_pointwise_acc_mont_start_encap_v
+L_pointwise_acc_mont_end_encap_v:
+ ; Base mul mont add
+ mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [rcx]
+ vmovdqu ymm3, YMMWORD PTR [rcx+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax]
+ vmovdqu ymm5, YMMWORD PTR [rax+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx]
+ vmovdqu ymm11, YMMWORD PTR [rbx+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8]
+ vmovdqu ymm7, YMMWORD PTR [r8+32]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+64]
+ vmovdqu ymm5, YMMWORD PTR [rax+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+64]
+ vmovdqu ymm11, YMMWORD PTR [rbx+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+64]
+ vmovdqu ymm7, YMMWORD PTR [r8+96]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r8+64], ymm0
+ vmovdqu YMMWORD PTR [r8+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+128]
+ vmovdqu ymm3, YMMWORD PTR [rcx+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+128]
+ vmovdqu ymm5, YMMWORD PTR [rax+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+128]
+ vmovdqu ymm11, YMMWORD PTR [rbx+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+128]
+ vmovdqu ymm7, YMMWORD PTR [r8+160]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r8+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+192]
+ vmovdqu ymm5, YMMWORD PTR [rax+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+192]
+ vmovdqu ymm11, YMMWORD PTR [rbx+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+192]
+ vmovdqu ymm7, YMMWORD PTR [r8+224]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r8+192], ymm0
+ vmovdqu YMMWORD PTR [r8+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+256]
+ vmovdqu ymm3, YMMWORD PTR [rcx+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+256]
+ vmovdqu ymm5, YMMWORD PTR [rax+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+256]
+ vmovdqu ymm11, YMMWORD PTR [rbx+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+256]
+ vmovdqu ymm7, YMMWORD PTR [r8+288]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r8+256], ymm0
+ vmovdqu YMMWORD PTR [r8+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+320]
+ vmovdqu ymm5, YMMWORD PTR [rax+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+320]
+ vmovdqu ymm11, YMMWORD PTR [rbx+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+320]
+ vmovdqu ymm7, YMMWORD PTR [r8+352]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r8+320], ymm0
+ vmovdqu YMMWORD PTR [r8+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+384]
+ vmovdqu ymm3, YMMWORD PTR [rcx+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+384]
+ vmovdqu ymm5, YMMWORD PTR [rax+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+384]
+ vmovdqu ymm11, YMMWORD PTR [rbx+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+384]
+ vmovdqu ymm7, YMMWORD PTR [r8+416]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r8+384], ymm0
+ vmovdqu YMMWORD PTR [r8+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [rax+448]
+ vmovdqu ymm5, YMMWORD PTR [rax+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [rbx+448]
+ vmovdqu ymm11, YMMWORD PTR [rbx+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [r8+448]
+ vmovdqu ymm7, YMMWORD PTR [r8+480]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [r8+448], ymm0
+ vmovdqu YMMWORD PTR [r8+480], ymm1
+ add rcx, 512
+ ; invntt
+ mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_inv]
+ vmovdqu ymm0, YMMWORD PTR [r8]
+ vmovdqu ymm1, YMMWORD PTR [r8+32]
+ vmovdqu ymm2, YMMWORD PTR [r8+64]
+ vmovdqu ymm3, YMMWORD PTR [r8+96]
+ vmovdqu ymm4, YMMWORD PTR [r8+128]
+ vmovdqu ymm5, YMMWORD PTR [r8+160]
+ vmovdqu ymm6, YMMWORD PTR [r8+192]
+ vmovdqu ymm7, YMMWORD PTR [r8+224]
+ ; 2: 1/2
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx]
+ vperm2i128 ymm9, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+32]
+ vpsllq ymm0, ymm9, 32
+ vpsrlq ymm1, ymm8, 32
+ vpblendd ymm0, ymm8, ymm0, 170
+ vpblendd ymm1, ymm9, ymm1, 85
+ vperm2i128 ymm8, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+64]
+ vperm2i128 ymm9, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+96]
+ vpsllq ymm2, ymm9, 32
+ vpsrlq ymm3, ymm8, 32
+ vpblendd ymm2, ymm8, ymm2, 170
+ vpblendd ymm3, ymm9, ymm3, 85
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 4: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+128]
+ vmovdqu ymm12, YMMWORD PTR [rbx+160]
+ vmovdqu ymm11, YMMWORD PTR [rbx+192]
+ vmovdqu ymm13, YMMWORD PTR [rbx+224]
+ vpunpckldq ymm0, ymm8, ymm1
+ vpunpckhdq ymm1, ymm8, ymm1
+ vpunpckldq ymm2, ymm9, ymm3
+ vpunpckhdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 8: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+256]
+ vmovdqu ymm12, YMMWORD PTR [rbx+288]
+ vmovdqu ymm11, YMMWORD PTR [rbx+320]
+ vmovdqu ymm13, YMMWORD PTR [rbx+352]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 16: 1/2
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+384]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+416]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+448]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+480]
+ vpsubw ymm8, ymm0, ymm1
+ vpsubw ymm9, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm1
+ vpaddw ymm2, ymm2, ymm3
+ vpmullw ymm1, ymm8, ymm12
+ vpmullw ymm3, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm1, ymm1, ymm14
+ vpmulhw ymm3, ymm3, ymm14
+ vpsubw ymm1, ymm8, ymm1
+ vpsubw ymm3, ymm9, ymm3
+ ; 32: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+512]
+ vmovdqu ymm12, YMMWORD PTR [rbx+544]
+ vpaddw ymm8, ymm0, ymm2
+ vpaddw ymm9, ymm1, ymm3
+ vpsubw ymm2, ymm0, ymm2
+ vpsubw ymm3, ymm1, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm1, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ ; 2: 1/2
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+576]
+ vperm2i128 ymm9, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+608]
+ vpsllq ymm4, ymm9, 32
+ vpsrlq ymm5, ymm8, 32
+ vpblendd ymm4, ymm8, ymm4, 170
+ vpblendd ymm5, ymm9, ymm5, 85
+ vperm2i128 ymm8, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+640]
+ vperm2i128 ymm9, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+672]
+ vpsllq ymm6, ymm9, 32
+ vpsrlq ymm7, ymm8, 32
+ vpblendd ymm6, ymm8, ymm6, 170
+ vpblendd ymm7, ymm9, ymm7, 85
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 4: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+704]
+ vmovdqu ymm12, YMMWORD PTR [rbx+736]
+ vmovdqu ymm11, YMMWORD PTR [rbx+768]
+ vmovdqu ymm13, YMMWORD PTR [rbx+800]
+ vpunpckldq ymm4, ymm8, ymm5
+ vpunpckhdq ymm5, ymm8, ymm5
+ vpunpckldq ymm6, ymm9, ymm7
+ vpunpckhdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 8: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+832]
+ vmovdqu ymm12, YMMWORD PTR [rbx+864]
+ vmovdqu ymm11, YMMWORD PTR [rbx+896]
+ vmovdqu ymm13, YMMWORD PTR [rbx+928]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 16: 1/2
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+960]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+992]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+1024]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+1056]
+ vpsubw ymm8, ymm4, ymm5
+ vpsubw ymm9, ymm6, ymm7
+ vpaddw ymm4, ymm4, ymm5
+ vpaddw ymm6, ymm6, ymm7
+ vpmullw ymm5, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm5, ymm5, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm5, ymm8, ymm5
+ vpsubw ymm7, ymm9, ymm7
+ ; 32: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1088]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1120]
+ vpaddw ymm8, ymm4, ymm6
+ vpaddw ymm9, ymm5, ymm7
+ vpsubw ymm6, ymm4, ymm6
+ vpsubw ymm7, ymm5, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm5, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm5, ymm5, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ ; 64: 1/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1152]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1184]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpsubw ymm8, ymm2, ymm6
+ vpsubw ymm9, ymm3, ymm7
+ vpaddw ymm2, ymm2, ymm6
+ vpaddw ymm3, ymm3, ymm7
+ vpmullw ymm6, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm6, ymm8, ymm6
+ vpsubw ymm7, ymm9, ymm7
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm4
+ vmovdqu YMMWORD PTR [r8+160], ymm5
+ vmovdqu YMMWORD PTR [r8+192], ymm6
+ vmovdqu YMMWORD PTR [r8+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [r8+256]
+ vmovdqu ymm1, YMMWORD PTR [r8+288]
+ vmovdqu ymm2, YMMWORD PTR [r8+320]
+ vmovdqu ymm3, YMMWORD PTR [r8+352]
+ vmovdqu ymm4, YMMWORD PTR [r8+384]
+ vmovdqu ymm5, YMMWORD PTR [r8+416]
+ vmovdqu ymm6, YMMWORD PTR [r8+448]
+ vmovdqu ymm7, YMMWORD PTR [r8+480]
+ ; 2: 2/2
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+1216]
+ vperm2i128 ymm9, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+1248]
+ vpsllq ymm0, ymm9, 32
+ vpsrlq ymm1, ymm8, 32
+ vpblendd ymm0, ymm8, ymm0, 170
+ vpblendd ymm1, ymm9, ymm1, 85
+ vperm2i128 ymm8, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+1280]
+ vperm2i128 ymm9, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+1312]
+ vpsllq ymm2, ymm9, 32
+ vpsrlq ymm3, ymm8, 32
+ vpblendd ymm2, ymm8, ymm2, 170
+ vpblendd ymm3, ymm9, ymm3, 85
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 4: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1344]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1376]
+ vmovdqu ymm11, YMMWORD PTR [rbx+1408]
+ vmovdqu ymm13, YMMWORD PTR [rbx+1440]
+ vpunpckldq ymm0, ymm8, ymm1
+ vpunpckhdq ymm1, ymm8, ymm1
+ vpunpckldq ymm2, ymm9, ymm3
+ vpunpckhdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 8: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1472]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1504]
+ vmovdqu ymm11, YMMWORD PTR [rbx+1536]
+ vmovdqu ymm13, YMMWORD PTR [rbx+1568]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 16: 2/2
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+1600]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+1632]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+1664]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+1696]
+ vpsubw ymm8, ymm0, ymm1
+ vpsubw ymm9, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm1
+ vpaddw ymm2, ymm2, ymm3
+ vpmullw ymm1, ymm8, ymm12
+ vpmullw ymm3, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm1, ymm1, ymm14
+ vpmulhw ymm3, ymm3, ymm14
+ vpsubw ymm1, ymm8, ymm1
+ vpsubw ymm3, ymm9, ymm3
+ ; 32: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1728]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1760]
+ vpaddw ymm8, ymm0, ymm2
+ vpaddw ymm9, ymm1, ymm3
+ vpsubw ymm2, ymm0, ymm2
+ vpsubw ymm3, ymm1, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm1, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ ; 2: 2/2
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+1792]
+ vperm2i128 ymm9, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+1824]
+ vpsllq ymm4, ymm9, 32
+ vpsrlq ymm5, ymm8, 32
+ vpblendd ymm4, ymm8, ymm4, 170
+ vpblendd ymm5, ymm9, ymm5, 85
+ vperm2i128 ymm8, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+1856]
+ vperm2i128 ymm9, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+1888]
+ vpsllq ymm6, ymm9, 32
+ vpsrlq ymm7, ymm8, 32
+ vpblendd ymm6, ymm8, ymm6, 170
+ vpblendd ymm7, ymm9, ymm7, 85
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 4: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+1920]
+ vmovdqu ymm12, YMMWORD PTR [rbx+1952]
+ vmovdqu ymm11, YMMWORD PTR [rbx+1984]
+ vmovdqu ymm13, YMMWORD PTR [rbx+2016]
+ vpunpckldq ymm4, ymm8, ymm5
+ vpunpckhdq ymm5, ymm8, ymm5
+ vpunpckldq ymm6, ymm9, ymm7
+ vpunpckhdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 8: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+2048]
+ vmovdqu ymm12, YMMWORD PTR [rbx+2080]
+ vmovdqu ymm11, YMMWORD PTR [rbx+2112]
+ vmovdqu ymm13, YMMWORD PTR [rbx+2144]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 16: 2/2
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [rbx+2176]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [rbx+2208]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [rbx+2240]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [rbx+2272]
+ vpsubw ymm8, ymm4, ymm5
+ vpsubw ymm9, ymm6, ymm7
+ vpaddw ymm4, ymm4, ymm5
+ vpaddw ymm6, ymm6, ymm7
+ vpmullw ymm5, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm5, ymm5, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm5, ymm8, ymm5
+ vpsubw ymm7, ymm9, ymm7
+ ; 32: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+2304]
+ vmovdqu ymm12, YMMWORD PTR [rbx+2336]
+ vpaddw ymm8, ymm4, ymm6
+ vpaddw ymm9, ymm5, ymm7
+ vpsubw ymm6, ymm4, ymm6
+ vpsubw ymm7, ymm5, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm5, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm5, ymm5, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ ; 64: 2/2
+ vmovdqu ymm10, YMMWORD PTR [rbx+2368]
+ vmovdqu ymm12, YMMWORD PTR [rbx+2400]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpsubw ymm8, ymm2, ymm6
+ vpsubw ymm9, ymm3, ymm7
+ vpaddw ymm2, ymm2, ymm6
+ vpaddw ymm3, ymm3, ymm7
+ vpmullw ymm6, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm6, ymm8, ymm6
+ vpsubw ymm7, ymm9, ymm7
+ vmovdqu YMMWORD PTR [r8+256], ymm0
+ vmovdqu YMMWORD PTR [r8+288], ymm1
+ vmovdqu YMMWORD PTR [r8+320], ymm2
+ vmovdqu YMMWORD PTR [r8+352], ymm3
+ ; 128
+ vmovdqu ymm10, YMMWORD PTR [rbx+2432]
+ vmovdqu ymm12, YMMWORD PTR [rbx+2464]
+ vmovdqu ymm11, YMMWORD PTR [rbx+2496]
+ vmovdqu ymm13, YMMWORD PTR [rbx+2528]
+ vmovdqu ymm0, YMMWORD PTR [r8+128]
+ vmovdqu ymm1, YMMWORD PTR [r8+160]
+ vmovdqu ymm2, YMMWORD PTR [r8+192]
+ vmovdqu ymm3, YMMWORD PTR [r8+224]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpaddw ymm8, ymm2, ymm6
+ vpaddw ymm9, ymm3, ymm7
+ vpsubw ymm6, ymm2, ymm6
+ vpsubw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm8, ymm15
+ vpmulhw ymm3, ymm9, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm8, ymm2
+ vpsubw ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm0, ymm0, ymm11
+ vpmulhw ymm1, ymm1, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm0, ymm8
+ vpsubw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm2, ymm13
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm2, ymm2, ymm11
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ vpmullw ymm8, ymm4, ymm13
+ vpmullw ymm9, ymm5, ymm13
+ vpmulhw ymm4, ymm4, ymm11
+ vpmulhw ymm5, ymm5, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm4, ymm4, ymm8
+ vpsubw ymm5, ymm5, ymm9
+ vpmullw ymm8, ymm6, ymm13
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm6, ymm6, ymm11
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r8+160], ymm1
+ vmovdqu YMMWORD PTR [r8+192], ymm2
+ vmovdqu YMMWORD PTR [r8+224], ymm3
+ vmovdqu YMMWORD PTR [r8+384], ymm4
+ vmovdqu YMMWORD PTR [r8+416], ymm5
+ vmovdqu YMMWORD PTR [r8+448], ymm6
+ vmovdqu YMMWORD PTR [r8+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [r8]
+ vmovdqu ymm1, YMMWORD PTR [r8+32]
+ vmovdqu ymm2, YMMWORD PTR [r8+64]
+ vmovdqu ymm3, YMMWORD PTR [r8+96]
+ vmovdqu ymm4, YMMWORD PTR [r8+256]
+ vmovdqu ymm5, YMMWORD PTR [r8+288]
+ vmovdqu ymm6, YMMWORD PTR [r8+320]
+ vmovdqu ymm7, YMMWORD PTR [r8+352]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpaddw ymm8, ymm2, ymm6
+ vpaddw ymm9, ymm3, ymm7
+ vpsubw ymm6, ymm2, ymm6
+ vpsubw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm8, ymm15
+ vpmulhw ymm3, ymm9, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm8, ymm2
+ vpsubw ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm0, ymm0, ymm11
+ vpmulhw ymm1, ymm1, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm0, ymm8
+ vpsubw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm2, ymm13
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm2, ymm2, ymm11
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ vpmullw ymm8, ymm4, ymm13
+ vpmullw ymm9, ymm5, ymm13
+ vpmulhw ymm4, ymm4, ymm11
+ vpmulhw ymm5, ymm5, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm4, ymm4, ymm8
+ vpsubw ymm5, ymm5, ymm9
+ vpmullw ymm8, ymm6, ymm13
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm6, ymm6, ymm11
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu YMMWORD PTR [r8+256], ymm4
+ vmovdqu YMMWORD PTR [r8+288], ymm5
+ vmovdqu YMMWORD PTR [r8+320], ymm6
+ vmovdqu YMMWORD PTR [r8+352], ymm7
+ ; Add Errors
+ vmovdqu ymm0, YMMWORD PTR [r12]
+ vmovdqu ymm1, YMMWORD PTR [r12+32]
+ vmovdqu ymm2, YMMWORD PTR [r12+64]
+ vmovdqu ymm3, YMMWORD PTR [r12+96]
+ vmovdqu ymm4, YMMWORD PTR [r11]
+ vmovdqu ymm5, YMMWORD PTR [r11+32]
+ vmovdqu ymm6, YMMWORD PTR [r11+64]
+ vmovdqu ymm7, YMMWORD PTR [r11+96]
+ vpaddw ymm4, ymm4, ymm0
+ vpaddw ymm5, ymm5, ymm1
+ vpaddw ymm6, ymm6, ymm2
+ vpaddw ymm7, ymm7, ymm3
+ vmovdqu ymm0, YMMWORD PTR [r8]
+ vmovdqu ymm1, YMMWORD PTR [r8+32]
+ vmovdqu ymm2, YMMWORD PTR [r8+64]
+ vmovdqu ymm3, YMMWORD PTR [r8+96]
+ vpaddw ymm4, ymm0, ymm4
+ vpaddw ymm5, ymm1, ymm5
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpaddw ymm6, ymm2, ymm6
+ vpaddw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [r8], ymm0
+ vmovdqu YMMWORD PTR [r8+32], ymm1
+ vmovdqu YMMWORD PTR [r8+64], ymm2
+ vmovdqu YMMWORD PTR [r8+96], ymm3
+ vmovdqu ymm0, YMMWORD PTR [r12+128]
+ vmovdqu ymm1, YMMWORD PTR [r12+160]
+ vmovdqu ymm2, YMMWORD PTR [r12+192]
+ vmovdqu ymm3, YMMWORD PTR [r12+224]
+ vmovdqu ymm4, YMMWORD PTR [r11+128]
+ vmovdqu ymm5, YMMWORD PTR [r11+160]
+ vmovdqu ymm6, YMMWORD PTR [r11+192]
+ vmovdqu ymm7, YMMWORD PTR [r11+224]
+ vpaddw ymm4, ymm4, ymm0
+ vpaddw ymm5, ymm5, ymm1
+ vpaddw ymm6, ymm6, ymm2
+ vpaddw ymm7, ymm7, ymm3
+ vmovdqu ymm0, YMMWORD PTR [r8+128]
+ vmovdqu ymm1, YMMWORD PTR [r8+160]
+ vmovdqu ymm2, YMMWORD PTR [r8+192]
+ vmovdqu ymm3, YMMWORD PTR [r8+224]
+ vpaddw ymm4, ymm0, ymm4
+ vpaddw ymm5, ymm1, ymm5
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpaddw ymm6, ymm2, ymm6
+ vpaddw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [r8+128], ymm0
+ vmovdqu YMMWORD PTR [r8+160], ymm1
+ vmovdqu YMMWORD PTR [r8+192], ymm2
+ vmovdqu YMMWORD PTR [r8+224], ymm3
+ vmovdqu ymm0, YMMWORD PTR [r12+256]
+ vmovdqu ymm1, YMMWORD PTR [r12+288]
+ vmovdqu ymm2, YMMWORD PTR [r12+320]
+ vmovdqu ymm3, YMMWORD PTR [r12+352]
+ vmovdqu ymm4, YMMWORD PTR [r11+256]
+ vmovdqu ymm5, YMMWORD PTR [r11+288]
+ vmovdqu ymm6, YMMWORD PTR [r11+320]
+ vmovdqu ymm7, YMMWORD PTR [r11+352]
+ vpaddw ymm4, ymm4, ymm0
+ vpaddw ymm5, ymm5, ymm1
+ vpaddw ymm6, ymm6, ymm2
+ vpaddw ymm7, ymm7, ymm3
+ vmovdqu ymm0, YMMWORD PTR [r8+256]
+ vmovdqu ymm1, YMMWORD PTR [r8+288]
+ vmovdqu ymm2, YMMWORD PTR [r8+320]
+ vmovdqu ymm3, YMMWORD PTR [r8+352]
+ vpaddw ymm4, ymm0, ymm4
+ vpaddw ymm5, ymm1, ymm5
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpaddw ymm6, ymm2, ymm6
+ vpaddw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [r8+256], ymm0
+ vmovdqu YMMWORD PTR [r8+288], ymm1
+ vmovdqu YMMWORD PTR [r8+320], ymm2
+ vmovdqu YMMWORD PTR [r8+352], ymm3
+ vmovdqu ymm0, YMMWORD PTR [r12+384]
+ vmovdqu ymm1, YMMWORD PTR [r12+416]
+ vmovdqu ymm2, YMMWORD PTR [r12+448]
+ vmovdqu ymm3, YMMWORD PTR [r12+480]
+ vmovdqu ymm4, YMMWORD PTR [r11+384]
+ vmovdqu ymm5, YMMWORD PTR [r11+416]
+ vmovdqu ymm6, YMMWORD PTR [r11+448]
+ vmovdqu ymm7, YMMWORD PTR [r11+480]
+ vpaddw ymm4, ymm4, ymm0
+ vpaddw ymm5, ymm5, ymm1
+ vpaddw ymm6, ymm6, ymm2
+ vpaddw ymm7, ymm7, ymm3
+ vmovdqu ymm0, YMMWORD PTR [r8+384]
+ vmovdqu ymm1, YMMWORD PTR [r8+416]
+ vmovdqu ymm2, YMMWORD PTR [r8+448]
+ vmovdqu ymm3, YMMWORD PTR [r8+480]
+ vpaddw ymm4, ymm0, ymm4
+ vpaddw ymm5, ymm1, ymm5
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpaddw ymm6, ymm2, ymm6
+ vpaddw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [r8+384], ymm0
+ vmovdqu YMMWORD PTR [r8+416], ymm1
+ vmovdqu YMMWORD PTR [r8+448], ymm2
+ vmovdqu YMMWORD PTR [r8+480], ymm3
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+48]
+ vmovdqu xmm7, OWORD PTR [rsp+64]
+ vmovdqu xmm8, OWORD PTR [rsp+80]
+ vmovdqu xmm9, OWORD PTR [rsp+96]
+ vmovdqu xmm10, OWORD PTR [rsp+112]
+ vmovdqu xmm11, OWORD PTR [rsp+128]
+ vmovdqu xmm12, OWORD PTR [rsp+144]
+ vmovdqu xmm13, OWORD PTR [rsp+160]
+ vmovdqu xmm14, OWORD PTR [rsp+176]
+ vmovdqu xmm15, OWORD PTR [rsp+192]
+ add rsp, 208
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+mlkem_encapsulate_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_decapsulate_avx2 PROC
+ push r12
+ mov rax, QWORD PTR [rsp+48]
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vmovdqu ymm14, YMMWORD PTR mlkem_q
+ vmovdqu ymm15, YMMWORD PTR mlkem_v
+ movsxd r10, eax
+ mov r11, r8
+L_mlkem_decapsulate_avx2_trans:
+ ; ntt
+ mov r12, QWORD PTR [ptr_L_mlkem_avx2_zetas]
+ vmovdqu ymm10, YMMWORD PTR [r12]
+ vmovdqu ymm12, YMMWORD PTR [r12+32]
+ vmovdqu ymm0, YMMWORD PTR [r11+128]
+ vmovdqu ymm1, YMMWORD PTR [r11+160]
+ vmovdqu ymm2, YMMWORD PTR [r11+192]
+ vmovdqu ymm3, YMMWORD PTR [r11+224]
+ vmovdqu ymm4, YMMWORD PTR [r11+384]
+ vmovdqu ymm5, YMMWORD PTR [r11+416]
+ vmovdqu ymm6, YMMWORD PTR [r11+448]
+ vmovdqu ymm7, YMMWORD PTR [r11+480]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ vmovdqu YMMWORD PTR [r11+128], ymm0
+ vmovdqu YMMWORD PTR [r11+160], ymm1
+ vmovdqu YMMWORD PTR [r11+192], ymm2
+ vmovdqu YMMWORD PTR [r11+224], ymm3
+ vmovdqu YMMWORD PTR [r11+384], ymm4
+ vmovdqu YMMWORD PTR [r11+416], ymm5
+ vmovdqu YMMWORD PTR [r11+448], ymm6
+ vmovdqu YMMWORD PTR [r11+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [r11]
+ vmovdqu ymm1, YMMWORD PTR [r11+32]
+ vmovdqu ymm2, YMMWORD PTR [r11+64]
+ vmovdqu ymm3, YMMWORD PTR [r11+96]
+ vmovdqu ymm4, YMMWORD PTR [r11+256]
+ vmovdqu ymm5, YMMWORD PTR [r11+288]
+ vmovdqu ymm6, YMMWORD PTR [r11+320]
+ vmovdqu ymm7, YMMWORD PTR [r11+352]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ vmovdqu YMMWORD PTR [r11+256], ymm4
+ vmovdqu YMMWORD PTR [r11+288], ymm5
+ vmovdqu YMMWORD PTR [r11+320], ymm6
+ vmovdqu YMMWORD PTR [r11+352], ymm7
+ vmovdqu ymm4, YMMWORD PTR [r11+128]
+ vmovdqu ymm5, YMMWORD PTR [r11+160]
+ vmovdqu ymm6, YMMWORD PTR [r11+192]
+ vmovdqu ymm7, YMMWORD PTR [r11+224]
+ ; 64: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r12+64]
+ vmovdqu ymm12, YMMWORD PTR [r12+96]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ ; 32: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r12+128]
+ vmovdqu ymm12, YMMWORD PTR [r12+160]
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm2, ymm0, ymm8
+ vpsubw ymm3, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ ; 32: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r12+192]
+ vmovdqu ymm12, YMMWORD PTR [r12+224]
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm4, ymm8
+ vpsubw ymm7, ymm5, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ ; 16: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r12+256]
+ vmovdqu ymm12, YMMWORD PTR [r12+288]
+ vmovdqu ymm11, YMMWORD PTR [r12+320]
+ vmovdqu ymm13, YMMWORD PTR [r12+352]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 16: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r12+384]
+ vmovdqu ymm12, YMMWORD PTR [r12+416]
+ vmovdqu ymm11, YMMWORD PTR [r12+448]
+ vmovdqu ymm13, YMMWORD PTR [r12+480]
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 8: 0/3
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [r12+512]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+544]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+576]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+608]
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm0, ymm1, ymm0
+ vpsubw ymm2, ymm3, ymm2
+ vpsubw ymm1, ymm8, ymm0
+ vpsubw ymm3, ymm9, ymm2
+ vpaddw ymm8, ymm8, ymm0
+ vpaddw ymm9, ymm9, ymm2
+ ; 4: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r12+640]
+ vmovdqu ymm12, YMMWORD PTR [r12+672]
+ vmovdqu ymm11, YMMWORD PTR [r12+704]
+ vmovdqu ymm13, YMMWORD PTR [r12+736]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 8: 0/3
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [r12+768]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+800]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+832]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+864]
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm4, ymm5, ymm4
+ vpsubw ymm6, ymm7, ymm6
+ vpsubw ymm5, ymm8, ymm4
+ vpsubw ymm7, ymm9, ymm6
+ vpaddw ymm8, ymm8, ymm4
+ vpaddw ymm9, ymm9, ymm6
+ ; 4: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r12+896]
+ vmovdqu ymm12, YMMWORD PTR [r12+928]
+ vmovdqu ymm11, YMMWORD PTR [r12+960]
+ vmovdqu ymm13, YMMWORD PTR [r12+992]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 2: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r12+1024]
+ vmovdqu ymm12, YMMWORD PTR [r12+1056]
+ vmovdqu ymm11, YMMWORD PTR [r12+1088]
+ vmovdqu ymm13, YMMWORD PTR [r12+1120]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 2: 0/3
+ vmovdqu ymm10, YMMWORD PTR [r12+1152]
+ vmovdqu ymm12, YMMWORD PTR [r12+1184]
+ vmovdqu ymm11, YMMWORD PTR [r12+1216]
+ vmovdqu ymm13, YMMWORD PTR [r12+1248]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vpmulhw ymm8, ymm0, ymm15
+ vpmulhw ymm9, ymm1, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm0, ymm8
+ vpsubw ymm9, ymm1, ymm9
+ vmovdqu YMMWORD PTR [r11], ymm8
+ vmovdqu YMMWORD PTR [r11+32], ymm9
+ vpmulhw ymm8, ymm2, ymm15
+ vpmulhw ymm9, ymm3, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vmovdqu YMMWORD PTR [r11+64], ymm8
+ vmovdqu YMMWORD PTR [r11+96], ymm9
+ vpmulhw ymm8, ymm4, ymm15
+ vpmulhw ymm9, ymm5, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vmovdqu YMMWORD PTR [r11+128], ymm8
+ vmovdqu YMMWORD PTR [r11+160], ymm9
+ vpmulhw ymm8, ymm6, ymm15
+ vpmulhw ymm9, ymm7, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vmovdqu YMMWORD PTR [r11+192], ymm8
+ vmovdqu YMMWORD PTR [r11+224], ymm9
+ vmovdqu ymm0, YMMWORD PTR [r11+256]
+ vmovdqu ymm1, YMMWORD PTR [r11+288]
+ vmovdqu ymm2, YMMWORD PTR [r11+320]
+ vmovdqu ymm3, YMMWORD PTR [r11+352]
+ vmovdqu ymm4, YMMWORD PTR [r11+384]
+ vmovdqu ymm5, YMMWORD PTR [r11+416]
+ vmovdqu ymm6, YMMWORD PTR [r11+448]
+ vmovdqu ymm7, YMMWORD PTR [r11+480]
+ ; 64: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r12+1280]
+ vmovdqu ymm12, YMMWORD PTR [r12+1312]
+ vpmullw ymm8, ymm4, ymm12
+ vpmullw ymm9, ymm5, ymm12
+ vpmulhw ymm4, ymm4, ymm10
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vpsubw ymm4, ymm0, ymm8
+ vpsubw ymm5, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm2, ymm8
+ vpsubw ymm7, ymm3, ymm9
+ vpaddw ymm2, ymm2, ymm8
+ vpaddw ymm3, ymm3, ymm9
+ ; 32: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r12+1344]
+ vmovdqu ymm12, YMMWORD PTR [r12+1376]
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm2, ymm0, ymm8
+ vpsubw ymm3, ymm1, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ ; 32: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r12+1408]
+ vmovdqu ymm12, YMMWORD PTR [r12+1440]
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm6, ymm4, ymm8
+ vpsubw ymm7, ymm5, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ ; 16: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r12+1472]
+ vmovdqu ymm12, YMMWORD PTR [r12+1504]
+ vmovdqu ymm11, YMMWORD PTR [r12+1536]
+ vmovdqu ymm13, YMMWORD PTR [r12+1568]
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 16: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r12+1600]
+ vmovdqu ymm12, YMMWORD PTR [r12+1632]
+ vmovdqu ymm11, YMMWORD PTR [r12+1664]
+ vmovdqu ymm13, YMMWORD PTR [r12+1696]
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 8: 1/3
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [r12+1728]
+ vperm2i128 ymm1, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+1760]
+ vperm2i128 ymm9, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+1792]
+ vperm2i128 ymm3, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+1824]
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm0, ymm1, ymm0
+ vpsubw ymm2, ymm3, ymm2
+ vpsubw ymm1, ymm8, ymm0
+ vpsubw ymm3, ymm9, ymm2
+ vpaddw ymm8, ymm8, ymm0
+ vpaddw ymm9, ymm9, ymm2
+ ; 4: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r12+1856]
+ vmovdqu ymm12, YMMWORD PTR [r12+1888]
+ vmovdqu ymm11, YMMWORD PTR [r12+1920]
+ vmovdqu ymm13, YMMWORD PTR [r12+1952]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 8: 1/3
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [r12+1984]
+ vperm2i128 ymm5, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+2016]
+ vperm2i128 ymm9, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+2048]
+ vperm2i128 ymm7, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+2080]
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm4, ymm5, ymm4
+ vpsubw ymm6, ymm7, ymm6
+ vpsubw ymm5, ymm8, ymm4
+ vpsubw ymm7, ymm9, ymm6
+ vpaddw ymm8, ymm8, ymm4
+ vpaddw ymm9, ymm9, ymm6
+ ; 4: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r12+2112]
+ vmovdqu ymm12, YMMWORD PTR [r12+2144]
+ vmovdqu ymm11, YMMWORD PTR [r12+2176]
+ vmovdqu ymm13, YMMWORD PTR [r12+2208]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ ; 2: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r12+2240]
+ vmovdqu ymm12, YMMWORD PTR [r12+2272]
+ vmovdqu ymm11, YMMWORD PTR [r12+2304]
+ vmovdqu ymm13, YMMWORD PTR [r12+2336]
+ vpsllq ymm8, ymm1, 32
+ vpsrlq ymm9, ymm0, 32
+ vpblendd ymm0, ymm0, ymm8, 170
+ vpblendd ymm1, ymm1, ymm9, 85
+ vpsllq ymm8, ymm3, 32
+ vpsrlq ymm9, ymm2, 32
+ vpblendd ymm2, ymm2, ymm8, 170
+ vpblendd ymm3, ymm3, ymm9, 85
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm1, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vpsubw ymm1, ymm0, ymm8
+ vpsubw ymm3, ymm2, ymm9
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm2, ymm2, ymm9
+ ; 2: 1/3
+ vmovdqu ymm10, YMMWORD PTR [r12+2368]
+ vmovdqu ymm12, YMMWORD PTR [r12+2400]
+ vmovdqu ymm11, YMMWORD PTR [r12+2432]
+ vmovdqu ymm13, YMMWORD PTR [r12+2464]
+ vpsllq ymm8, ymm5, 32
+ vpsrlq ymm9, ymm4, 32
+ vpblendd ymm4, ymm4, ymm8, 170
+ vpblendd ymm5, ymm5, ymm9, 85
+ vpsllq ymm8, ymm7, 32
+ vpsrlq ymm9, ymm6, 32
+ vpblendd ymm6, ymm6, ymm8, 170
+ vpblendd ymm7, ymm7, ymm9, 85
+ vpmullw ymm8, ymm5, ymm12
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm5, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vpsubw ymm5, ymm4, ymm8
+ vpsubw ymm7, ymm6, ymm9
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm6, ymm6, ymm9
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckhdq ymm9, ymm0, ymm1
+ vperm2i128 ymm0, ymm8, ymm9, 32
+ vperm2i128 ymm1, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm2, ymm3
+ vpunpckhdq ymm9, ymm2, ymm3
+ vperm2i128 ymm2, ymm8, ymm9, 32
+ vperm2i128 ymm3, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm4, ymm5
+ vpunpckhdq ymm9, ymm4, ymm5
+ vperm2i128 ymm4, ymm8, ymm9, 32
+ vperm2i128 ymm5, ymm8, ymm9, 49
+ vpunpckldq ymm8, ymm6, ymm7
+ vpunpckhdq ymm9, ymm6, ymm7
+ vperm2i128 ymm6, ymm8, ymm9, 32
+ vperm2i128 ymm7, ymm8, ymm9, 49
+ vpmulhw ymm8, ymm0, ymm15
+ vpmulhw ymm9, ymm1, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm0, ymm8
+ vpsubw ymm9, ymm1, ymm9
+ vmovdqu YMMWORD PTR [r11+256], ymm8
+ vmovdqu YMMWORD PTR [r11+288], ymm9
+ vpmulhw ymm8, ymm2, ymm15
+ vpmulhw ymm9, ymm3, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm2, ymm8
+ vpsubw ymm9, ymm3, ymm9
+ vmovdqu YMMWORD PTR [r11+320], ymm8
+ vmovdqu YMMWORD PTR [r11+352], ymm9
+ vpmulhw ymm8, ymm4, ymm15
+ vpmulhw ymm9, ymm5, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm4, ymm8
+ vpsubw ymm9, ymm5, ymm9
+ vmovdqu YMMWORD PTR [r11+384], ymm8
+ vmovdqu YMMWORD PTR [r11+416], ymm9
+ vpmulhw ymm8, ymm6, ymm15
+ vpmulhw ymm9, ymm7, ymm15
+ vpsraw ymm8, ymm8, 10
+ vpsraw ymm9, ymm9, 10
+ vpmullw ymm8, ymm8, ymm14
+ vpmullw ymm9, ymm9, ymm14
+ vpsubw ymm8, ymm6, ymm8
+ vpsubw ymm9, ymm7, ymm9
+ vmovdqu YMMWORD PTR [r11+448], ymm8
+ vmovdqu YMMWORD PTR [r11+480], ymm9
+ add r11, 512
+ sub r10, 1
+ jg L_mlkem_decapsulate_avx2_trans
+ vmovdqu ymm12, YMMWORD PTR mlkem_qinv
+ ; Pointwise acc mont
+ movsxd r10, eax
+ ; Base mul mont
+ mov r12, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [rcx]
+ vmovdqu ymm3, YMMWORD PTR [rcx+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8]
+ vmovdqu ymm5, YMMWORD PTR [r8+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12]
+ vmovdqu ymm11, YMMWORD PTR [r12+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+64]
+ vmovdqu ymm5, YMMWORD PTR [r8+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+64]
+ vmovdqu ymm11, YMMWORD PTR [r12+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+64], ymm0
+ vmovdqu YMMWORD PTR [rdx+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+128]
+ vmovdqu ymm3, YMMWORD PTR [rcx+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+128]
+ vmovdqu ymm5, YMMWORD PTR [r8+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+128]
+ vmovdqu ymm11, YMMWORD PTR [r12+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+128], ymm0
+ vmovdqu YMMWORD PTR [rdx+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+192]
+ vmovdqu ymm5, YMMWORD PTR [r8+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+192]
+ vmovdqu ymm11, YMMWORD PTR [r12+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+192], ymm0
+ vmovdqu YMMWORD PTR [rdx+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+256]
+ vmovdqu ymm3, YMMWORD PTR [rcx+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+256]
+ vmovdqu ymm5, YMMWORD PTR [r8+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+256]
+ vmovdqu ymm11, YMMWORD PTR [r12+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+256], ymm0
+ vmovdqu YMMWORD PTR [rdx+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+320]
+ vmovdqu ymm5, YMMWORD PTR [r8+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+320]
+ vmovdqu ymm11, YMMWORD PTR [r12+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+320], ymm0
+ vmovdqu YMMWORD PTR [rdx+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+384]
+ vmovdqu ymm3, YMMWORD PTR [rcx+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+384]
+ vmovdqu ymm5, YMMWORD PTR [r8+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+384]
+ vmovdqu ymm11, YMMWORD PTR [r12+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+384], ymm0
+ vmovdqu YMMWORD PTR [rdx+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+448]
+ vmovdqu ymm5, YMMWORD PTR [r8+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+448]
+ vmovdqu ymm11, YMMWORD PTR [r12+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu YMMWORD PTR [rdx+448], ymm0
+ vmovdqu YMMWORD PTR [rdx+480], ymm1
+ add rcx, 512
+ add r8, 512
+ sub r10, 2
+ jz L_pointwise_acc_mont_end_decap
+L_pointwise_acc_mont_start_decap:
+ ; Base mul mont add
+ mov r12, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [rcx]
+ vmovdqu ymm3, YMMWORD PTR [rcx+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8]
+ vmovdqu ymm5, YMMWORD PTR [r8+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12]
+ vmovdqu ymm11, YMMWORD PTR [r12+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx]
+ vmovdqu ymm7, YMMWORD PTR [rdx+32]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+64]
+ vmovdqu ymm5, YMMWORD PTR [r8+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+64]
+ vmovdqu ymm11, YMMWORD PTR [r12+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+64]
+ vmovdqu ymm7, YMMWORD PTR [rdx+96]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+64], ymm0
+ vmovdqu YMMWORD PTR [rdx+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+128]
+ vmovdqu ymm3, YMMWORD PTR [rcx+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+128]
+ vmovdqu ymm5, YMMWORD PTR [r8+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+128]
+ vmovdqu ymm11, YMMWORD PTR [r12+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+128]
+ vmovdqu ymm7, YMMWORD PTR [rdx+160]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+128], ymm0
+ vmovdqu YMMWORD PTR [rdx+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+192]
+ vmovdqu ymm5, YMMWORD PTR [r8+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+192]
+ vmovdqu ymm11, YMMWORD PTR [r12+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+192]
+ vmovdqu ymm7, YMMWORD PTR [rdx+224]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+192], ymm0
+ vmovdqu YMMWORD PTR [rdx+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+256]
+ vmovdqu ymm3, YMMWORD PTR [rcx+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+256]
+ vmovdqu ymm5, YMMWORD PTR [r8+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+256]
+ vmovdqu ymm11, YMMWORD PTR [r12+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+256]
+ vmovdqu ymm7, YMMWORD PTR [rdx+288]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+256], ymm0
+ vmovdqu YMMWORD PTR [rdx+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+320]
+ vmovdqu ymm5, YMMWORD PTR [r8+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+320]
+ vmovdqu ymm11, YMMWORD PTR [r12+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+320]
+ vmovdqu ymm7, YMMWORD PTR [rdx+352]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+320], ymm0
+ vmovdqu YMMWORD PTR [rdx+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+384]
+ vmovdqu ymm3, YMMWORD PTR [rcx+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+384]
+ vmovdqu ymm5, YMMWORD PTR [r8+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+384]
+ vmovdqu ymm11, YMMWORD PTR [r12+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+384]
+ vmovdqu ymm7, YMMWORD PTR [rdx+416]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+384], ymm0
+ vmovdqu YMMWORD PTR [rdx+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+448]
+ vmovdqu ymm5, YMMWORD PTR [r8+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+448]
+ vmovdqu ymm11, YMMWORD PTR [r12+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+448]
+ vmovdqu ymm7, YMMWORD PTR [rdx+480]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vmovdqu YMMWORD PTR [rdx+448], ymm0
+ vmovdqu YMMWORD PTR [rdx+480], ymm1
+ add rcx, 512
+ add r8, 512
+ sub r10, 1
+ jg L_pointwise_acc_mont_start_decap
+L_pointwise_acc_mont_end_decap:
+ ; Base mul mont add
+ mov r12, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul]
+ vmovdqu ymm2, YMMWORD PTR [rcx]
+ vmovdqu ymm3, YMMWORD PTR [rcx+32]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8]
+ vmovdqu ymm5, YMMWORD PTR [r8+32]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12]
+ vmovdqu ymm11, YMMWORD PTR [r12+32]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx]
+ vmovdqu ymm7, YMMWORD PTR [rdx+32]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+64]
+ vmovdqu ymm5, YMMWORD PTR [r8+96]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+64]
+ vmovdqu ymm11, YMMWORD PTR [r12+96]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+64]
+ vmovdqu ymm7, YMMWORD PTR [rdx+96]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+64], ymm0
+ vmovdqu YMMWORD PTR [rdx+96], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+128]
+ vmovdqu ymm3, YMMWORD PTR [rcx+160]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+128]
+ vmovdqu ymm5, YMMWORD PTR [r8+160]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+128]
+ vmovdqu ymm11, YMMWORD PTR [r12+160]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+128]
+ vmovdqu ymm7, YMMWORD PTR [rdx+160]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+128], ymm0
+ vmovdqu YMMWORD PTR [rdx+160], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+192]
+ vmovdqu ymm3, YMMWORD PTR [rcx+224]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+192]
+ vmovdqu ymm5, YMMWORD PTR [r8+224]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+192]
+ vmovdqu ymm11, YMMWORD PTR [r12+224]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+192]
+ vmovdqu ymm7, YMMWORD PTR [rdx+224]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+192], ymm0
+ vmovdqu YMMWORD PTR [rdx+224], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+256]
+ vmovdqu ymm3, YMMWORD PTR [rcx+288]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+256]
+ vmovdqu ymm5, YMMWORD PTR [r8+288]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+256]
+ vmovdqu ymm11, YMMWORD PTR [r12+288]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+256]
+ vmovdqu ymm7, YMMWORD PTR [rdx+288]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+256], ymm0
+ vmovdqu YMMWORD PTR [rdx+288], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+320]
+ vmovdqu ymm5, YMMWORD PTR [r8+352]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+320]
+ vmovdqu ymm11, YMMWORD PTR [r12+352]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+320]
+ vmovdqu ymm7, YMMWORD PTR [rdx+352]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+320], ymm0
+ vmovdqu YMMWORD PTR [rdx+352], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+384]
+ vmovdqu ymm3, YMMWORD PTR [rcx+416]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+384]
+ vmovdqu ymm5, YMMWORD PTR [r8+416]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+384]
+ vmovdqu ymm11, YMMWORD PTR [r12+416]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+384]
+ vmovdqu ymm7, YMMWORD PTR [rdx+416]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+384], ymm0
+ vmovdqu YMMWORD PTR [rdx+416], ymm1
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vpslld ymm6, ymm3, 16
+ vpsrld ymm7, ymm2, 16
+ vpblendw ymm2, ymm2, ymm6, 170
+ vpblendw ymm3, ymm3, ymm7, 85
+ vmovdqu ymm4, YMMWORD PTR [r8+448]
+ vmovdqu ymm5, YMMWORD PTR [r8+480]
+ vpslld ymm6, ymm5, 16
+ vpsrld ymm7, ymm4, 16
+ vpblendw ymm4, ymm4, ymm6, 170
+ vpblendw ymm5, ymm5, ymm7, 85
+ vmovdqu ymm10, YMMWORD PTR [r12+448]
+ vmovdqu ymm11, YMMWORD PTR [r12+480]
+ vpmullw ymm0, ymm3, ymm5
+ vpmulhw ymm6, ymm3, ymm5
+ vpmullw ymm1, ymm2, ymm4
+ vpmulhw ymm7, ymm2, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm0, ymm12
+ vpmullw ymm9, ymm1, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm6, ymm8
+ vpsubw ymm1, ymm7, ymm9
+ vpmullw ymm6, ymm0, ymm11
+ vpmulhw ymm7, ymm0, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm0, ymm7, ymm6
+ vpaddw ymm0, ymm0, ymm1
+ vpmullw ymm1, ymm2, ymm5
+ vpmulhw ymm6, ymm2, ymm5
+ vpmullw ymm2, ymm3, ymm4
+ vpmulhw ymm7, ymm3, ymm4
+ ; Mont Reduce
+ vpmullw ymm8, ymm1, ymm12
+ vpmullw ymm9, ymm2, ymm12
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm1, ymm6, ymm8
+ vpsubw ymm2, ymm7, ymm9
+ vpaddw ymm1, ymm1, ymm2
+ vmovdqu ymm6, YMMWORD PTR [rdx+448]
+ vmovdqu ymm7, YMMWORD PTR [rdx+480]
+ vpaddw ymm0, ymm0, ymm6
+ vpaddw ymm1, ymm1, ymm7
+ vpslld ymm6, ymm1, 16
+ vpsrld ymm7, ymm0, 16
+ vpblendw ymm0, ymm0, ymm6, 170
+ vpblendw ymm1, ymm1, ymm7, 85
+ vmovdqu YMMWORD PTR [rdx+448], ymm0
+ vmovdqu YMMWORD PTR [rdx+480], ymm1
+ add rcx, 512
+ ; invntt
+ mov r12, QWORD PTR [ptr_L_mlkem_avx2_zetas_inv]
+ vmovdqu ymm0, YMMWORD PTR [rdx]
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vmovdqu ymm2, YMMWORD PTR [rdx+64]
+ vmovdqu ymm3, YMMWORD PTR [rdx+96]
+ vmovdqu ymm4, YMMWORD PTR [rdx+128]
+ vmovdqu ymm5, YMMWORD PTR [rdx+160]
+ vmovdqu ymm6, YMMWORD PTR [rdx+192]
+ vmovdqu ymm7, YMMWORD PTR [rdx+224]
+ ; 2: 1/2
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [r12]
+ vperm2i128 ymm9, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+32]
+ vpsllq ymm0, ymm9, 32
+ vpsrlq ymm1, ymm8, 32
+ vpblendd ymm0, ymm8, ymm0, 170
+ vpblendd ymm1, ymm9, ymm1, 85
+ vperm2i128 ymm8, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+64]
+ vperm2i128 ymm9, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+96]
+ vpsllq ymm2, ymm9, 32
+ vpsrlq ymm3, ymm8, 32
+ vpblendd ymm2, ymm8, ymm2, 170
+ vpblendd ymm3, ymm9, ymm3, 85
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 4: 1/2
+ vmovdqu ymm10, YMMWORD PTR [r12+128]
+ vmovdqu ymm12, YMMWORD PTR [r12+160]
+ vmovdqu ymm11, YMMWORD PTR [r12+192]
+ vmovdqu ymm13, YMMWORD PTR [r12+224]
+ vpunpckldq ymm0, ymm8, ymm1
+ vpunpckhdq ymm1, ymm8, ymm1
+ vpunpckldq ymm2, ymm9, ymm3
+ vpunpckhdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 8: 1/2
+ vmovdqu ymm10, YMMWORD PTR [r12+256]
+ vmovdqu ymm12, YMMWORD PTR [r12+288]
+ vmovdqu ymm11, YMMWORD PTR [r12+320]
+ vmovdqu ymm13, YMMWORD PTR [r12+352]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 16: 1/2
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [r12+384]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+416]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+448]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+480]
+ vpsubw ymm8, ymm0, ymm1
+ vpsubw ymm9, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm1
+ vpaddw ymm2, ymm2, ymm3
+ vpmullw ymm1, ymm8, ymm12
+ vpmullw ymm3, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm1, ymm1, ymm14
+ vpmulhw ymm3, ymm3, ymm14
+ vpsubw ymm1, ymm8, ymm1
+ vpsubw ymm3, ymm9, ymm3
+ ; 32: 1/2
+ vmovdqu ymm10, YMMWORD PTR [r12+512]
+ vmovdqu ymm12, YMMWORD PTR [r12+544]
+ vpaddw ymm8, ymm0, ymm2
+ vpaddw ymm9, ymm1, ymm3
+ vpsubw ymm2, ymm0, ymm2
+ vpsubw ymm3, ymm1, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm1, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ ; 2: 1/2
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [r12+576]
+ vperm2i128 ymm9, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+608]
+ vpsllq ymm4, ymm9, 32
+ vpsrlq ymm5, ymm8, 32
+ vpblendd ymm4, ymm8, ymm4, 170
+ vpblendd ymm5, ymm9, ymm5, 85
+ vperm2i128 ymm8, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+640]
+ vperm2i128 ymm9, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+672]
+ vpsllq ymm6, ymm9, 32
+ vpsrlq ymm7, ymm8, 32
+ vpblendd ymm6, ymm8, ymm6, 170
+ vpblendd ymm7, ymm9, ymm7, 85
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 4: 1/2
+ vmovdqu ymm10, YMMWORD PTR [r12+704]
+ vmovdqu ymm12, YMMWORD PTR [r12+736]
+ vmovdqu ymm11, YMMWORD PTR [r12+768]
+ vmovdqu ymm13, YMMWORD PTR [r12+800]
+ vpunpckldq ymm4, ymm8, ymm5
+ vpunpckhdq ymm5, ymm8, ymm5
+ vpunpckldq ymm6, ymm9, ymm7
+ vpunpckhdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 8: 1/2
+ vmovdqu ymm10, YMMWORD PTR [r12+832]
+ vmovdqu ymm12, YMMWORD PTR [r12+864]
+ vmovdqu ymm11, YMMWORD PTR [r12+896]
+ vmovdqu ymm13, YMMWORD PTR [r12+928]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 16: 1/2
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [r12+960]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+992]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+1024]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+1056]
+ vpsubw ymm8, ymm4, ymm5
+ vpsubw ymm9, ymm6, ymm7
+ vpaddw ymm4, ymm4, ymm5
+ vpaddw ymm6, ymm6, ymm7
+ vpmullw ymm5, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm5, ymm5, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm5, ymm8, ymm5
+ vpsubw ymm7, ymm9, ymm7
+ ; 32: 1/2
+ vmovdqu ymm10, YMMWORD PTR [r12+1088]
+ vmovdqu ymm12, YMMWORD PTR [r12+1120]
+ vpaddw ymm8, ymm4, ymm6
+ vpaddw ymm9, ymm5, ymm7
+ vpsubw ymm6, ymm4, ymm6
+ vpsubw ymm7, ymm5, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm5, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm5, ymm5, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ ; 64: 1/2
+ vmovdqu ymm10, YMMWORD PTR [r12+1152]
+ vmovdqu ymm12, YMMWORD PTR [r12+1184]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpsubw ymm8, ymm2, ymm6
+ vpsubw ymm9, ymm3, ymm7
+ vpaddw ymm2, ymm2, ymm6
+ vpaddw ymm3, ymm3, ymm7
+ vpmullw ymm6, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm6, ymm8, ymm6
+ vpsubw ymm7, ymm9, ymm7
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ vmovdqu YMMWORD PTR [rdx+128], ymm4
+ vmovdqu YMMWORD PTR [rdx+160], ymm5
+ vmovdqu YMMWORD PTR [rdx+192], ymm6
+ vmovdqu YMMWORD PTR [rdx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rdx+256]
+ vmovdqu ymm1, YMMWORD PTR [rdx+288]
+ vmovdqu ymm2, YMMWORD PTR [rdx+320]
+ vmovdqu ymm3, YMMWORD PTR [rdx+352]
+ vmovdqu ymm4, YMMWORD PTR [rdx+384]
+ vmovdqu ymm5, YMMWORD PTR [rdx+416]
+ vmovdqu ymm6, YMMWORD PTR [rdx+448]
+ vmovdqu ymm7, YMMWORD PTR [rdx+480]
+ ; 2: 2/2
+ vperm2i128 ymm8, ymm0, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [r12+1216]
+ vperm2i128 ymm9, ymm0, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+1248]
+ vpsllq ymm0, ymm9, 32
+ vpsrlq ymm1, ymm8, 32
+ vpblendd ymm0, ymm8, ymm0, 170
+ vpblendd ymm1, ymm9, ymm1, 85
+ vperm2i128 ymm8, ymm2, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+1280]
+ vperm2i128 ymm9, ymm2, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+1312]
+ vpsllq ymm2, ymm9, 32
+ vpsrlq ymm3, ymm8, 32
+ vpblendd ymm2, ymm8, ymm2, 170
+ vpblendd ymm3, ymm9, ymm3, 85
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 4: 2/2
+ vmovdqu ymm10, YMMWORD PTR [r12+1344]
+ vmovdqu ymm12, YMMWORD PTR [r12+1376]
+ vmovdqu ymm11, YMMWORD PTR [r12+1408]
+ vmovdqu ymm13, YMMWORD PTR [r12+1440]
+ vpunpckldq ymm0, ymm8, ymm1
+ vpunpckhdq ymm1, ymm8, ymm1
+ vpunpckldq ymm2, ymm9, ymm3
+ vpunpckhdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 8: 2/2
+ vmovdqu ymm10, YMMWORD PTR [r12+1472]
+ vmovdqu ymm12, YMMWORD PTR [r12+1504]
+ vmovdqu ymm11, YMMWORD PTR [r12+1536]
+ vmovdqu ymm13, YMMWORD PTR [r12+1568]
+ vpunpcklqdq ymm0, ymm8, ymm1
+ vpunpckhqdq ymm1, ymm8, ymm1
+ vpunpcklqdq ymm2, ymm9, ymm3
+ vpunpckhqdq ymm3, ymm9, ymm3
+ vpaddw ymm8, ymm0, ymm1
+ vpaddw ymm9, ymm2, ymm3
+ vpsubw ymm1, ymm0, ymm1
+ vpsubw ymm3, ymm2, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm2, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm2, ymm2, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm2, ymm2, ymm14
+ vpsubw ymm8, ymm8, ymm0
+ vpsubw ymm9, ymm9, ymm2
+ vpmullw ymm0, ymm1, ymm12
+ vpmullw ymm2, ymm3, ymm13
+ vpmulhw ymm1, ymm1, ymm10
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm0, ymm0, ymm14
+ vpmulhw ymm2, ymm2, ymm14
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm3, ymm3, ymm2
+ ; 16: 2/2
+ vperm2i128 ymm0, ymm8, ymm1, 32
+ vmovdqu ymm10, YMMWORD PTR [r12+1600]
+ vperm2i128 ymm1, ymm8, ymm1, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+1632]
+ vperm2i128 ymm2, ymm9, ymm3, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+1664]
+ vperm2i128 ymm3, ymm9, ymm3, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+1696]
+ vpsubw ymm8, ymm0, ymm1
+ vpsubw ymm9, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm1
+ vpaddw ymm2, ymm2, ymm3
+ vpmullw ymm1, ymm8, ymm12
+ vpmullw ymm3, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm1, ymm1, ymm14
+ vpmulhw ymm3, ymm3, ymm14
+ vpsubw ymm1, ymm8, ymm1
+ vpsubw ymm3, ymm9, ymm3
+ ; 32: 2/2
+ vmovdqu ymm10, YMMWORD PTR [r12+1728]
+ vmovdqu ymm12, YMMWORD PTR [r12+1760]
+ vpaddw ymm8, ymm0, ymm2
+ vpaddw ymm9, ymm1, ymm3
+ vpsubw ymm2, ymm0, ymm2
+ vpsubw ymm3, ymm1, ymm3
+ vpmulhw ymm0, ymm8, ymm15
+ vpmulhw ymm1, ymm9, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm8, ymm0
+ vpsubw ymm1, ymm9, ymm1
+ vpmullw ymm8, ymm2, ymm12
+ vpmullw ymm9, ymm3, ymm12
+ vpmulhw ymm2, ymm2, ymm10
+ vpmulhw ymm3, ymm3, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ ; 2: 2/2
+ vperm2i128 ymm8, ymm4, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [r12+1792]
+ vperm2i128 ymm9, ymm4, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+1824]
+ vpsllq ymm4, ymm9, 32
+ vpsrlq ymm5, ymm8, 32
+ vpblendd ymm4, ymm8, ymm4, 170
+ vpblendd ymm5, ymm9, ymm5, 85
+ vperm2i128 ymm8, ymm6, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+1856]
+ vperm2i128 ymm9, ymm6, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+1888]
+ vpsllq ymm6, ymm9, 32
+ vpsrlq ymm7, ymm8, 32
+ vpblendd ymm6, ymm8, ymm6, 170
+ vpblendd ymm7, ymm9, ymm7, 85
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 4: 2/2
+ vmovdqu ymm10, YMMWORD PTR [r12+1920]
+ vmovdqu ymm12, YMMWORD PTR [r12+1952]
+ vmovdqu ymm11, YMMWORD PTR [r12+1984]
+ vmovdqu ymm13, YMMWORD PTR [r12+2016]
+ vpunpckldq ymm4, ymm8, ymm5
+ vpunpckhdq ymm5, ymm8, ymm5
+ vpunpckldq ymm6, ymm9, ymm7
+ vpunpckhdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 8: 2/2
+ vmovdqu ymm10, YMMWORD PTR [r12+2048]
+ vmovdqu ymm12, YMMWORD PTR [r12+2080]
+ vmovdqu ymm11, YMMWORD PTR [r12+2112]
+ vmovdqu ymm13, YMMWORD PTR [r12+2144]
+ vpunpcklqdq ymm4, ymm8, ymm5
+ vpunpckhqdq ymm5, ymm8, ymm5
+ vpunpcklqdq ymm6, ymm9, ymm7
+ vpunpckhqdq ymm7, ymm9, ymm7
+ vpaddw ymm8, ymm4, ymm5
+ vpaddw ymm9, ymm6, ymm7
+ vpsubw ymm5, ymm4, ymm5
+ vpsubw ymm7, ymm6, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm6, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm6, ymm6, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm6, ymm6, ymm14
+ vpsubw ymm8, ymm8, ymm4
+ vpsubw ymm9, ymm9, ymm6
+ vpmullw ymm4, ymm5, ymm12
+ vpmullw ymm6, ymm7, ymm13
+ vpmulhw ymm5, ymm5, ymm10
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm6, ymm6, ymm14
+ vpsubw ymm5, ymm5, ymm4
+ vpsubw ymm7, ymm7, ymm6
+ ; 16: 2/2
+ vperm2i128 ymm4, ymm8, ymm5, 32
+ vmovdqu ymm10, YMMWORD PTR [r12+2176]
+ vperm2i128 ymm5, ymm8, ymm5, 49
+ vmovdqu ymm12, YMMWORD PTR [r12+2208]
+ vperm2i128 ymm6, ymm9, ymm7, 32
+ vmovdqu ymm11, YMMWORD PTR [r12+2240]
+ vperm2i128 ymm7, ymm9, ymm7, 49
+ vmovdqu ymm13, YMMWORD PTR [r12+2272]
+ vpsubw ymm8, ymm4, ymm5
+ vpsubw ymm9, ymm6, ymm7
+ vpaddw ymm4, ymm4, ymm5
+ vpaddw ymm6, ymm6, ymm7
+ vpmullw ymm5, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm13
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm11
+ vpmulhw ymm5, ymm5, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm5, ymm8, ymm5
+ vpsubw ymm7, ymm9, ymm7
+ ; 32: 2/2
+ vmovdqu ymm10, YMMWORD PTR [r12+2304]
+ vmovdqu ymm12, YMMWORD PTR [r12+2336]
+ vpaddw ymm8, ymm4, ymm6
+ vpaddw ymm9, ymm5, ymm7
+ vpsubw ymm6, ymm4, ymm6
+ vpsubw ymm7, ymm5, ymm7
+ vpmulhw ymm4, ymm8, ymm15
+ vpmulhw ymm5, ymm9, ymm15
+ vpsraw ymm4, ymm4, 10
+ vpsraw ymm5, ymm5, 10
+ vpmullw ymm4, ymm4, ymm14
+ vpmullw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ ; 64: 2/2
+ vmovdqu ymm10, YMMWORD PTR [r12+2368]
+ vmovdqu ymm12, YMMWORD PTR [r12+2400]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpsubw ymm8, ymm2, ymm6
+ vpsubw ymm9, ymm3, ymm7
+ vpaddw ymm2, ymm2, ymm6
+ vpaddw ymm3, ymm3, ymm7
+ vpmullw ymm6, ymm8, ymm12
+ vpmullw ymm7, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm6, ymm6, ymm14
+ vpmulhw ymm7, ymm7, ymm14
+ vpsubw ymm6, ymm8, ymm6
+ vpsubw ymm7, ymm9, ymm7
+ vmovdqu YMMWORD PTR [rdx+256], ymm0
+ vmovdqu YMMWORD PTR [rdx+288], ymm1
+ vmovdqu YMMWORD PTR [rdx+320], ymm2
+ vmovdqu YMMWORD PTR [rdx+352], ymm3
+ ; 128
+ vmovdqu ymm10, YMMWORD PTR [r12+2432]
+ vmovdqu ymm12, YMMWORD PTR [r12+2464]
+ vmovdqu ymm11, YMMWORD PTR [r12+2496]
+ vmovdqu ymm13, YMMWORD PTR [r12+2528]
+ vmovdqu ymm0, YMMWORD PTR [rdx+128]
+ vmovdqu ymm1, YMMWORD PTR [rdx+160]
+ vmovdqu ymm2, YMMWORD PTR [rdx+192]
+ vmovdqu ymm3, YMMWORD PTR [rdx+224]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpaddw ymm8, ymm2, ymm6
+ vpaddw ymm9, ymm3, ymm7
+ vpsubw ymm6, ymm2, ymm6
+ vpsubw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm8, ymm15
+ vpmulhw ymm3, ymm9, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm8, ymm2
+ vpsubw ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm0, ymm0, ymm11
+ vpmulhw ymm1, ymm1, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm0, ymm8
+ vpsubw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm2, ymm13
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm2, ymm2, ymm11
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ vpmullw ymm8, ymm4, ymm13
+ vpmullw ymm9, ymm5, ymm13
+ vpmulhw ymm4, ymm4, ymm11
+ vpmulhw ymm5, ymm5, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm4, ymm4, ymm8
+ vpsubw ymm5, ymm5, ymm9
+ vpmullw ymm8, ymm6, ymm13
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm6, ymm6, ymm11
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vmovdqu YMMWORD PTR [rdx+128], ymm0
+ vmovdqu YMMWORD PTR [rdx+160], ymm1
+ vmovdqu YMMWORD PTR [rdx+192], ymm2
+ vmovdqu YMMWORD PTR [rdx+224], ymm3
+ vmovdqu YMMWORD PTR [rdx+384], ymm4
+ vmovdqu YMMWORD PTR [rdx+416], ymm5
+ vmovdqu YMMWORD PTR [rdx+448], ymm6
+ vmovdqu YMMWORD PTR [rdx+480], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rdx]
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vmovdqu ymm2, YMMWORD PTR [rdx+64]
+ vmovdqu ymm3, YMMWORD PTR [rdx+96]
+ vmovdqu ymm4, YMMWORD PTR [rdx+256]
+ vmovdqu ymm5, YMMWORD PTR [rdx+288]
+ vmovdqu ymm6, YMMWORD PTR [rdx+320]
+ vmovdqu ymm7, YMMWORD PTR [rdx+352]
+ vpsubw ymm8, ymm0, ymm4
+ vpsubw ymm9, ymm1, ymm5
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+ vpmullw ymm4, ymm8, ymm12
+ vpmullw ymm5, ymm9, ymm12
+ vpmulhw ymm8, ymm8, ymm10
+ vpmulhw ymm9, ymm9, ymm10
+ vpmulhw ymm4, ymm4, ymm14
+ vpmulhw ymm5, ymm5, ymm14
+ vpsubw ymm4, ymm8, ymm4
+ vpsubw ymm5, ymm9, ymm5
+ vpaddw ymm8, ymm2, ymm6
+ vpaddw ymm9, ymm3, ymm7
+ vpsubw ymm6, ymm2, ymm6
+ vpsubw ymm7, ymm3, ymm7
+ vpmulhw ymm2, ymm8, ymm15
+ vpmulhw ymm3, ymm9, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm8, ymm2
+ vpsubw ymm3, ymm9, ymm3
+ vpmullw ymm8, ymm6, ymm12
+ vpmullw ymm9, ymm7, ymm12
+ vpmulhw ymm6, ymm6, ymm10
+ vpmulhw ymm7, ymm7, ymm10
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vpmullw ymm8, ymm0, ymm13
+ vpmullw ymm9, ymm1, ymm13
+ vpmulhw ymm0, ymm0, ymm11
+ vpmulhw ymm1, ymm1, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm0, ymm0, ymm8
+ vpsubw ymm1, ymm1, ymm9
+ vpmullw ymm8, ymm2, ymm13
+ vpmullw ymm9, ymm3, ymm13
+ vpmulhw ymm2, ymm2, ymm11
+ vpmulhw ymm3, ymm3, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm2, ymm2, ymm8
+ vpsubw ymm3, ymm3, ymm9
+ vpmullw ymm8, ymm4, ymm13
+ vpmullw ymm9, ymm5, ymm13
+ vpmulhw ymm4, ymm4, ymm11
+ vpmulhw ymm5, ymm5, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm4, ymm4, ymm8
+ vpsubw ymm5, ymm5, ymm9
+ vpmullw ymm8, ymm6, ymm13
+ vpmullw ymm9, ymm7, ymm13
+ vpmulhw ymm6, ymm6, ymm11
+ vpmulhw ymm7, ymm7, ymm11
+ vpmulhw ymm8, ymm8, ymm14
+ vpmulhw ymm9, ymm9, ymm14
+ vpsubw ymm6, ymm6, ymm8
+ vpsubw ymm7, ymm7, ymm9
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ vmovdqu YMMWORD PTR [rdx+256], ymm4
+ vmovdqu YMMWORD PTR [rdx+288], ymm5
+ vmovdqu YMMWORD PTR [rdx+320], ymm6
+ vmovdqu YMMWORD PTR [rdx+352], ymm7
+ ; Sub Errors
+ vmovdqu ymm0, YMMWORD PTR [rdx]
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vmovdqu ymm2, YMMWORD PTR [rdx+64]
+ vmovdqu ymm3, YMMWORD PTR [rdx+96]
+ vmovdqu ymm4, YMMWORD PTR [r9]
+ vmovdqu ymm5, YMMWORD PTR [r9+32]
+ vmovdqu ymm6, YMMWORD PTR [r9+64]
+ vmovdqu ymm7, YMMWORD PTR [r9+96]
+ vpsubw ymm4, ymm4, ymm0
+ vpsubw ymm5, ymm5, ymm1
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpsubw ymm6, ymm6, ymm2
+ vpsubw ymm7, ymm7, ymm3
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rdx+128]
+ vmovdqu ymm1, YMMWORD PTR [rdx+160]
+ vmovdqu ymm2, YMMWORD PTR [rdx+192]
+ vmovdqu ymm3, YMMWORD PTR [rdx+224]
+ vmovdqu ymm4, YMMWORD PTR [r9+128]
+ vmovdqu ymm5, YMMWORD PTR [r9+160]
+ vmovdqu ymm6, YMMWORD PTR [r9+192]
+ vmovdqu ymm7, YMMWORD PTR [r9+224]
+ vpsubw ymm4, ymm4, ymm0
+ vpsubw ymm5, ymm5, ymm1
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpsubw ymm6, ymm6, ymm2
+ vpsubw ymm7, ymm7, ymm3
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [rdx+128], ymm0
+ vmovdqu YMMWORD PTR [rdx+160], ymm1
+ vmovdqu YMMWORD PTR [rdx+192], ymm2
+ vmovdqu YMMWORD PTR [rdx+224], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rdx+256]
+ vmovdqu ymm1, YMMWORD PTR [rdx+288]
+ vmovdqu ymm2, YMMWORD PTR [rdx+320]
+ vmovdqu ymm3, YMMWORD PTR [rdx+352]
+ vmovdqu ymm4, YMMWORD PTR [r9+256]
+ vmovdqu ymm5, YMMWORD PTR [r9+288]
+ vmovdqu ymm6, YMMWORD PTR [r9+320]
+ vmovdqu ymm7, YMMWORD PTR [r9+352]
+ vpsubw ymm4, ymm4, ymm0
+ vpsubw ymm5, ymm5, ymm1
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpsubw ymm6, ymm6, ymm2
+ vpsubw ymm7, ymm7, ymm3
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [rdx+256], ymm0
+ vmovdqu YMMWORD PTR [rdx+288], ymm1
+ vmovdqu YMMWORD PTR [rdx+320], ymm2
+ vmovdqu YMMWORD PTR [rdx+352], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rdx+384]
+ vmovdqu ymm1, YMMWORD PTR [rdx+416]
+ vmovdqu ymm2, YMMWORD PTR [rdx+448]
+ vmovdqu ymm3, YMMWORD PTR [rdx+480]
+ vmovdqu ymm4, YMMWORD PTR [r9+384]
+ vmovdqu ymm5, YMMWORD PTR [r9+416]
+ vmovdqu ymm6, YMMWORD PTR [r9+448]
+ vmovdqu ymm7, YMMWORD PTR [r9+480]
+ vpsubw ymm4, ymm4, ymm0
+ vpsubw ymm5, ymm5, ymm1
+ vpmulhw ymm0, ymm4, ymm15
+ vpmulhw ymm1, ymm5, ymm15
+ vpsraw ymm0, ymm0, 10
+ vpsraw ymm1, ymm1, 10
+ vpmullw ymm0, ymm0, ymm14
+ vpmullw ymm1, ymm1, ymm14
+ vpsubw ymm0, ymm4, ymm0
+ vpsubw ymm1, ymm5, ymm1
+ vpsubw ymm6, ymm6, ymm2
+ vpsubw ymm7, ymm7, ymm3
+ vpmulhw ymm2, ymm6, ymm15
+ vpmulhw ymm3, ymm7, ymm15
+ vpsraw ymm2, ymm2, 10
+ vpsraw ymm3, ymm3, 10
+ vpmullw ymm2, ymm2, ymm14
+ vpmullw ymm3, ymm3, ymm14
+ vpsubw ymm2, ymm6, ymm2
+ vpsubw ymm3, ymm7, ymm3
+ vmovdqu YMMWORD PTR [rdx+384], ymm0
+ vmovdqu YMMWORD PTR [rdx+416], ymm1
+ vmovdqu YMMWORD PTR [rdx+448], ymm2
+ vmovdqu YMMWORD PTR [rdx+480], ymm3
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ pop r12
+ ret
+mlkem_decapsulate_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_csubq_avx2 PROC ; In place, for each 16-bit word w in the 512 bytes at [rcx]: w = w - q if (w - q) >= 0 as a signed word, else w is kept.
+ sub rsp, 112 ; reserve 7 * 16 bytes of stack for the xmm6..xmm12 spills
+ vmovdqu OWORD PTR [rsp], xmm6 ; save xmm6..xmm12 (callee-saved under the Microsoft x64 calling convention); ymm6..ymm12 are overwritten below
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu ymm12, YMMWORD PTR mlkem_q ; ymm12 = q vector; mlkem_q is defined outside this chunk (presumably q in every 16-bit lane - TODO confirm)
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; first half: load bytes 0..255 of the buffer into ymm0..ymm7
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ vpsubw ymm8, ymm0, ymm12 ; t = w - q for ymm0..ymm3 (kept in ymm8..ymm11)
+ vpsubw ymm9, ymm1, ymm12
+ vpsubw ymm10, ymm2, ymm12
+ vpsubw ymm11, ymm3, ymm12
+ vpsraw ymm0, ymm8, 15 ; m = t >> 15 (arithmetic): all ones where t is negative, zero otherwise
+ vpsraw ymm1, ymm9, 15
+ vpsraw ymm2, ymm10, 15
+ vpsraw ymm3, ymm11, 15
+ vpand ymm0, ymm0, ymm12 ; m & q: q where t is negative, 0 otherwise
+ vpand ymm1, ymm1, ymm12
+ vpand ymm2, ymm2, ymm12
+ vpand ymm3, ymm3, ymm12
+ vpaddw ymm0, ymm0, ymm8 ; result = t + (m & q): q is added back only where the subtraction went negative
+ vpaddw ymm1, ymm1, ymm9
+ vpaddw ymm2, ymm2, ymm10
+ vpaddw ymm3, ymm3, ymm11
+ vpsubw ymm8, ymm4, ymm12 ; same four steps for ymm4..ymm7
+ vpsubw ymm9, ymm5, ymm12
+ vpsubw ymm10, ymm6, ymm12
+ vpsubw ymm11, ymm7, ymm12
+ vpsraw ymm4, ymm8, 15
+ vpsraw ymm5, ymm9, 15
+ vpsraw ymm6, ymm10, 15
+ vpsraw ymm7, ymm11, 15
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vpand ymm6, ymm6, ymm12
+ vpand ymm7, ymm7, ymm12
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ vpaddw ymm6, ymm6, ymm10
+ vpaddw ymm7, ymm7, ymm11
+ vmovdqu YMMWORD PTR [rcx], ymm0 ; write the first half back over bytes 0..255
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ vmovdqu YMMWORD PTR [rcx+160], ymm5
+ vmovdqu YMMWORD PTR [rcx+192], ymm6
+ vmovdqu YMMWORD PTR [rcx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+256] ; second half: load bytes 256..511 and repeat the identical sequence
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ vpsubw ymm8, ymm0, ymm12 ; t = w - q for ymm0..ymm3
+ vpsubw ymm9, ymm1, ymm12
+ vpsubw ymm10, ymm2, ymm12
+ vpsubw ymm11, ymm3, ymm12
+ vpsraw ymm0, ymm8, 15 ; m = sign mask of t
+ vpsraw ymm1, ymm9, 15
+ vpsraw ymm2, ymm10, 15
+ vpsraw ymm3, ymm11, 15
+ vpand ymm0, ymm0, ymm12 ; m & q
+ vpand ymm1, ymm1, ymm12
+ vpand ymm2, ymm2, ymm12
+ vpand ymm3, ymm3, ymm12
+ vpaddw ymm0, ymm0, ymm8 ; result = t + (m & q)
+ vpaddw ymm1, ymm1, ymm9
+ vpaddw ymm2, ymm2, ymm10
+ vpaddw ymm3, ymm3, ymm11
+ vpsubw ymm8, ymm4, ymm12 ; same four steps for ymm4..ymm7
+ vpsubw ymm9, ymm5, ymm12
+ vpsubw ymm10, ymm6, ymm12
+ vpsubw ymm11, ymm7, ymm12
+ vpsraw ymm4, ymm8, 15
+ vpsraw ymm5, ymm9, 15
+ vpsraw ymm6, ymm10, 15
+ vpsraw ymm7, ymm11, 15
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vpand ymm6, ymm6, ymm12
+ vpand ymm7, ymm7, ymm12
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ vpaddw ymm6, ymm6, ymm10
+ vpaddw ymm7, ymm7, ymm11
+ vmovdqu YMMWORD PTR [rcx+256], ymm0 ; write the second half back over bytes 256..511
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vmovdqu YMMWORD PTR [rcx+384], ymm4
+ vmovdqu YMMWORD PTR [rcx+416], ymm5
+ vmovdqu YMMWORD PTR [rcx+448], ymm6
+ vmovdqu YMMWORD PTR [rcx+480], ymm7
+ vzeroupper ; zero the upper 128 bits of every ymm register before returning to the caller
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore xmm6..xmm12 from the spill area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ add rsp, 112 ; release the spill area reserved at entry
+ ret
+mlkem_csubq_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_rej_idx QWORD 0ffffffffffffffffh, 0ffffffffffffff00h ; 256-entry table indexed by an 8-bit mask i: entry i packs the byte value 2*k for every set bit k of i, lowest bit in the least-significant byte; unused bytes are 0ffh. Entries 0 and 1 here.
+ QWORD 0ffffffffffffff02h, 0ffffffffffff0200h
+ QWORD 0ffffffffffffff04h, 0ffffffffffff0400h
+ QWORD 0ffffffffffff0402h, 0ffffffffff040200h
+ QWORD 0ffffffffffffff06h, 0ffffffffffff0600h
+ QWORD 0ffffffffffff0602h, 0ffffffffff060200h
+ QWORD 0ffffffffffff0604h, 0ffffffffff060400h
+ QWORD 0ffffffffff060402h, 0ffffffff06040200h
+ QWORD 0ffffffffffffff08h, 0ffffffffffff0800h ; entries 16..31 start here (bit 4 set -> byte value 08h)
+ QWORD 0ffffffffffff0802h, 0ffffffffff080200h
+ QWORD 0ffffffffffff0804h, 0ffffffffff080400h
+ QWORD 0ffffffffff080402h, 0ffffffff08040200h
+ QWORD 0ffffffffffff0806h, 0ffffffffff080600h
+ QWORD 0ffffffffff080602h, 0ffffffff08060200h
+ QWORD 0ffffffffff080604h, 0ffffffff08060400h
+ QWORD 0ffffffff08060402h, 0ffffff0806040200h
+ QWORD 0ffffffffffffff0ah, 0ffffffffffff0a00h ; entries 32..63 start here (bit 5 set -> byte value 0ah)
+ QWORD 0ffffffffffff0a02h, 0ffffffffff0a0200h
+ QWORD 0ffffffffffff0a04h, 0ffffffffff0a0400h
+ QWORD 0ffffffffff0a0402h, 0ffffffff0a040200h
+ QWORD 0ffffffffffff0a06h, 0ffffffffff0a0600h
+ QWORD 0ffffffffff0a0602h, 0ffffffff0a060200h
+ QWORD 0ffffffffff0a0604h, 0ffffffff0a060400h
+ QWORD 0ffffffff0a060402h, 0ffffff0a06040200h
+ QWORD 0ffffffffffff0a08h, 0ffffffffff0a0800h
+ QWORD 0ffffffffff0a0802h, 0ffffffff0a080200h
+ QWORD 0ffffffffff0a0804h, 0ffffffff0a080400h
+ QWORD 0ffffffff0a080402h, 0ffffff0a08040200h
+ QWORD 0ffffffffff0a0806h, 0ffffffff0a080600h
+ QWORD 0ffffffff0a080602h, 0ffffff0a08060200h
+ QWORD 0ffffffff0a080604h, 0ffffff0a08060400h
+ QWORD 0ffffff0a08060402h, 0ffff0a0806040200h
+ QWORD 0ffffffffffffff0ch, 0ffffffffffff0c00h ; entries 64..127 start here (bit 6 set -> byte value 0ch)
+ QWORD 0ffffffffffff0c02h, 0ffffffffff0c0200h
+ QWORD 0ffffffffffff0c04h, 0ffffffffff0c0400h
+ QWORD 0ffffffffff0c0402h, 0ffffffff0c040200h
+ QWORD 0ffffffffffff0c06h, 0ffffffffff0c0600h
+ QWORD 0ffffffffff0c0602h, 0ffffffff0c060200h
+ QWORD 0ffffffffff0c0604h, 0ffffffff0c060400h
+ QWORD 0ffffffff0c060402h, 0ffffff0c06040200h
+ QWORD 0ffffffffffff0c08h, 0ffffffffff0c0800h
+ QWORD 0ffffffffff0c0802h, 0ffffffff0c080200h
+ QWORD 0ffffffffff0c0804h, 0ffffffff0c080400h
+ QWORD 0ffffffff0c080402h, 0ffffff0c08040200h
+ QWORD 0ffffffffff0c0806h, 0ffffffff0c080600h
+ QWORD 0ffffffff0c080602h, 0ffffff0c08060200h
+ QWORD 0ffffffff0c080604h, 0ffffff0c08060400h
+ QWORD 0ffffff0c08060402h, 0ffff0c0806040200h
+ QWORD 0ffffffffffff0c0ah, 0ffffffffff0c0a00h
+ QWORD 0ffffffffff0c0a02h, 0ffffffff0c0a0200h
+ QWORD 0ffffffffff0c0a04h, 0ffffffff0c0a0400h
+ QWORD 0ffffffff0c0a0402h, 0ffffff0c0a040200h
+ QWORD 0ffffffffff0c0a06h, 0ffffffff0c0a0600h
+ QWORD 0ffffffff0c0a0602h, 0ffffff0c0a060200h
+ QWORD 0ffffffff0c0a0604h, 0ffffff0c0a060400h
+ QWORD 0ffffff0c0a060402h, 0ffff0c0a06040200h
+ QWORD 0ffffffffff0c0a08h, 0ffffffff0c0a0800h
+ QWORD 0ffffffff0c0a0802h, 0ffffff0c0a080200h
+ QWORD 0ffffffff0c0a0804h, 0ffffff0c0a080400h
+ QWORD 0ffffff0c0a080402h, 0ffff0c0a08040200h
+ QWORD 0ffffffff0c0a0806h, 0ffffff0c0a080600h
+ QWORD 0ffffff0c0a080602h, 0ffff0c0a08060200h
+ QWORD 0ffffff0c0a080604h, 0ffff0c0a08060400h
+ QWORD 0ffff0c0a08060402h, 0ff0c0a0806040200h
+ QWORD 0ffffffffffffff0eh, 0ffffffffffff0e00h ; entries 128..255 start here (bit 7 set -> byte value 0eh)
+ QWORD 0ffffffffffff0e02h, 0ffffffffff0e0200h
+ QWORD 0ffffffffffff0e04h, 0ffffffffff0e0400h
+ QWORD 0ffffffffff0e0402h, 0ffffffff0e040200h
+ QWORD 0ffffffffffff0e06h, 0ffffffffff0e0600h
+ QWORD 0ffffffffff0e0602h, 0ffffffff0e060200h
+ QWORD 0ffffffffff0e0604h, 0ffffffff0e060400h
+ QWORD 0ffffffff0e060402h, 0ffffff0e06040200h
+ QWORD 0ffffffffffff0e08h, 0ffffffffff0e0800h
+ QWORD 0ffffffffff0e0802h, 0ffffffff0e080200h
+ QWORD 0ffffffffff0e0804h, 0ffffffff0e080400h
+ QWORD 0ffffffff0e080402h, 0ffffff0e08040200h
+ QWORD 0ffffffffff0e0806h, 0ffffffff0e080600h
+ QWORD 0ffffffff0e080602h, 0ffffff0e08060200h
+ QWORD 0ffffffff0e080604h, 0ffffff0e08060400h
+ QWORD 0ffffff0e08060402h, 0ffff0e0806040200h
+ QWORD 0ffffffffffff0e0ah, 0ffffffffff0e0a00h
+ QWORD 0ffffffffff0e0a02h, 0ffffffff0e0a0200h
+ QWORD 0ffffffffff0e0a04h, 0ffffffff0e0a0400h
+ QWORD 0ffffffff0e0a0402h, 0ffffff0e0a040200h
+ QWORD 0ffffffffff0e0a06h, 0ffffffff0e0a0600h
+ QWORD 0ffffffff0e0a0602h, 0ffffff0e0a060200h
+ QWORD 0ffffffff0e0a0604h, 0ffffff0e0a060400h
+ QWORD 0ffffff0e0a060402h, 0ffff0e0a06040200h
+ QWORD 0ffffffffff0e0a08h, 0ffffffff0e0a0800h
+ QWORD 0ffffffff0e0a0802h, 0ffffff0e0a080200h
+ QWORD 0ffffffff0e0a0804h, 0ffffff0e0a080400h
+ QWORD 0ffffff0e0a080402h, 0ffff0e0a08040200h
+ QWORD 0ffffffff0e0a0806h, 0ffffff0e0a080600h
+ QWORD 0ffffff0e0a080602h, 0ffff0e0a08060200h
+ QWORD 0ffffff0e0a080604h, 0ffff0e0a08060400h
+ QWORD 0ffff0e0a08060402h, 0ff0e0a0806040200h
+ QWORD 0ffffffffffff0e0ch, 0ffffffffff0e0c00h
+ QWORD 0ffffffffff0e0c02h, 0ffffffff0e0c0200h
+ QWORD 0ffffffffff0e0c04h, 0ffffffff0e0c0400h
+ QWORD 0ffffffff0e0c0402h, 0ffffff0e0c040200h
+ QWORD 0ffffffffff0e0c06h, 0ffffffff0e0c0600h
+ QWORD 0ffffffff0e0c0602h, 0ffffff0e0c060200h
+ QWORD 0ffffffff0e0c0604h, 0ffffff0e0c060400h
+ QWORD 0ffffff0e0c060402h, 0ffff0e0c06040200h
+ QWORD 0ffffffffff0e0c08h, 0ffffffff0e0c0800h
+ QWORD 0ffffffff0e0c0802h, 0ffffff0e0c080200h
+ QWORD 0ffffffff0e0c0804h, 0ffffff0e0c080400h
+ QWORD 0ffffff0e0c080402h, 0ffff0e0c08040200h
+ QWORD 0ffffffff0e0c0806h, 0ffffff0e0c080600h
+ QWORD 0ffffff0e0c080602h, 0ffff0e0c08060200h
+ QWORD 0ffffff0e0c080604h, 0ffff0e0c08060400h
+ QWORD 0ffff0e0c08060402h, 0ff0e0c0806040200h
+ QWORD 0ffffffffff0e0c0ah, 0ffffffff0e0c0a00h
+ QWORD 0ffffffff0e0c0a02h, 0ffffff0e0c0a0200h
+ QWORD 0ffffffff0e0c0a04h, 0ffffff0e0c0a0400h
+ QWORD 0ffffff0e0c0a0402h, 0ffff0e0c0a040200h
+ QWORD 0ffffffff0e0c0a06h, 0ffffff0e0c0a0600h
+ QWORD 0ffffff0e0c0a0602h, 0ffff0e0c0a060200h
+ QWORD 0ffffff0e0c0a0604h, 0ffff0e0c0a060400h
+ QWORD 0ffff0e0c0a060402h, 0ff0e0c0a06040200h
+ QWORD 0ffffffff0e0c0a08h, 0ffffff0e0c0a0800h
+ QWORD 0ffffff0e0c0a0802h, 0ffff0e0c0a080200h
+ QWORD 0ffffff0e0c0a0804h, 0ffff0e0c0a080400h
+ QWORD 0ffff0e0c0a080402h, 0ff0e0c0a08040200h
+ QWORD 0ffffff0e0c0a0806h, 0ffff0e0c0a080600h
+ QWORD 0ffff0e0c0a080602h, 0ff0e0c0a08060200h
+ QWORD 0ffff0e0c0a080604h, 0ff0e0c0a08060400h
+ QWORD 0ff0e0c0a08060402h, 0e0c0a0806040200h ; entries 254 and 255 (255: all eight bits set, no 0ffh padding)
+ptr_L_mlkem_rej_idx QWORD L_mlkem_rej_idx ; address of the table; mlkem_rej_uniform_n_avx2 loads it into r11 and reads entries as [r11+8*mask_byte]
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_rej_q QWORD 0d010d010d010d01h, 0d010d010d010d01h ; 32 bytes: 0d01h (3329) in each of the sixteen 16-bit lanes
+ QWORD 0d010d010d010d01h, 0d010d010d010d01h ; mlkem_rej_uniform_n_avx2 loads this into ymm6 and uses vpcmpgtw to flag candidates below it
+ptr_L_mlkem_rej_q QWORD L_mlkem_rej_q ; holds the address of L_mlkem_rej_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_rej_ones QWORD 0101010101010101h, 0101010101010101h ; 32 bytes: 01h in every byte
+ QWORD 0101010101010101h, 0101010101010101h ; mlkem_rej_uniform_n_avx2 loads this into ymm7 and adds it (vpaddb) to the L_mlkem_rej_idx bytes before interleaving them
+ptr_L_mlkem_rej_ones QWORD L_mlkem_rej_ones ; holds the address of L_mlkem_rej_ones
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_rej_mask QWORD 0fff0fff0fff0fffh, 0fff0fff0fff0fffh ; 32 bytes: 0fffh in each of the sixteen 16-bit lanes
+ QWORD 0fff0fff0fff0fffh, 0fff0fff0fff0fffh ; mlkem_rej_uniform_n_avx2 loads this into ymm8 and ANDs it (vpand) to keep the low 12 bits of each word
+ptr_L_mlkem_rej_mask QWORD L_mlkem_rej_mask ; holds the address of L_mlkem_rej_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_rej_shuffle QWORD 0504040302010100h, 0b0a0a0908070706h ; low 16 control bytes: source byte pairs (0,1),(1,2),(3,4),(4,5),(6,7),(7,8),(9,10),(10,11) - two overlapping words per 3 source bytes
+ QWORD 0908080706050504h, 0f0e0e0d0c0b0b0ah ; high 16 control bytes: the same pattern starting at byte 4: (4,5),(5,6),(7,8),(8,9),(10,11),(11,12),(13,14),(14,15)
+ptr_L_mlkem_rej_shuffle QWORD L_mlkem_rej_shuffle ; holds the address of L_mlkem_rej_shuffle; the table itself is loaded into ymm9 as the vpshufb control in mlkem_rej_uniform_n_avx2
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_rej_uniform_n_avx2 PROC ; rejection sampling: rcx = output (16-bit words), edx = number of values wanted, r8 = input bytes, r9d = input length; returns eax = number of values written
+ push rbx ; save the non-volatile GPRs this procedure modifies
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ mov r10, rcx ; r10 = running output pointer (rcx is reused as scratch below)
+ sub rsp, 64 ; room to save xmm6-xmm9
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ mov eax, edx ; remember the requested count; edx counts down from here
+ vmovdqu ymm6, YMMWORD PTR L_mlkem_rej_q ; ymm6 = 3329 in every word
+ vmovdqu ymm7, YMMWORD PTR L_mlkem_rej_ones ; ymm7 = 01h in every byte
+ vmovdqu ymm8, YMMWORD PTR L_mlkem_rej_mask ; ymm8 = 0fffh in every word
+ vmovdqu ymm9, YMMWORD PTR L_mlkem_rej_shuffle ; ymm9 = byte-pair gather pattern
+ mov r11, QWORD PTR [ptr_L_mlkem_rej_idx] ; r11 = base of the 8-byte-per-entry compaction table
+ mov rdi, 1229782938247303441 ; 1111111111111111h - NOTE(review): rdi is not referenced again in this procedure
+ mov rbp, 1012195045828461056 ; 0e0c0a0806040200h - NOTE(review): rbp is not referenced again in this procedure
+ mov r15, 72340172838076673 ; 0101010101010101h - NOTE(review): r15 is not referenced again in this procedure
+ vpermq ymm0, [r8], 148 ; step 1 of 7 (unrolled, no count/length check): qwords {0,1,1,2} -> low lane = bytes 0..15, high lane = bytes 8..23
+ vpermq ymm1, [r8+24], 148 ; same arrangement for the following 24 input bytes
+ vpshufb ymm0, ymm0, ymm9 ; each word <- the two bytes that contain one 12-bit candidate
+ vpshufb ymm1, ymm1, ymm9
+ vpsrlw ymm2, ymm0, 4 ; shifted copy for candidates held in the upper 12 bits of their byte pair
+ vpsrlw ymm3, ymm1, 4
+ vpblendw ymm0, ymm0, ymm2, 170 ; odd words taken from the shifted copy, even words kept
+ vpblendw ymm1, ymm1, ymm3, 170
+ vpand ymm0, ymm0, ymm8 ; keep the low 12 bits of every word
+ vpand ymm1, ymm1, ymm8
+ vpcmpgtw ymm2, ymm6, ymm0 ; word = 0ffffh where 3329 > candidate (accepted)
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3 ; narrow word masks to bytes (per 128-bit lane)
+ vpmovmskb rbx, ymm2 ; 32 accept bits: 0-7 ymm0 low lane, 8-15 ymm1 low lane, 16-23 ymm0 high lane, 24-31 ymm1 high lane
+ movzx r12d, bl ; r12 = accept bits for ymm0 low lane
+ movzx ecx, bh ; rcx = accept bits for ymm1 low lane
+ mov r13, rbx
+ mov r14, rbx
+ shr r13, 16
+ shr r14, 24
+ and r13, 255 ; r13 = accept bits for ymm0 high lane
+ and r14, 255 ; r14 = accept bits for ymm1 high lane
+ movq xmm2, QWORD PTR [r11+8*r12] ; table entry = byte offsets of the accepted words, packed to the front
+ movq xmm3, QWORD PTR [r11+8*rcx]
+ movq xmm4, QWORD PTR [r11+8*r13]
+ movq xmm5, QWORD PTR [r11+8*r14]
+ vinserti128 ymm2, ymm2, xmm4, 1 ; ymm2 = offsets for ymm0 (low lane, high lane)
+ vinserti128 ymm3, ymm3, xmm5, 1 ; ymm3 = offsets for ymm1 (low lane, high lane)
+ vpaddb ymm4, ymm2, ymm7 ; offset + 1 = second byte of each word
+ vpaddb ymm5, ymm3, ymm7
+ vpunpcklbw ymm2, ymm2, ymm4 ; interleave (offset, offset+1) -> per-word vpshufb control
+ vpunpcklbw ymm3, ymm3, ymm5
+ vpshufb ymm0, ymm0, ymm2 ; move accepted words to the front of each lane
+ vpshufb ymm1, ymm1, ymm3
+ mov r12, rbx
+ mov r13, rbx
+ mov r14, rbx
+ and rbx, 255 ; bits 0-7
+ shr r12, 16
+ shr r13, 8
+ shr r14, 24
+ and r12, 255 ; bits 16-23
+ and r13, 255 ; bits 8-15
+ popcnt ebx, ebx ; accepted count, ymm0 low lane
+ popcnt r12d, r12d ; accepted count, ymm0 high lane
+ popcnt r13d, r13d ; accepted count, ymm1 low lane
+ popcnt r14d, r14d ; accepted count, ymm1 high lane
+ vmovdqu OWORD PTR [r10], xmm0 ; full 16-byte store; only the accepted words survive because r10 advances by the accepted count
+ vextracti128 xmm0, ymm0, 1
+ lea r10, QWORD PTR [r10+2*rbx]
+ sub edx, ebx
+ vmovdqu OWORD PTR [r10], xmm0 ; ymm0 high lane, overwriting the surplus of the previous store
+ lea r10, QWORD PTR [r10+2*r12]
+ sub edx, r12d
+ vmovdqu OWORD PTR [r10], xmm1 ; ymm1 low lane
+ vextracti128 xmm1, ymm1, 1
+ lea r10, QWORD PTR [r10+2*r13]
+ sub edx, r13d
+ vmovdqu OWORD PTR [r10], xmm1 ; ymm1 high lane
+ lea r10, QWORD PTR [r10+2*r14]
+ sub edx, r14d
+ vpermq ymm0, [r8+48], 148 ; step 2 of 7: input bytes 48..95, same sequence as step 1
+ vpermq ymm1, [r8+72], 148
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpsrlw ymm2, ymm0, 4
+ vpsrlw ymm3, ymm1, 4
+ vpblendw ymm0, ymm0, ymm2, 170
+ vpblendw ymm1, ymm1, ymm3, 170
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb rbx, ymm2 ; 32 accept bits for this step
+ movzx r12d, bl
+ movzx ecx, bh
+ mov r13, rbx
+ mov r14, rbx
+ shr r13, 16
+ shr r14, 24
+ and r13, 255
+ and r14, 255
+ movq xmm2, QWORD PTR [r11+8*r12]
+ movq xmm3, QWORD PTR [r11+8*rcx]
+ movq xmm4, QWORD PTR [r11+8*r13]
+ movq xmm5, QWORD PTR [r11+8*r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpaddb ymm4, ymm2, ymm7
+ vpaddb ymm5, ymm3, ymm7
+ vpunpcklbw ymm2, ymm2, ymm4
+ vpunpcklbw ymm3, ymm3, ymm5
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ mov r12, rbx
+ mov r13, rbx
+ mov r14, rbx
+ and rbx, 255
+ shr r12, 16
+ shr r13, 8
+ shr r14, 24
+ and r12, 255
+ and r13, 255
+ popcnt ebx, ebx
+ popcnt r12d, r12d
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vmovdqu OWORD PTR [r10], xmm0
+ vextracti128 xmm0, ymm0, 1
+ lea r10, QWORD PTR [r10+2*rbx]
+ sub edx, ebx
+ vmovdqu OWORD PTR [r10], xmm0
+ lea r10, QWORD PTR [r10+2*r12]
+ sub edx, r12d
+ vmovdqu OWORD PTR [r10], xmm1
+ vextracti128 xmm1, ymm1, 1
+ lea r10, QWORD PTR [r10+2*r13]
+ sub edx, r13d
+ vmovdqu OWORD PTR [r10], xmm1
+ lea r10, QWORD PTR [r10+2*r14]
+ sub edx, r14d
+ vpermq ymm0, [r8+96], 148 ; step 3 of 7: input bytes 96..143
+ vpermq ymm1, [r8+120], 148
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpsrlw ymm2, ymm0, 4
+ vpsrlw ymm3, ymm1, 4
+ vpblendw ymm0, ymm0, ymm2, 170
+ vpblendw ymm1, ymm1, ymm3, 170
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb rbx, ymm2 ; 32 accept bits for this step
+ movzx r12d, bl
+ movzx ecx, bh
+ mov r13, rbx
+ mov r14, rbx
+ shr r13, 16
+ shr r14, 24
+ and r13, 255
+ and r14, 255
+ movq xmm2, QWORD PTR [r11+8*r12]
+ movq xmm3, QWORD PTR [r11+8*rcx]
+ movq xmm4, QWORD PTR [r11+8*r13]
+ movq xmm5, QWORD PTR [r11+8*r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpaddb ymm4, ymm2, ymm7
+ vpaddb ymm5, ymm3, ymm7
+ vpunpcklbw ymm2, ymm2, ymm4
+ vpunpcklbw ymm3, ymm3, ymm5
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ mov r12, rbx
+ mov r13, rbx
+ mov r14, rbx
+ and rbx, 255
+ shr r12, 16
+ shr r13, 8
+ shr r14, 24
+ and r12, 255
+ and r13, 255
+ popcnt ebx, ebx
+ popcnt r12d, r12d
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vmovdqu OWORD PTR [r10], xmm0
+ vextracti128 xmm0, ymm0, 1
+ lea r10, QWORD PTR [r10+2*rbx]
+ sub edx, ebx
+ vmovdqu OWORD PTR [r10], xmm0
+ lea r10, QWORD PTR [r10+2*r12]
+ sub edx, r12d
+ vmovdqu OWORD PTR [r10], xmm1
+ vextracti128 xmm1, ymm1, 1
+ lea r10, QWORD PTR [r10+2*r13]
+ sub edx, r13d
+ vmovdqu OWORD PTR [r10], xmm1
+ lea r10, QWORD PTR [r10+2*r14]
+ sub edx, r14d
+ vpermq ymm0, [r8+144], 148 ; step 4 of 7: input bytes 144..191
+ vpermq ymm1, [r8+168], 148
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpsrlw ymm2, ymm0, 4
+ vpsrlw ymm3, ymm1, 4
+ vpblendw ymm0, ymm0, ymm2, 170
+ vpblendw ymm1, ymm1, ymm3, 170
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb rbx, ymm2 ; 32 accept bits for this step
+ movzx r12d, bl
+ movzx ecx, bh
+ mov r13, rbx
+ mov r14, rbx
+ shr r13, 16
+ shr r14, 24
+ and r13, 255
+ and r14, 255
+ movq xmm2, QWORD PTR [r11+8*r12]
+ movq xmm3, QWORD PTR [r11+8*rcx]
+ movq xmm4, QWORD PTR [r11+8*r13]
+ movq xmm5, QWORD PTR [r11+8*r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpaddb ymm4, ymm2, ymm7
+ vpaddb ymm5, ymm3, ymm7
+ vpunpcklbw ymm2, ymm2, ymm4
+ vpunpcklbw ymm3, ymm3, ymm5
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ mov r12, rbx
+ mov r13, rbx
+ mov r14, rbx
+ and rbx, 255
+ shr r12, 16
+ shr r13, 8
+ shr r14, 24
+ and r12, 255
+ and r13, 255
+ popcnt ebx, ebx
+ popcnt r12d, r12d
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vmovdqu OWORD PTR [r10], xmm0
+ vextracti128 xmm0, ymm0, 1
+ lea r10, QWORD PTR [r10+2*rbx]
+ sub edx, ebx
+ vmovdqu OWORD PTR [r10], xmm0
+ lea r10, QWORD PTR [r10+2*r12]
+ sub edx, r12d
+ vmovdqu OWORD PTR [r10], xmm1
+ vextracti128 xmm1, ymm1, 1
+ lea r10, QWORD PTR [r10+2*r13]
+ sub edx, r13d
+ vmovdqu OWORD PTR [r10], xmm1
+ lea r10, QWORD PTR [r10+2*r14]
+ sub edx, r14d
+ vpermq ymm0, [r8+192], 148 ; step 5 of 7: input bytes 192..239
+ vpermq ymm1, [r8+216], 148
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpsrlw ymm2, ymm0, 4
+ vpsrlw ymm3, ymm1, 4
+ vpblendw ymm0, ymm0, ymm2, 170
+ vpblendw ymm1, ymm1, ymm3, 170
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb rbx, ymm2 ; 32 accept bits for this step
+ movzx r12d, bl
+ movzx ecx, bh
+ mov r13, rbx
+ mov r14, rbx
+ shr r13, 16
+ shr r14, 24
+ and r13, 255
+ and r14, 255
+ movq xmm2, QWORD PTR [r11+8*r12]
+ movq xmm3, QWORD PTR [r11+8*rcx]
+ movq xmm4, QWORD PTR [r11+8*r13]
+ movq xmm5, QWORD PTR [r11+8*r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpaddb ymm4, ymm2, ymm7
+ vpaddb ymm5, ymm3, ymm7
+ vpunpcklbw ymm2, ymm2, ymm4
+ vpunpcklbw ymm3, ymm3, ymm5
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ mov r12, rbx
+ mov r13, rbx
+ mov r14, rbx
+ and rbx, 255
+ shr r12, 16
+ shr r13, 8
+ shr r14, 24
+ and r12, 255
+ and r13, 255
+ popcnt ebx, ebx
+ popcnt r12d, r12d
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vmovdqu OWORD PTR [r10], xmm0
+ vextracti128 xmm0, ymm0, 1
+ lea r10, QWORD PTR [r10+2*rbx]
+ sub edx, ebx
+ vmovdqu OWORD PTR [r10], xmm0
+ lea r10, QWORD PTR [r10+2*r12]
+ sub edx, r12d
+ vmovdqu OWORD PTR [r10], xmm1
+ vextracti128 xmm1, ymm1, 1
+ lea r10, QWORD PTR [r10+2*r13]
+ sub edx, r13d
+ vmovdqu OWORD PTR [r10], xmm1
+ lea r10, QWORD PTR [r10+2*r14]
+ sub edx, r14d
+ vpermq ymm0, [r8+240], 148 ; step 6 of 7: input bytes 240..287
+ vpermq ymm1, [r8+264], 148
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpsrlw ymm2, ymm0, 4
+ vpsrlw ymm3, ymm1, 4
+ vpblendw ymm0, ymm0, ymm2, 170
+ vpblendw ymm1, ymm1, ymm3, 170
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb rbx, ymm2 ; 32 accept bits for this step
+ movzx r12d, bl
+ movzx ecx, bh
+ mov r13, rbx
+ mov r14, rbx
+ shr r13, 16
+ shr r14, 24
+ and r13, 255
+ and r14, 255
+ movq xmm2, QWORD PTR [r11+8*r12]
+ movq xmm3, QWORD PTR [r11+8*rcx]
+ movq xmm4, QWORD PTR [r11+8*r13]
+ movq xmm5, QWORD PTR [r11+8*r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpaddb ymm4, ymm2, ymm7
+ vpaddb ymm5, ymm3, ymm7
+ vpunpcklbw ymm2, ymm2, ymm4
+ vpunpcklbw ymm3, ymm3, ymm5
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ mov r12, rbx
+ mov r13, rbx
+ mov r14, rbx
+ and rbx, 255
+ shr r12, 16
+ shr r13, 8
+ shr r14, 24
+ and r12, 255
+ and r13, 255
+ popcnt ebx, ebx
+ popcnt r12d, r12d
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vmovdqu OWORD PTR [r10], xmm0
+ vextracti128 xmm0, ymm0, 1
+ lea r10, QWORD PTR [r10+2*rbx]
+ sub edx, ebx
+ vmovdqu OWORD PTR [r10], xmm0
+ lea r10, QWORD PTR [r10+2*r12]
+ sub edx, r12d
+ vmovdqu OWORD PTR [r10], xmm1
+ vextracti128 xmm1, ymm1, 1
+ lea r10, QWORD PTR [r10+2*r13]
+ sub edx, r13d
+ vmovdqu OWORD PTR [r10], xmm1
+ lea r10, QWORD PTR [r10+2*r14]
+ sub edx, r14d
+ vpermq ymm0, [r8+288], 148 ; step 7 of 7: input bytes 288..335
+ vpermq ymm1, [r8+312], 148
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpsrlw ymm2, ymm0, 4
+ vpsrlw ymm3, ymm1, 4
+ vpblendw ymm0, ymm0, ymm2, 170
+ vpblendw ymm1, ymm1, ymm3, 170
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb rbx, ymm2 ; 32 accept bits for this step
+ movzx r12d, bl
+ movzx ecx, bh
+ mov r13, rbx
+ mov r14, rbx
+ shr r13, 16
+ shr r14, 24
+ and r13, 255
+ and r14, 255
+ movq xmm2, QWORD PTR [r11+8*r12]
+ movq xmm3, QWORD PTR [r11+8*rcx]
+ movq xmm4, QWORD PTR [r11+8*r13]
+ movq xmm5, QWORD PTR [r11+8*r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpaddb ymm4, ymm2, ymm7
+ vpaddb ymm5, ymm3, ymm7
+ vpunpcklbw ymm2, ymm2, ymm4
+ vpunpcklbw ymm3, ymm3, ymm5
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ mov r12, rbx
+ mov r13, rbx
+ mov r14, rbx
+ and rbx, 255
+ shr r12, 16
+ shr r13, 8
+ shr r14, 24
+ and r12, 255
+ and r13, 255
+ popcnt ebx, ebx
+ popcnt r12d, r12d
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vmovdqu OWORD PTR [r10], xmm0
+ vextracti128 xmm0, ymm0, 1
+ lea r10, QWORD PTR [r10+2*rbx]
+ sub edx, ebx
+ vmovdqu OWORD PTR [r10], xmm0
+ lea r10, QWORD PTR [r10+2*r12]
+ sub edx, r12d
+ vmovdqu OWORD PTR [r10], xmm1
+ vextracti128 xmm1, ymm1, 1
+ lea r10, QWORD PTR [r10+2*r13]
+ sub edx, r13d
+ vmovdqu OWORD PTR [r10], xmm1
+ lea r10, QWORD PTR [r10+2*r14]
+ sub edx, r14d
+ add r8, 336 ; account for the 7 unrolled steps (7 * 48 input bytes)
+ sub r9d, 336
+L_mlkem_rej_uniform_n_avx2_start_256: ; 48-byte loop; NOTE(review): first pass is entered without testing edx or r9d - presumably callers guarantee at least 384 input bytes and a count of at least 256; confirm
+ vpermq ymm0, [r8], 148 ; same 48-byte sequence as the unrolled steps above
+ vpermq ymm1, [r8+24], 148
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpsrlw ymm2, ymm0, 4
+ vpsrlw ymm3, ymm1, 4
+ vpblendw ymm0, ymm0, ymm2, 170
+ vpblendw ymm1, ymm1, ymm3, 170
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb rbx, ymm2 ; 32 accept bits for this pass
+ movzx r12d, bl
+ movzx ecx, bh
+ mov r13, rbx
+ mov r14, rbx
+ shr r13, 16
+ shr r14, 24
+ and r13, 255
+ and r14, 255
+ movq xmm2, QWORD PTR [r11+8*r12]
+ movq xmm3, QWORD PTR [r11+8*rcx]
+ movq xmm4, QWORD PTR [r11+8*r13]
+ movq xmm5, QWORD PTR [r11+8*r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpaddb ymm4, ymm2, ymm7
+ vpaddb ymm5, ymm3, ymm7
+ vpunpcklbw ymm2, ymm2, ymm4
+ vpunpcklbw ymm3, ymm3, ymm5
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ mov r12, rbx
+ mov r13, rbx
+ mov r14, rbx
+ and rbx, 255
+ shr r12, 16
+ shr r13, 8
+ shr r14, 24
+ and r12, 255
+ and r13, 255
+ popcnt ebx, ebx
+ popcnt r12d, r12d
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vmovdqu OWORD PTR [r10], xmm0
+ vextracti128 xmm0, ymm0, 1
+ lea r10, QWORD PTR [r10+2*rbx]
+ sub edx, ebx
+ vmovdqu OWORD PTR [r10], xmm0
+ lea r10, QWORD PTR [r10+2*r12]
+ sub edx, r12d
+ vmovdqu OWORD PTR [r10], xmm1
+ vextracti128 xmm1, ymm1, 1
+ lea r10, QWORD PTR [r10+2*r13]
+ sub edx, r13d
+ vmovdqu OWORD PTR [r10], xmm1
+ lea r10, QWORD PTR [r10+2*r14]
+ sub edx, r14d
+ add r8, 48
+ sub r9d, 48
+ cmp r9d, 48 ; leave the 48-byte loop when fewer than 48 input bytes remain ...
+ jl L_mlkem_rej_uniform_n_avx2_done_256
+ cmp edx, 32 ; ... or fewer than 32 values are still wanted (one pass can yield up to 32)
+ jge L_mlkem_rej_uniform_n_avx2_start_256
+L_mlkem_rej_uniform_n_avx2_done_256:
+ cmp edx, 8 ; the 12-byte loop needs at least 8 values still wanted ...
+ jl L_mlkem_rej_uniform_n_avx2_done_128
+ cmp r9d, 12 ; ... and at least 12 input bytes
+ jl L_mlkem_rej_uniform_n_avx2_done_128
+L_mlkem_rej_uniform_n_avx2_start_128: ; 12-byte loop: up to 8 candidates per pass
+ vmovdqu xmm0, OWORD PTR [r8] ; 16 bytes loaded, 12 consumed
+ vpshufb xmm0, xmm0, xmm9 ; byte pairs -> words (low-lane pattern of the shuffle table)
+ vpsrlw xmm2, xmm0, 4
+ vpblendw xmm0, xmm0, xmm2, 170
+ vpand xmm0, xmm0, xmm8
+ vpcmpgtw xmm2, xmm6, xmm0 ; word = 0ffffh where 3329 > candidate
+ vpmovmskb rbx, xmm2 ; 16 bits, two per word
+ mov r12, 21845 ; 5555h: selects one bit per word
+ pext ebx, ebx, r12d ; ebx = 8 accept bits
+ movq xmm3, QWORD PTR [r11+8*rbx] ; byte offsets of the accepted words
+ vpaddb xmm4, xmm3, xmm7
+ vpunpcklbw xmm3, xmm3, xmm4 ; (offset, offset+1) pairs
+ vpshufb xmm0, xmm0, xmm3 ; accepted words moved to the front
+ vmovdqu OWORD PTR [r10], xmm0 ; full 16-byte store; r10 advances only by the accepted count
+ popcnt ecx, ebx
+ lea r10, QWORD PTR [r10+2*rcx]
+ sub edx, ecx
+ add r8, 12
+ sub r9d, 12
+ cmp r9d, 12
+ jl L_mlkem_rej_uniform_n_avx2_done_128
+ cmp edx, 8
+ jge L_mlkem_rej_uniform_n_avx2_start_128
+L_mlkem_rej_uniform_n_avx2_done_128:
+ cmp r9d, 0 ; nothing left to read -> finish
+ je L_mlkem_rej_uniform_n_avx2_done_64
+ cmp edx, 0 ; nothing left to produce -> finish
+ je L_mlkem_rej_uniform_n_avx2_done_64
+ mov rsi, 1152657617789587455 ; 0fff0fff0fff0fffh: pdep mask, one 12-bit field per 16-bit word
+ mov r12, 2305878194122661888 ; 2000200020002000h - NOTE(review): r12 is not referenced by the loop below
+ mov r13, 937044495634074881 ; 0d010d010d010d01h - NOTE(review): r13 is not referenced by the loop below
+ mov r14, 1152939097061330944 ; 1000100010001000h - NOTE(review): r14 is not referenced by the loop below
+L_mlkem_rej_uniform_n_avx2_start_64: ; scalar loop: 6 input bytes -> up to 4 values per pass
+ mov rcx, QWORD PTR [r8] ; 8 bytes loaded, 6 consumed - NOTE(review): confirm the input buffer has 2 bytes of slack
+ pdep rcx, rcx, rsi ; spread the low 48 bits into four 12-bit values, one per word
+ cmp cx, 3329
+ jge L_mlkem_rej_uniform_0_avx2_rej_large_0 ; reject values >= 3329
+ mov WORD PTR [r10], cx
+ add r10, 2
+ sub edx, 1
+ je L_mlkem_rej_uniform_n_avx2_done_64 ; stop as soon as the wanted count is reached
+L_mlkem_rej_uniform_0_avx2_rej_large_0:
+ shr rcx, 16 ; next candidate
+ cmp cx, 3329
+ jge L_mlkem_rej_uniform_0_avx2_rej_large_1
+ mov WORD PTR [r10], cx
+ add r10, 2
+ sub edx, 1
+ je L_mlkem_rej_uniform_n_avx2_done_64
+L_mlkem_rej_uniform_0_avx2_rej_large_1:
+ shr rcx, 16 ; next candidate
+ cmp cx, 3329
+ jge L_mlkem_rej_uniform_0_avx2_rej_large_2
+ mov WORD PTR [r10], cx
+ add r10, 2
+ sub edx, 1
+ je L_mlkem_rej_uniform_n_avx2_done_64
+L_mlkem_rej_uniform_0_avx2_rej_large_2:
+ shr rcx, 16 ; last candidate of this pass
+ cmp cx, 3329
+ jge L_mlkem_rej_uniform_0_avx2_rej_large_3
+ mov WORD PTR [r10], cx
+ add r10, 2
+ sub edx, 1
+ je L_mlkem_rej_uniform_n_avx2_done_64
+L_mlkem_rej_uniform_0_avx2_rej_large_3:
+ add r8, 6
+ sub r9d, 6
+ jle L_mlkem_rej_uniform_n_avx2_done_64 ; input exhausted
+ cmp edx, 0
+ jg L_mlkem_rej_uniform_n_avx2_start_64
+L_mlkem_rej_uniform_n_avx2_done_64:
+ vzeroupper
+ sub eax, edx ; return value = requested count - count still outstanding
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore xmm6-xmm9
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ add rsp, 64
+ pop rbp ; restore GPRs in reverse push order
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+mlkem_rej_uniform_n_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_rej_uniform_avx2 PROC ; rejection sampling: rcx = output (16-bit words), edx = number of values wanted, r8 = input bytes, r9d = input length; returns eax = number of values written
+ push rbx ; save the non-volatile GPRs this procedure modifies
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbp
+ mov r10, rcx ; r10 = running output pointer (rcx is reused as scratch below)
+ sub rsp, 64 ; room to save xmm6-xmm9
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ mov eax, edx ; remember the requested count; edx counts down from here
+ cmp edx, 0 ; nothing wanted -> return 0
+ je L_mlkem_rej_uniform_avx2_done_64
+ cmp edx, 8 ; fewer than 8 wanted -> straight to the scalar path (vector constants not needed there)
+ jl L_mlkem_rej_uniform_avx2_done_128
+ vmovdqu ymm6, YMMWORD PTR L_mlkem_rej_q ; ymm6 = 3329 in every word
+ vmovdqu ymm7, YMMWORD PTR L_mlkem_rej_ones ; ymm7 = 01h in every byte
+ vmovdqu ymm8, YMMWORD PTR L_mlkem_rej_mask ; ymm8 = 0fffh in every word
+ vmovdqu ymm9, YMMWORD PTR L_mlkem_rej_shuffle ; ymm9 = byte-pair gather pattern
+ mov r11, QWORD PTR [ptr_L_mlkem_rej_idx] ; r11 = base of the 8-byte-per-entry compaction table
+ mov rdi, 1229782938247303441 ; 1111111111111111h - NOTE(review): rdi is not referenced again in this procedure
+ mov rbp, 1012195045828461056 ; 0e0c0a0806040200h - NOTE(review): rbp is not referenced again in this procedure
+ mov r15, 72340172838076673 ; 0101010101010101h - NOTE(review): r15 is not referenced again in this procedure
+ cmp edx, 32 ; a 48-byte step can yield up to 32 values; skip it when fewer are wanted
+ jl L_mlkem_rej_uniform_avx2_done_256
+ vpermq ymm0, [r8], 148 ; unrolled step 1 (NOTE(review): r9d is not checked before this step - presumably callers supply at least 96 bytes; confirm): qwords {0,1,1,2} -> low lane = bytes 0..15, high lane = bytes 8..23
+ vpermq ymm1, [r8+24], 148 ; same arrangement for the following 24 input bytes
+ vpshufb ymm0, ymm0, ymm9 ; each word <- the two bytes that contain one 12-bit candidate
+ vpshufb ymm1, ymm1, ymm9
+ vpsrlw ymm2, ymm0, 4 ; shifted copy for candidates held in the upper 12 bits of their byte pair
+ vpsrlw ymm3, ymm1, 4
+ vpblendw ymm0, ymm0, ymm2, 170 ; odd words taken from the shifted copy, even words kept
+ vpblendw ymm1, ymm1, ymm3, 170
+ vpand ymm0, ymm0, ymm8 ; keep the low 12 bits of every word
+ vpand ymm1, ymm1, ymm8
+ vpcmpgtw ymm2, ymm6, ymm0 ; word = 0ffffh where 3329 > candidate (accepted)
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3 ; narrow word masks to bytes (per 128-bit lane)
+ vpmovmskb rbx, ymm2 ; 32 accept bits: 0-7 ymm0 low lane, 8-15 ymm1 low lane, 16-23 ymm0 high lane, 24-31 ymm1 high lane
+ movzx r12d, bl ; r12 = accept bits for ymm0 low lane
+ movzx ecx, bh ; rcx = accept bits for ymm1 low lane
+ mov r13, rbx
+ mov r14, rbx
+ shr r13, 16
+ shr r14, 24
+ and r13, 255 ; r13 = accept bits for ymm0 high lane
+ and r14, 255 ; r14 = accept bits for ymm1 high lane
+ movq xmm2, QWORD PTR [r11+8*r12] ; table entry = byte offsets of the accepted words, packed to the front
+ movq xmm3, QWORD PTR [r11+8*rcx]
+ movq xmm4, QWORD PTR [r11+8*r13]
+ movq xmm5, QWORD PTR [r11+8*r14]
+ vinserti128 ymm2, ymm2, xmm4, 1 ; ymm2 = offsets for ymm0 (low lane, high lane)
+ vinserti128 ymm3, ymm3, xmm5, 1 ; ymm3 = offsets for ymm1 (low lane, high lane)
+ vpaddb ymm4, ymm2, ymm7 ; offset + 1 = second byte of each word
+ vpaddb ymm5, ymm3, ymm7
+ vpunpcklbw ymm2, ymm2, ymm4 ; interleave (offset, offset+1) -> per-word vpshufb control
+ vpunpcklbw ymm3, ymm3, ymm5
+ vpshufb ymm0, ymm0, ymm2 ; move accepted words to the front of each lane
+ vpshufb ymm1, ymm1, ymm3
+ mov r12, rbx
+ mov r13, rbx
+ mov r14, rbx
+ and rbx, 255 ; bits 0-7
+ shr r12, 16
+ shr r13, 8
+ shr r14, 24
+ and r12, 255 ; bits 16-23
+ and r13, 255 ; bits 8-15
+ popcnt ebx, ebx ; accepted count, ymm0 low lane
+ popcnt r12d, r12d ; accepted count, ymm0 high lane
+ popcnt r13d, r13d ; accepted count, ymm1 low lane
+ popcnt r14d, r14d ; accepted count, ymm1 high lane
+ vmovdqu OWORD PTR [r10], xmm0 ; full 16-byte store; only the accepted words survive because r10 advances by the accepted count
+ vextracti128 xmm0, ymm0, 1
+ lea r10, QWORD PTR [r10+2*rbx]
+ sub edx, ebx
+ vmovdqu OWORD PTR [r10], xmm0 ; ymm0 high lane, overwriting the surplus of the previous store
+ lea r10, QWORD PTR [r10+2*r12]
+ sub edx, r12d
+ vmovdqu OWORD PTR [r10], xmm1 ; ymm1 low lane
+ vextracti128 xmm1, ymm1, 1
+ lea r10, QWORD PTR [r10+2*r13]
+ sub edx, r13d
+ vmovdqu OWORD PTR [r10], xmm1 ; ymm1 high lane
+ lea r10, QWORD PTR [r10+2*r14]
+ sub edx, r14d
+ add r8, 48
+ sub r9d, 48
+ cmp edx, 32 ; only the wanted count is tested between the unrolled steps
+ jl L_mlkem_rej_uniform_avx2_done_256
+ vpermq ymm0, [r8], 148 ; unrolled step 2: same sequence as step 1 on the next 48 bytes (r9d again not checked)
+ vpermq ymm1, [r8+24], 148
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpsrlw ymm2, ymm0, 4
+ vpsrlw ymm3, ymm1, 4
+ vpblendw ymm0, ymm0, ymm2, 170
+ vpblendw ymm1, ymm1, ymm3, 170
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb rbx, ymm2 ; 32 accept bits for this step
+ movzx r12d, bl
+ movzx ecx, bh
+ mov r13, rbx
+ mov r14, rbx
+ shr r13, 16
+ shr r14, 24
+ and r13, 255
+ and r14, 255
+ movq xmm2, QWORD PTR [r11+8*r12]
+ movq xmm3, QWORD PTR [r11+8*rcx]
+ movq xmm4, QWORD PTR [r11+8*r13]
+ movq xmm5, QWORD PTR [r11+8*r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpaddb ymm4, ymm2, ymm7
+ vpaddb ymm5, ymm3, ymm7
+ vpunpcklbw ymm2, ymm2, ymm4
+ vpunpcklbw ymm3, ymm3, ymm5
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ mov r12, rbx
+ mov r13, rbx
+ mov r14, rbx
+ and rbx, 255
+ shr r12, 16
+ shr r13, 8
+ shr r14, 24
+ and r12, 255
+ and r13, 255
+ popcnt ebx, ebx
+ popcnt r12d, r12d
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vmovdqu OWORD PTR [r10], xmm0
+ vextracti128 xmm0, ymm0, 1
+ lea r10, QWORD PTR [r10+2*rbx]
+ sub edx, ebx
+ vmovdqu OWORD PTR [r10], xmm0
+ lea r10, QWORD PTR [r10+2*r12]
+ sub edx, r12d
+ vmovdqu OWORD PTR [r10], xmm1
+ vextracti128 xmm1, ymm1, 1
+ lea r10, QWORD PTR [r10+2*r13]
+ sub edx, r13d
+ vmovdqu OWORD PTR [r10], xmm1
+ lea r10, QWORD PTR [r10+2*r14]
+ sub edx, r14d
+ add r8, 48
+ sub r9d, 48
+ cmp edx, 32
+ jl L_mlkem_rej_uniform_avx2_done_256
+L_mlkem_rej_uniform_avx2_start_256: ; 48-byte loop; NOTE(review): the first pass is entered without testing r9d
+ vpermq ymm0, [r8], 148 ; same 48-byte sequence as the unrolled steps above
+ vpermq ymm1, [r8+24], 148
+ vpshufb ymm0, ymm0, ymm9
+ vpshufb ymm1, ymm1, ymm9
+ vpsrlw ymm2, ymm0, 4
+ vpsrlw ymm3, ymm1, 4
+ vpblendw ymm0, ymm0, ymm2, 170
+ vpblendw ymm1, ymm1, ymm3, 170
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpcmpgtw ymm2, ymm6, ymm0
+ vpcmpgtw ymm3, ymm6, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpmovmskb rbx, ymm2 ; 32 accept bits for this pass
+ movzx r12d, bl
+ movzx ecx, bh
+ mov r13, rbx
+ mov r14, rbx
+ shr r13, 16
+ shr r14, 24
+ and r13, 255
+ and r14, 255
+ movq xmm2, QWORD PTR [r11+8*r12]
+ movq xmm3, QWORD PTR [r11+8*rcx]
+ movq xmm4, QWORD PTR [r11+8*r13]
+ movq xmm5, QWORD PTR [r11+8*r14]
+ vinserti128 ymm2, ymm2, xmm4, 1
+ vinserti128 ymm3, ymm3, xmm5, 1
+ vpaddb ymm4, ymm2, ymm7
+ vpaddb ymm5, ymm3, ymm7
+ vpunpcklbw ymm2, ymm2, ymm4
+ vpunpcklbw ymm3, ymm3, ymm5
+ vpshufb ymm0, ymm0, ymm2
+ vpshufb ymm1, ymm1, ymm3
+ mov r12, rbx
+ mov r13, rbx
+ mov r14, rbx
+ and rbx, 255
+ shr r12, 16
+ shr r13, 8
+ shr r14, 24
+ and r12, 255
+ and r13, 255
+ popcnt ebx, ebx
+ popcnt r12d, r12d
+ popcnt r13d, r13d
+ popcnt r14d, r14d
+ vmovdqu OWORD PTR [r10], xmm0
+ vextracti128 xmm0, ymm0, 1
+ lea r10, QWORD PTR [r10+2*rbx]
+ sub edx, ebx
+ vmovdqu OWORD PTR [r10], xmm0
+ lea r10, QWORD PTR [r10+2*r12]
+ sub edx, r12d
+ vmovdqu OWORD PTR [r10], xmm1
+ vextracti128 xmm1, ymm1, 1
+ lea r10, QWORD PTR [r10+2*r13]
+ sub edx, r13d
+ vmovdqu OWORD PTR [r10], xmm1
+ lea r10, QWORD PTR [r10+2*r14]
+ sub edx, r14d
+ add r8, 48
+ sub r9d, 48
+ cmp r9d, 48 ; leave the 48-byte loop when fewer than 48 input bytes remain ...
+ jl L_mlkem_rej_uniform_avx2_done_256
+ cmp edx, 32 ; ... or fewer than 32 values are still wanted
+ jge L_mlkem_rej_uniform_avx2_start_256
+L_mlkem_rej_uniform_avx2_done_256:
+ cmp edx, 8 ; the 12-byte loop needs at least 8 values still wanted ...
+ jl L_mlkem_rej_uniform_avx2_done_128
+ cmp r9d, 12 ; ... and at least 12 input bytes
+ jl L_mlkem_rej_uniform_avx2_done_128
+L_mlkem_rej_uniform_avx2_start_128: ; 12-byte loop: up to 8 candidates per pass
+ vmovdqu xmm0, OWORD PTR [r8] ; 16 bytes loaded, 12 consumed
+ vpshufb xmm0, xmm0, xmm9 ; byte pairs -> words (low-lane pattern of the shuffle table)
+ vpsrlw xmm2, xmm0, 4
+ vpblendw xmm0, xmm0, xmm2, 170
+ vpand xmm0, xmm0, xmm8
+ vpcmpgtw xmm2, xmm6, xmm0 ; word = 0ffffh where 3329 > candidate
+ vpmovmskb rbx, xmm2 ; 16 bits, two per word
+ mov r12, 21845 ; 5555h: selects one bit per word
+ pext ebx, ebx, r12d ; ebx = 8 accept bits
+ movq xmm3, QWORD PTR [r11+8*rbx] ; byte offsets of the accepted words
+ vpaddb xmm4, xmm3, xmm7
+ vpunpcklbw xmm3, xmm3, xmm4 ; (offset, offset+1) pairs
+ vpshufb xmm0, xmm0, xmm3 ; accepted words moved to the front
+ vmovdqu OWORD PTR [r10], xmm0 ; full 16-byte store; r10 advances only by the accepted count
+ popcnt ecx, ebx
+ lea r10, QWORD PTR [r10+2*rcx]
+ sub edx, ecx
+ add r8, 12
+ sub r9d, 12
+ cmp r9d, 12
+ jl L_mlkem_rej_uniform_avx2_done_128
+ cmp edx, 8
+ jge L_mlkem_rej_uniform_avx2_start_128
+L_mlkem_rej_uniform_avx2_done_128:
+ cmp r9d, 0 ; nothing left to read -> finish
+ je L_mlkem_rej_uniform_avx2_done_64
+ cmp edx, 0 ; nothing left to produce -> finish
+ je L_mlkem_rej_uniform_avx2_done_64
+ mov rsi, 1152657617789587455 ; 0fff0fff0fff0fffh: pdep mask, one 12-bit field per 16-bit word
+ mov r12, 2305878194122661888 ; 2000200020002000h - NOTE(review): r12 is not referenced by the loop below
+ mov r13, 937044495634074881 ; 0d010d010d010d01h - NOTE(review): r13 is not referenced by the loop below
+ mov r14, 1152939097061330944 ; 1000100010001000h - NOTE(review): r14 is not referenced by the loop below
+L_mlkem_rej_uniform_avx2_start_64: ; scalar loop: 6 input bytes -> up to 4 values per pass
+ mov rcx, QWORD PTR [r8] ; 8 bytes loaded, 6 consumed - NOTE(review): confirm the input buffer has 2 bytes of slack
+ pdep rcx, rcx, rsi ; spread the low 48 bits into four 12-bit values, one per word
+ cmp cx, 3329
+ jge L_mlkem_rej_uniform_avx2_rej_large_0 ; reject values >= 3329
+ mov WORD PTR [r10], cx
+ add r10, 2
+ sub edx, 1
+ je L_mlkem_rej_uniform_avx2_done_64 ; stop as soon as the wanted count is reached
+L_mlkem_rej_uniform_avx2_rej_large_0:
+ shr rcx, 16 ; next candidate
+ cmp cx, 3329
+ jge L_mlkem_rej_uniform_avx2_rej_large_1
+ mov WORD PTR [r10], cx
+ add r10, 2
+ sub edx, 1
+ je L_mlkem_rej_uniform_avx2_done_64
+L_mlkem_rej_uniform_avx2_rej_large_1:
+ shr rcx, 16 ; next candidate
+ cmp cx, 3329
+ jge L_mlkem_rej_uniform_avx2_rej_large_2
+ mov WORD PTR [r10], cx
+ add r10, 2
+ sub edx, 1
+ je L_mlkem_rej_uniform_avx2_done_64
+L_mlkem_rej_uniform_avx2_rej_large_2:
+ shr rcx, 16 ; last candidate of this pass
+ cmp cx, 3329
+ jge L_mlkem_rej_uniform_avx2_rej_large_3
+ mov WORD PTR [r10], cx
+ add r10, 2
+ sub edx, 1
+ je L_mlkem_rej_uniform_avx2_done_64
+L_mlkem_rej_uniform_avx2_rej_large_3:
+ add r8, 6
+ sub r9d, 6
+ jle L_mlkem_rej_uniform_avx2_done_64 ; input exhausted
+ cmp edx, 0
+ jg L_mlkem_rej_uniform_avx2_start_64
+L_mlkem_rej_uniform_avx2_done_64:
+ vzeroupper
+ sub eax, edx ; return value = requested count - count still outstanding
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore xmm6-xmm9
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ add rsp, 64
+ pop rbp ; restore GPRs in reverse push order
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+mlkem_rej_uniform_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_249 QWORD 2 DUP (0024924900249249h) ; 00249249h per dword: selects every third bit of the low 24 bits
+ QWORD 2 DUP (0024924900249249h) ; second half of the 32-byte constant
+ptr_L_mlkem_mask_249 QWORD L_mlkem_mask_249 ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_6db QWORD 2 DUP (006db6db006db6dbh) ; 006db6dbh per dword: the value 3 in each of the eight 3-bit fields
+ QWORD 2 DUP (006db6db006db6dbh) ; second half of the 32-byte constant
+ptr_L_mlkem_mask_6db QWORD L_mlkem_mask_6db ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_07 QWORD 2 DUP (0000000700000007h) ; 00000007h per dword: keeps bits 0..2 (a 3-bit value in the low word)
+ QWORD 2 DUP (0000000700000007h) ; second half of the 32-byte constant
+ptr_L_mlkem_mask_07 QWORD L_mlkem_mask_07 ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_70 QWORD 2 DUP (0007000000070000h) ; 00070000h per dword: keeps bits 16..18 (a 3-bit value in the high word)
+ QWORD 2 DUP (0007000000070000h) ; second half of the 32-byte constant
+ptr_L_mlkem_mask_70 QWORD L_mlkem_mask_70 ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_3 QWORD 2 DUP (0003000300030003h) ; 16 words of 0003h: subtracted word-wise by mlkem_cbd_eta3_avx2
+ QWORD 2 DUP (0003000300030003h) ; second half of the 32-byte constant
+ptr_L_mlkem_mask_3 QWORD L_mlkem_mask_3 ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_shuff QWORD 0ff050403ff020100h, 0ff0b0a09ff080706h ; vpshufb control, low lane: each dword <- 3 consecutive lane bytes (0..11), 0ffh zeroes the top byte
+ QWORD 0ff090807ff060504h, 0ff0f0e0dff0c0b0ah ; high lane: same grouping applied to lane bytes 4..15
+ptr_L_mlkem_shuff QWORD L_mlkem_shuff ; holds the address of the table above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_cbd_eta3_avx2 PROC ; rcx = output (512 bytes written as 16-bit words), rdx = input bytes (192 consumed); each 24-bit group yields four words in -3..3
+ sub rsp, 128 ; room to save xmm6-xmm13
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu ymm8, YMMWORD PTR L_mlkem_mask_249 ; every-third-bit mask
+ vmovdqu ymm9, YMMWORD PTR L_mlkem_mask_6db ; 3 in every 3-bit field
+ vmovdqu ymm10, YMMWORD PTR L_mlkem_mask_07 ; bits 0..2 of each dword
+ vmovdqu ymm11, YMMWORD PTR L_mlkem_mask_70 ; bits 16..18 of each dword
+ vmovdqu ymm12, YMMWORD PTR L_mlkem_mask_3 ; 3 in every word
+ vmovdqu ymm13, YMMWORD PTR L_mlkem_shuff ; 3-bytes-per-dword gather pattern
+ vmovdqu ymm0, YMMWORD PTR [rdx] ; step 1 of 4: input bytes 0..23 are used (32 loaded)
+ vmovdqu ymm1, YMMWORD PTR [rdx+24] ; input bytes 24..47 are used (32 loaded)
+ vpermq ymm0, ymm0, 148 ; qwords {0,1,1,2}: low lane = bytes 0..15, high lane = bytes 8..23
+ vpermq ymm1, ymm1, 148
+ vpshufb ymm0, ymm0, ymm13 ; each dword <- 3 consecutive input bytes, top byte zero
+ vpshufb ymm1, ymm1, ymm13
+ vpsrld ymm2, ymm0, 1 ; x >> 1
+ vpsrld ymm3, ymm1, 1
+ vpsrld ymm4, ymm0, 2 ; x >> 2
+ vpsrld ymm5, ymm1, 2
+ vpand ymm0, ymm0, ymm8 ; keep every third bit of x, x>>1 and x>>2
+ vpand ymm1, ymm1, ymm8
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpand ymm4, ymm4, ymm8
+ vpand ymm5, ymm5, ymm8
+ vpaddd ymm0, ymm0, ymm2 ; sum of the three: each 3-bit field = number of set bits in that 3-bit group (0..3)
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpsrld ymm2, ymm0, 3 ; neighbouring field moved down one position
+ vpsrld ymm3, ymm1, 3
+ vpaddd ymm0, ymm0, ymm9 ; add 3 to every field so the subtraction below cannot borrow
+ vpaddd ymm1, ymm1, ymm9
+ vpsubd ymm0, ymm0, ymm2 ; field k = count(k) + 3 - count(k+1); fields 0, 2, 4, 6 are the results
+ vpsubd ymm1, ymm1, ymm3
+ vpslld ymm2, ymm0, 10 ; bits 6..8 -> bits 16..18
+ vpslld ymm3, ymm1, 10
+ vpsrld ymm4, ymm0, 12 ; bits 12..14 -> bits 0..2
+ vpsrld ymm5, ymm1, 12
+ vpsrld ymm6, ymm0, 2 ; bits 18..20 -> bits 16..18
+ vpsrld ymm7, ymm1, 2
+ vpand ymm0, ymm0, ymm10 ; result 0 in the low word
+ vpand ymm1, ymm1, ymm10
+ vpand ymm2, ymm2, ymm11 ; result 1 in the high word
+ vpand ymm3, ymm3, ymm11
+ vpand ymm4, ymm4, ymm10 ; result 2 in the low word
+ vpand ymm5, ymm5, ymm10
+ vpand ymm6, ymm6, ymm11 ; result 3 in the high word
+ vpand ymm7, ymm7, ymm11
+ vpaddw ymm0, ymm0, ymm2 ; dword = (result 0, result 1)
+ vpaddw ymm1, ymm1, ymm3
+ vpaddw ymm2, ymm4, ymm6 ; dword = (result 2, result 3)
+ vpaddw ymm3, ymm5, ymm7
+ vpsubw ymm0, ymm0, ymm12 ; remove the +3 bias: every word ends in -3..3
+ vpsubw ymm1, ymm1, ymm12
+ vpsubw ymm2, ymm2, ymm12
+ vpsubw ymm3, ymm3, ymm12
+ vpunpckldq ymm4, ymm0, ymm2 ; interleave dwords so the four results of one source dword are adjacent
+ vpunpckldq ymm5, ymm1, ymm3
+ vpunpckhdq ymm6, ymm0, ymm2
+ vpunpckhdq ymm7, ymm1, ymm3
+ vperm2i128 ymm0, ymm4, ymm6, 32 ; low lanes of the two interleaves
+ vperm2i128 ymm1, ymm5, ymm7, 32
+ vperm2i128 ymm2, ymm4, ymm6, 49 ; high lanes of the two interleaves
+ vperm2i128 ymm3, ymm5, ymm7, 49
+ vmovdqu YMMWORD PTR [rcx], ymm0 ; 64 result words -> output bytes 0..127
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rdx+48] ; step 2 of 4: input bytes 48..95, same sequence as step 1
+ vmovdqu ymm1, YMMWORD PTR [rdx+72]
+ vpermq ymm0, ymm0, 148
+ vpermq ymm1, ymm1, 148
+ vpshufb ymm0, ymm0, ymm13
+ vpshufb ymm1, ymm1, ymm13
+ vpsrld ymm2, ymm0, 1
+ vpsrld ymm3, ymm1, 1
+ vpsrld ymm4, ymm0, 2
+ vpsrld ymm5, ymm1, 2
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpand ymm4, ymm4, ymm8
+ vpand ymm5, ymm5, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpsrld ymm2, ymm0, 3
+ vpsrld ymm3, ymm1, 3
+ vpaddd ymm0, ymm0, ymm9
+ vpaddd ymm1, ymm1, ymm9
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm3
+ vpslld ymm2, ymm0, 10
+ vpslld ymm3, ymm1, 10
+ vpsrld ymm4, ymm0, 12
+ vpsrld ymm5, ymm1, 12
+ vpsrld ymm6, ymm0, 2
+ vpsrld ymm7, ymm1, 2
+ vpand ymm0, ymm0, ymm10
+ vpand ymm1, ymm1, ymm10
+ vpand ymm2, ymm2, ymm11
+ vpand ymm3, ymm3, ymm11
+ vpand ymm4, ymm4, ymm10
+ vpand ymm5, ymm5, ymm10
+ vpand ymm6, ymm6, ymm11
+ vpand ymm7, ymm7, ymm11
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpaddw ymm2, ymm4, ymm6
+ vpaddw ymm3, ymm5, ymm7
+ vpsubw ymm0, ymm0, ymm12
+ vpsubw ymm1, ymm1, ymm12
+ vpsubw ymm2, ymm2, ymm12
+ vpsubw ymm3, ymm3, ymm12
+ vpunpckldq ymm4, ymm0, ymm2
+ vpunpckldq ymm5, ymm1, ymm3
+ vpunpckhdq ymm6, ymm0, ymm2
+ vpunpckhdq ymm7, ymm1, ymm3
+ vperm2i128 ymm0, ymm4, ymm6, 32
+ vperm2i128 ymm1, ymm5, ymm7, 32
+ vperm2i128 ymm2, ymm4, ymm6, 49
+ vperm2i128 ymm3, ymm5, ymm7, 49
+ vmovdqu YMMWORD PTR [rcx+128], ymm0 ; output bytes 128..255
+ vmovdqu YMMWORD PTR [rcx+160], ymm2
+ vmovdqu YMMWORD PTR [rcx+192], ymm1
+ vmovdqu YMMWORD PTR [rcx+224], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rdx+96] ; step 3 of 4: input bytes 96..143
+ vmovdqu ymm1, YMMWORD PTR [rdx+120]
+ vpermq ymm0, ymm0, 148
+ vpermq ymm1, ymm1, 148
+ vpshufb ymm0, ymm0, ymm13
+ vpshufb ymm1, ymm1, ymm13
+ vpsrld ymm2, ymm0, 1
+ vpsrld ymm3, ymm1, 1
+ vpsrld ymm4, ymm0, 2
+ vpsrld ymm5, ymm1, 2
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpand ymm4, ymm4, ymm8
+ vpand ymm5, ymm5, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpsrld ymm2, ymm0, 3
+ vpsrld ymm3, ymm1, 3
+ vpaddd ymm0, ymm0, ymm9
+ vpaddd ymm1, ymm1, ymm9
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm3
+ vpslld ymm2, ymm0, 10
+ vpslld ymm3, ymm1, 10
+ vpsrld ymm4, ymm0, 12
+ vpsrld ymm5, ymm1, 12
+ vpsrld ymm6, ymm0, 2
+ vpsrld ymm7, ymm1, 2
+ vpand ymm0, ymm0, ymm10
+ vpand ymm1, ymm1, ymm10
+ vpand ymm2, ymm2, ymm11
+ vpand ymm3, ymm3, ymm11
+ vpand ymm4, ymm4, ymm10
+ vpand ymm5, ymm5, ymm10
+ vpand ymm6, ymm6, ymm11
+ vpand ymm7, ymm7, ymm11
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpaddw ymm2, ymm4, ymm6
+ vpaddw ymm3, ymm5, ymm7
+ vpsubw ymm0, ymm0, ymm12
+ vpsubw ymm1, ymm1, ymm12
+ vpsubw ymm2, ymm2, ymm12
+ vpsubw ymm3, ymm3, ymm12
+ vpunpckldq ymm4, ymm0, ymm2
+ vpunpckldq ymm5, ymm1, ymm3
+ vpunpckhdq ymm6, ymm0, ymm2
+ vpunpckhdq ymm7, ymm1, ymm3
+ vperm2i128 ymm0, ymm4, ymm6, 32
+ vperm2i128 ymm1, ymm5, ymm7, 32
+ vperm2i128 ymm2, ymm4, ymm6, 49
+ vperm2i128 ymm3, ymm5, ymm7, 49
+ vmovdqu YMMWORD PTR [rcx+256], ymm0 ; output bytes 256..383
+ vmovdqu YMMWORD PTR [rcx+288], ymm2
+ vmovdqu YMMWORD PTR [rcx+320], ymm1
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vmovdqu ymm0, YMMWORD PTR [rdx+144] ; step 4 of 4: input bytes 144..191
+ vmovdqu ymm1, YMMWORD PTR [rdx+168] ; 32-byte load reaches rdx+199, 8 bytes past the last byte consumed - NOTE(review): confirm the caller's buffer covers it
+ vpermq ymm0, ymm0, 148
+ vpermq ymm1, ymm1, 148
+ vpshufb ymm0, ymm0, ymm13
+ vpshufb ymm1, ymm1, ymm13
+ vpsrld ymm2, ymm0, 1
+ vpsrld ymm3, ymm1, 1
+ vpsrld ymm4, ymm0, 2
+ vpsrld ymm5, ymm1, 2
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpand ymm4, ymm4, ymm8
+ vpand ymm5, ymm5, ymm8
+ vpaddd ymm0, ymm0, ymm2
+ vpaddd ymm1, ymm1, ymm3
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpsrld ymm2, ymm0, 3
+ vpsrld ymm3, ymm1, 3
+ vpaddd ymm0, ymm0, ymm9
+ vpaddd ymm1, ymm1, ymm9
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm3
+ vpslld ymm2, ymm0, 10
+ vpslld ymm3, ymm1, 10
+ vpsrld ymm4, ymm0, 12
+ vpsrld ymm5, ymm1, 12
+ vpsrld ymm6, ymm0, 2
+ vpsrld ymm7, ymm1, 2
+ vpand ymm0, ymm0, ymm10
+ vpand ymm1, ymm1, ymm10
+ vpand ymm2, ymm2, ymm11
+ vpand ymm3, ymm3, ymm11
+ vpand ymm4, ymm4, ymm10
+ vpand ymm5, ymm5, ymm10
+ vpand ymm6, ymm6, ymm11
+ vpand ymm7, ymm7, ymm11
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpaddw ymm2, ymm4, ymm6
+ vpaddw ymm3, ymm5, ymm7
+ vpsubw ymm0, ymm0, ymm12
+ vpsubw ymm1, ymm1, ymm12
+ vpsubw ymm2, ymm2, ymm12
+ vpsubw ymm3, ymm3, ymm12
+ vpunpckldq ymm4, ymm0, ymm2
+ vpunpckldq ymm5, ymm1, ymm3
+ vpunpckhdq ymm6, ymm0, ymm2
+ vpunpckhdq ymm7, ymm1, ymm3
+ vperm2i128 ymm0, ymm4, ymm6, 32
+ vperm2i128 ymm1, ymm5, ymm7, 32
+ vperm2i128 ymm2, ymm4, ymm6, 49
+ vperm2i128 ymm3, ymm5, ymm7, 49
+ vmovdqu YMMWORD PTR [rcx+384], ymm0 ; output bytes 384..511
+ vmovdqu YMMWORD PTR [rcx+416], ymm2
+ vmovdqu YMMWORD PTR [rcx+448], ymm1
+ vmovdqu YMMWORD PTR [rcx+480], ymm3
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore xmm6-xmm13
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ add rsp, 128
+ ret
+mlkem_cbd_eta3_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_55 QWORD 2 DUP (5555555555555555h) ; 32 bytes of 55h: every other bit; loaded into ymm8 by mlkem_cbd_eta2_avx2
+ QWORD 2 DUP (5555555555555555h) ; second half of the 32-byte constant
+ptr_L_mlkem_mask_55 QWORD L_mlkem_mask_55 ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_33 QWORD 2 DUP (3333333333333333h) ; 32 bytes of 33h: the low 2 bits of every 4-bit group; loaded into ymm9 by mlkem_cbd_eta2_avx2
+ QWORD 2 DUP (3333333333333333h) ; second half of the 32-byte constant
+ptr_L_mlkem_mask_33 QWORD L_mlkem_mask_33 ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_03 QWORD 2 DUP (0303030303030303h) ; 32 bytes of 03h; loaded into ymm10 and subtracted byte-wise by mlkem_cbd_eta2_avx2
+ QWORD 2 DUP (0303030303030303h) ; second half of the 32-byte constant
+ptr_L_mlkem_mask_03 QWORD L_mlkem_mask_03 ; holds the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_0f QWORD 2 DUP (0f0f0f0f0f0f0f0fh) ; 32 bytes of 0fh: the low nibble of every byte; loaded into ymm11 by mlkem_cbd_eta2_avx2
+ QWORD 2 DUP (0f0f0f0f0f0f0f0fh) ; second half of the 32-byte constant
+ptr_L_mlkem_mask_0f QWORD L_mlkem_mask_0f ; holds the address of the table above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_cbd_eta2_avx2 PROC ; Expands 128 bytes read from [rdx] into 256 sign-extended 16-bit values (512 bytes) written to [rcx]; each nibble n yields (bit0+bit1)-(bit2+bit3).
+ sub rsp, 96 ; Reserve 6 x 16 bytes to spill xmm6-xmm11, which are restored before ret.
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu ymm8, YMMWORD PTR L_mlkem_mask_55 ; ymm8 = 55h in every byte.
+ vmovdqu ymm9, YMMWORD PTR L_mlkem_mask_33 ; ymm9 = 33h in every byte.
+ vmovdqu ymm10, YMMWORD PTR L_mlkem_mask_03 ; ymm10 = 03h in every byte.
+ vmovdqu ymm11, YMMWORD PTR L_mlkem_mask_0f ; ymm11 = 0fh in every byte.
+ vmovdqu ymm0, YMMWORD PTR [rdx] ; First half: input bytes 0..63 -> output bytes 0..255.
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vpsrlw ymm2, ymm0, 1 ; Word-wide shift; the 55h mask below discards bits that crossed a byte boundary.
+ vpsrlw ymm3, ymm1, 1
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddb ymm0, ymm0, ymm2 ; Each 2-bit field now holds the sum of its two original bits (0..2).
+ vpaddb ymm1, ymm1, ymm3
+ vpsrlw ymm2, ymm0, 2 ; Bring the upper 2-bit sum of each nibble down next to the lower one.
+ vpsrlw ymm3, ymm1, 2
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpand ymm2, ymm2, ymm9
+ vpand ymm3, ymm3, ymm9
+ vpaddb ymm0, ymm0, ymm9 ; Add 3 to each nibble so the following subtraction cannot borrow.
+ vpaddb ymm1, ymm1, ymm9
+ vpsubb ymm0, ymm0, ymm2 ; Each nibble = lower sum + 3 - upper sum.
+ vpsubb ymm1, ymm1, ymm3
+ vpsrlw ymm2, ymm0, 4 ; Separate high nibbles (ymm2/ymm3) from low nibbles (ymm0/ymm1).
+ vpsrlw ymm3, ymm1, 4
+ vpand ymm0, ymm0, ymm11
+ vpand ymm1, ymm1, ymm11
+ vpand ymm2, ymm2, ymm11
+ vpand ymm3, ymm3, ymm11
+ vpsubb ymm0, ymm0, ymm10 ; Remove the +3 bias: each byte is now a signed value in -2..2.
+ vpsubb ymm1, ymm1, ymm10
+ vpsubb ymm2, ymm2, ymm10
+ vpsubb ymm3, ymm3, ymm10
+ vpunpcklbw ymm4, ymm0, ymm2 ; Interleave low-nibble and high-nibble results (per 128-bit lane).
+ vpunpcklbw ymm5, ymm1, ymm3
+ vpunpckhbw ymm6, ymm0, ymm2
+ vpunpckhbw ymm7, ymm1, ymm3
+ vpmovsxbw ymm0, xmm4 ; Sign-extend each group of 16 bytes to 16 words.
+ vpmovsxbw ymm1, xmm5
+ vextracti128 xmm2, ymm4, 1
+ vextracti128 xmm3, ymm5, 1
+ vpmovsxbw ymm2, xmm2
+ vpmovsxbw ymm3, xmm3
+ vpmovsxbw ymm4, xmm6
+ vpmovsxbw ymm5, xmm7
+ vextracti128 xmm6, ymm6, 1
+ vextracti128 xmm7, ymm7, 1
+ vpmovsxbw ymm6, xmm6
+ vpmovsxbw ymm7, xmm7
+ vmovdqu YMMWORD PTR [rcx], ymm0 ; Store order undoes the per-lane unpack: low-lane lo, low-lane hi, high-lane lo, high-lane hi.
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm6
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+160], ymm5
+ vmovdqu YMMWORD PTR [rcx+192], ymm3
+ vmovdqu YMMWORD PTR [rcx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rdx+64] ; Second half: same sequence for input bytes 64..127 -> output bytes 256..511.
+ vmovdqu ymm1, YMMWORD PTR [rdx+96]
+ vpsrlw ymm2, ymm0, 1
+ vpsrlw ymm3, ymm1, 1
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpaddb ymm0, ymm0, ymm2
+ vpaddb ymm1, ymm1, ymm3
+ vpsrlw ymm2, ymm0, 2
+ vpsrlw ymm3, ymm1, 2
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpand ymm2, ymm2, ymm9
+ vpand ymm3, ymm3, ymm9
+ vpaddb ymm0, ymm0, ymm9
+ vpaddb ymm1, ymm1, ymm9
+ vpsubb ymm0, ymm0, ymm2
+ vpsubb ymm1, ymm1, ymm3
+ vpsrlw ymm2, ymm0, 4
+ vpsrlw ymm3, ymm1, 4
+ vpand ymm0, ymm0, ymm11
+ vpand ymm1, ymm1, ymm11
+ vpand ymm2, ymm2, ymm11
+ vpand ymm3, ymm3, ymm11
+ vpsubb ymm0, ymm0, ymm10
+ vpsubb ymm1, ymm1, ymm10
+ vpsubb ymm2, ymm2, ymm10
+ vpsubb ymm3, ymm3, ymm10
+ vpunpcklbw ymm4, ymm0, ymm2
+ vpunpcklbw ymm5, ymm1, ymm3
+ vpunpckhbw ymm6, ymm0, ymm2
+ vpunpckhbw ymm7, ymm1, ymm3
+ vpmovsxbw ymm0, xmm4
+ vpmovsxbw ymm1, xmm5
+ vextracti128 xmm2, ymm4, 1
+ vextracti128 xmm3, ymm5, 1
+ vpmovsxbw ymm2, xmm2
+ vpmovsxbw ymm3, xmm3
+ vpmovsxbw ymm4, xmm6
+ vpmovsxbw ymm5, xmm7
+ vextracti128 xmm6, ymm6, 1
+ vextracti128 xmm7, ymm7, 1
+ vpmovsxbw ymm6, xmm6
+ vpmovsxbw ymm7, xmm7
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm4
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm6
+ vmovdqu YMMWORD PTR [rcx+384], ymm1
+ vmovdqu YMMWORD PTR [rcx+416], ymm5
+ vmovdqu YMMWORD PTR [rcx+448], ymm3
+ vmovdqu YMMWORD PTR [rcx+480], ymm7
+ vzeroupper ; Clear the upper YMM halves before reloading the saved XMM registers and returning.
+ vmovdqu xmm6, OWORD PTR [rsp] ; Restore xmm6-xmm11 from the spill area.
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96
+ ret
+mlkem_cbd_eta2_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16 ; Tables are 32 bytes but only 16-byte aligned; mlkem_compress_10_avx2 loads them with the unaligned vmovdqu.
+L_mlkem_compress_10_avx2_mask WORD 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh ; 16 words of 03ffh: keeps the low 10 bits of each word.
+ WORD 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh
+ptr_L_mlkem_compress_10_avx2_mask QWORD L_mlkem_compress_10_avx2_mask ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_shift QWORD 0400000104000001h, 0400000104000001h ; Word pairs (0001h, 0400h): vpmaddwd multipliers giving w0 + w1*1024 per dword.
+ QWORD 0400000104000001h, 0400000104000001h
+ptr_L_mlkem_compress_10_avx2_shift QWORD L_mlkem_compress_10_avx2_shift ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_shlv QWORD 000000000000000ch, 000000000000000ch ; Dword pairs (12, 0): vpsllvd counts, so only even dwords are shifted left by 12.
+ QWORD 000000000000000ch, 000000000000000ch
+ptr_L_mlkem_compress_10_avx2_shlv QWORD L_mlkem_compress_10_avx2_shlv ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_shuf BYTE 00h, 01h, 02h, 03h, 04h, 08h, 09h, 0ah ; vpshufb indices; low lane gathers bytes 0-4 and 8-12 into positions 0-9 (0ffh yields a zero byte).
+ BYTE 0bh, 0ch, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 09h, 0ah, 0bh, 0ch, 0ffh, 0ffh, 0ffh, 0ffh ; High lane: bytes 9-12 go to positions 0-3, bytes 0-4 and 8 go to positions 10-15.
+ BYTE 0ffh, 0ffh, 00h, 01h, 02h, 03h, 04h, 08h
+ptr_L_mlkem_compress_10_avx2_shuf QWORD L_mlkem_compress_10_avx2_shuf ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_v WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh ; 16 words of 4ebfh (20159), the multiplier. NOTE(review): presumably a fixed-point reciprocal of q = 3329 - confirm.
+ WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh
+ptr_L_mlkem_compress_10_avx2_v QWORD L_mlkem_compress_10_avx2_v ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_offset WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh ; 16 words of 000fh (15): added to each input word in the correction-bit computation.
+ WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh
+ptr_L_mlkem_compress_10_avx2_offset QWORD L_mlkem_compress_10_avx2_offset ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_shift12 WORD 1000h, 1000h, 1000h, 1000h, 1000h, 1000h, 1000h, 1000h ; 16 words of 1000h (2^12): vpmulhrsw by this value yields (a + 4) >> 3.
+ WORD 1000h, 1000h, 1000h, 1000h, 1000h, 1000h, 1000h, 1000h
+ptr_L_mlkem_compress_10_avx2_shift12 QWORD L_mlkem_compress_10_avx2_shift12 ; Holds the address of the table above.
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_compress_10_avx2 PROC ; rcx = output, rdx = input 16-bit words, r8d = iteration count; each iteration packs 256 words (512 bytes) into 256 10-bit fields (320 bytes).
+ sub rsp, 128 ; Reserve 8 x 16 bytes to spill xmm6-xmm13, which are restored before ret.
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu ymm0, YMMWORD PTR [rdx] ; This value is overwritten by the identical load at the top of the loop.
+ vmovdqu ymm9, YMMWORD PTR L_mlkem_compress_10_avx2_mask ; ymm9 = 03ffh per word.
+ vmovdqu ymm8, YMMWORD PTR L_mlkem_compress_10_avx2_shift ; ymm8 = vpmaddwd multipliers (1, 1024).
+ vmovdqu ymm10, YMMWORD PTR L_mlkem_compress_10_avx2_shlv ; ymm10 = per-dword left-shift counts (12, 0).
+ vmovdqu ymm11, YMMWORD PTR L_mlkem_compress_10_avx2_shuf ; ymm11 = byte-gather shuffle.
+ vmovdqu ymm6, YMMWORD PTR L_mlkem_compress_10_avx2_v ; ymm6 = v = 4ebfh per word.
+ vmovdqu ymm12, YMMWORD PTR L_mlkem_compress_10_avx2_offset ; ymm12 = 15 per word.
+ vmovdqu ymm13, YMMWORD PTR L_mlkem_compress_10_avx2_shift12 ; ymm13 = 1000h per word.
+ vpsllw ymm7, ymm6, 3 ; ymm7 = v << 3 per word (truncated to 16 bits).
+L_mlkem_compress_10_avx2_start:
+ vmovdqu ymm0, YMMWORD PTR [rdx] ; Round 1 of 8: input bytes 0..63 -> output bytes 0..39.
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vpmullw ymm2, ymm0, ymm7 ; ymm2 = low 16 bits of x * (v << 3).
+ vpmullw ymm4, ymm1, ymm7
+ vpaddw ymm3, ymm0, ymm12 ; ymm3 = x + 15.
+ vpaddw ymm5, ymm1, ymm12
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm1, ymm1, 3
+ vpmulhuw ymm0, ymm0, ymm6 ; ymm0 = high 16 bits of the unsigned product (x << 3) * v.
+ vpmulhuw ymm1, ymm1, ymm6
+ vpsubw ymm3, ymm2, ymm3 ; ymm3 = low product - (x + 15).
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm2, ymm2, ymm3 ; ymm2 = (NOT low product) AND ymm3; only its top bit is used below.
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm2, ymm2, 15 ; Correction bit: 1 when the low product has bit 15 clear and the difference has bit 15 set.
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm2 ; Subtract the correction bit from the high product.
+ vpsubw ymm1, ymm1, ymm4
+ vpmulhrsw ymm0, ymm0, ymm13 ; Rounded scale by 2^12/2^15, i.e. (a + 4) >> 3.
+ vpmulhrsw ymm1, ymm1, ymm13
+ vpand ymm0, ymm0, ymm9 ; Keep 10 bits per word.
+ vpand ymm1, ymm1, ymm9
+ vpmaddwd ymm0, ymm0, ymm8 ; Merge each word pair into one 20-bit value per dword (w0 | w1 << 10).
+ vpmaddwd ymm1, ymm1, ymm8
+ vpsllvd ymm0, ymm0, ymm10 ; Shift even dwords left by 12 so that ...
+ vpsllvd ymm1, ymm1, ymm10
+ vpsrlq ymm0, ymm0, 12 ; ... this qword shift leaves 40 contiguous bits (d0 | d1 << 20) in each qword.
+ vpsrlq ymm1, ymm1, 12
+ vpshufb ymm0, ymm0, ymm11 ; Gather the 5 payload bytes of each qword: 10 bytes per 128-bit lane.
+ vpshufb ymm1, ymm1, ymm11
+ vextracti128 xmm2, ymm0, 1
+ vextracti128 xmm4, ymm1, 1
+ vpblendw xmm0, xmm0, xmm2, 224 ; Words 5-7 come from the high lane: 16 of the 20 output bytes.
+ vpblendw xmm1, xmm1, xmm4, 224
+ vmovdqu OWORD PTR [rcx], xmm0
+ vmovdqu OWORD PTR [rcx+20], xmm1
+ vmovss DWORD PTR [rcx+16], xmm2 ; Remaining 4 bytes of each 20-byte group come from the high lane's low dword.
+ vmovss DWORD PTR [rcx+36], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+64] ; Round 2 of 8: input bytes 64..127 -> output bytes 40..79.
+ vmovdqu ymm1, YMMWORD PTR [rdx+96]
+ vpmullw ymm2, ymm0, ymm7
+ vpmullw ymm4, ymm1, ymm7
+ vpaddw ymm3, ymm0, ymm12
+ vpaddw ymm5, ymm1, ymm12
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm1, ymm1, 3
+ vpmulhuw ymm0, ymm0, ymm6
+ vpmulhuw ymm1, ymm1, ymm6
+ vpsubw ymm3, ymm2, ymm3
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm2, ymm2, ymm3
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm2, ymm2, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm2
+ vpsubw ymm1, ymm1, ymm4
+ vpmulhrsw ymm0, ymm0, ymm13
+ vpmulhrsw ymm1, ymm1, ymm13
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpmaddwd ymm0, ymm0, ymm8
+ vpmaddwd ymm1, ymm1, ymm8
+ vpsllvd ymm0, ymm0, ymm10
+ vpsllvd ymm1, ymm1, ymm10
+ vpsrlq ymm0, ymm0, 12
+ vpsrlq ymm1, ymm1, 12
+ vpshufb ymm0, ymm0, ymm11
+ vpshufb ymm1, ymm1, ymm11
+ vextracti128 xmm2, ymm0, 1
+ vextracti128 xmm4, ymm1, 1
+ vpblendw xmm0, xmm0, xmm2, 224
+ vpblendw xmm1, xmm1, xmm4, 224
+ vmovdqu OWORD PTR [rcx+40], xmm0
+ vmovdqu OWORD PTR [rcx+60], xmm1
+ vmovss DWORD PTR [rcx+56], xmm2
+ vmovss DWORD PTR [rcx+76], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+128] ; Round 3 of 8: input bytes 128..191 -> output bytes 80..119.
+ vmovdqu ymm1, YMMWORD PTR [rdx+160]
+ vpmullw ymm2, ymm0, ymm7
+ vpmullw ymm4, ymm1, ymm7
+ vpaddw ymm3, ymm0, ymm12
+ vpaddw ymm5, ymm1, ymm12
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm1, ymm1, 3
+ vpmulhuw ymm0, ymm0, ymm6
+ vpmulhuw ymm1, ymm1, ymm6
+ vpsubw ymm3, ymm2, ymm3
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm2, ymm2, ymm3
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm2, ymm2, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm2
+ vpsubw ymm1, ymm1, ymm4
+ vpmulhrsw ymm0, ymm0, ymm13
+ vpmulhrsw ymm1, ymm1, ymm13
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpmaddwd ymm0, ymm0, ymm8
+ vpmaddwd ymm1, ymm1, ymm8
+ vpsllvd ymm0, ymm0, ymm10
+ vpsllvd ymm1, ymm1, ymm10
+ vpsrlq ymm0, ymm0, 12
+ vpsrlq ymm1, ymm1, 12
+ vpshufb ymm0, ymm0, ymm11
+ vpshufb ymm1, ymm1, ymm11
+ vextracti128 xmm2, ymm0, 1
+ vextracti128 xmm4, ymm1, 1
+ vpblendw xmm0, xmm0, xmm2, 224
+ vpblendw xmm1, xmm1, xmm4, 224
+ vmovdqu OWORD PTR [rcx+80], xmm0
+ vmovdqu OWORD PTR [rcx+100], xmm1
+ vmovss DWORD PTR [rcx+96], xmm2
+ vmovss DWORD PTR [rcx+116], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+192] ; Round 4 of 8: input bytes 192..255 -> output bytes 120..159.
+ vmovdqu ymm1, YMMWORD PTR [rdx+224]
+ vpmullw ymm2, ymm0, ymm7
+ vpmullw ymm4, ymm1, ymm7
+ vpaddw ymm3, ymm0, ymm12
+ vpaddw ymm5, ymm1, ymm12
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm1, ymm1, 3
+ vpmulhuw ymm0, ymm0, ymm6
+ vpmulhuw ymm1, ymm1, ymm6
+ vpsubw ymm3, ymm2, ymm3
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm2, ymm2, ymm3
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm2, ymm2, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm2
+ vpsubw ymm1, ymm1, ymm4
+ vpmulhrsw ymm0, ymm0, ymm13
+ vpmulhrsw ymm1, ymm1, ymm13
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpmaddwd ymm0, ymm0, ymm8
+ vpmaddwd ymm1, ymm1, ymm8
+ vpsllvd ymm0, ymm0, ymm10
+ vpsllvd ymm1, ymm1, ymm10
+ vpsrlq ymm0, ymm0, 12
+ vpsrlq ymm1, ymm1, 12
+ vpshufb ymm0, ymm0, ymm11
+ vpshufb ymm1, ymm1, ymm11
+ vextracti128 xmm2, ymm0, 1
+ vextracti128 xmm4, ymm1, 1
+ vpblendw xmm0, xmm0, xmm2, 224
+ vpblendw xmm1, xmm1, xmm4, 224
+ vmovdqu OWORD PTR [rcx+120], xmm0
+ vmovdqu OWORD PTR [rcx+140], xmm1
+ vmovss DWORD PTR [rcx+136], xmm2
+ vmovss DWORD PTR [rcx+156], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+256] ; Round 5 of 8: input bytes 256..319 -> output bytes 160..199.
+ vmovdqu ymm1, YMMWORD PTR [rdx+288]
+ vpmullw ymm2, ymm0, ymm7
+ vpmullw ymm4, ymm1, ymm7
+ vpaddw ymm3, ymm0, ymm12
+ vpaddw ymm5, ymm1, ymm12
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm1, ymm1, 3
+ vpmulhuw ymm0, ymm0, ymm6
+ vpmulhuw ymm1, ymm1, ymm6
+ vpsubw ymm3, ymm2, ymm3
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm2, ymm2, ymm3
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm2, ymm2, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm2
+ vpsubw ymm1, ymm1, ymm4
+ vpmulhrsw ymm0, ymm0, ymm13
+ vpmulhrsw ymm1, ymm1, ymm13
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpmaddwd ymm0, ymm0, ymm8
+ vpmaddwd ymm1, ymm1, ymm8
+ vpsllvd ymm0, ymm0, ymm10
+ vpsllvd ymm1, ymm1, ymm10
+ vpsrlq ymm0, ymm0, 12
+ vpsrlq ymm1, ymm1, 12
+ vpshufb ymm0, ymm0, ymm11
+ vpshufb ymm1, ymm1, ymm11
+ vextracti128 xmm2, ymm0, 1
+ vextracti128 xmm4, ymm1, 1
+ vpblendw xmm0, xmm0, xmm2, 224
+ vpblendw xmm1, xmm1, xmm4, 224
+ vmovdqu OWORD PTR [rcx+160], xmm0
+ vmovdqu OWORD PTR [rcx+180], xmm1
+ vmovss DWORD PTR [rcx+176], xmm2
+ vmovss DWORD PTR [rcx+196], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+320] ; Round 6 of 8: input bytes 320..383 -> output bytes 200..239.
+ vmovdqu ymm1, YMMWORD PTR [rdx+352]
+ vpmullw ymm2, ymm0, ymm7
+ vpmullw ymm4, ymm1, ymm7
+ vpaddw ymm3, ymm0, ymm12
+ vpaddw ymm5, ymm1, ymm12
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm1, ymm1, 3
+ vpmulhuw ymm0, ymm0, ymm6
+ vpmulhuw ymm1, ymm1, ymm6
+ vpsubw ymm3, ymm2, ymm3
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm2, ymm2, ymm3
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm2, ymm2, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm2
+ vpsubw ymm1, ymm1, ymm4
+ vpmulhrsw ymm0, ymm0, ymm13
+ vpmulhrsw ymm1, ymm1, ymm13
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpmaddwd ymm0, ymm0, ymm8
+ vpmaddwd ymm1, ymm1, ymm8
+ vpsllvd ymm0, ymm0, ymm10
+ vpsllvd ymm1, ymm1, ymm10
+ vpsrlq ymm0, ymm0, 12
+ vpsrlq ymm1, ymm1, 12
+ vpshufb ymm0, ymm0, ymm11
+ vpshufb ymm1, ymm1, ymm11
+ vextracti128 xmm2, ymm0, 1
+ vextracti128 xmm4, ymm1, 1
+ vpblendw xmm0, xmm0, xmm2, 224
+ vpblendw xmm1, xmm1, xmm4, 224
+ vmovdqu OWORD PTR [rcx+200], xmm0
+ vmovdqu OWORD PTR [rcx+220], xmm1
+ vmovss DWORD PTR [rcx+216], xmm2
+ vmovss DWORD PTR [rcx+236], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+384] ; Round 7 of 8: input bytes 384..447 -> output bytes 240..279.
+ vmovdqu ymm1, YMMWORD PTR [rdx+416]
+ vpmullw ymm2, ymm0, ymm7
+ vpmullw ymm4, ymm1, ymm7
+ vpaddw ymm3, ymm0, ymm12
+ vpaddw ymm5, ymm1, ymm12
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm1, ymm1, 3
+ vpmulhuw ymm0, ymm0, ymm6
+ vpmulhuw ymm1, ymm1, ymm6
+ vpsubw ymm3, ymm2, ymm3
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm2, ymm2, ymm3
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm2, ymm2, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm2
+ vpsubw ymm1, ymm1, ymm4
+ vpmulhrsw ymm0, ymm0, ymm13
+ vpmulhrsw ymm1, ymm1, ymm13
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpmaddwd ymm0, ymm0, ymm8
+ vpmaddwd ymm1, ymm1, ymm8
+ vpsllvd ymm0, ymm0, ymm10
+ vpsllvd ymm1, ymm1, ymm10
+ vpsrlq ymm0, ymm0, 12
+ vpsrlq ymm1, ymm1, 12
+ vpshufb ymm0, ymm0, ymm11
+ vpshufb ymm1, ymm1, ymm11
+ vextracti128 xmm2, ymm0, 1
+ vextracti128 xmm4, ymm1, 1
+ vpblendw xmm0, xmm0, xmm2, 224
+ vpblendw xmm1, xmm1, xmm4, 224
+ vmovdqu OWORD PTR [rcx+240], xmm0
+ vmovdqu OWORD PTR [rcx+260], xmm1
+ vmovss DWORD PTR [rcx+256], xmm2
+ vmovss DWORD PTR [rcx+276], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+448] ; Round 8 of 8: input bytes 448..511 -> output bytes 280..319.
+ vmovdqu ymm1, YMMWORD PTR [rdx+480]
+ vpmullw ymm2, ymm0, ymm7
+ vpmullw ymm4, ymm1, ymm7
+ vpaddw ymm3, ymm0, ymm12
+ vpaddw ymm5, ymm1, ymm12
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm1, ymm1, 3
+ vpmulhuw ymm0, ymm0, ymm6
+ vpmulhuw ymm1, ymm1, ymm6
+ vpsubw ymm3, ymm2, ymm3
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm2, ymm2, ymm3
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm2, ymm2, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm2
+ vpsubw ymm1, ymm1, ymm4
+ vpmulhrsw ymm0, ymm0, ymm13
+ vpmulhrsw ymm1, ymm1, ymm13
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpmaddwd ymm0, ymm0, ymm8
+ vpmaddwd ymm1, ymm1, ymm8
+ vpsllvd ymm0, ymm0, ymm10
+ vpsllvd ymm1, ymm1, ymm10
+ vpsrlq ymm0, ymm0, 12
+ vpsrlq ymm1, ymm1, 12
+ vpshufb ymm0, ymm0, ymm11
+ vpshufb ymm1, ymm1, ymm11
+ vextracti128 xmm2, ymm0, 1
+ vextracti128 xmm4, ymm1, 1
+ vpblendw xmm0, xmm0, xmm2, 224
+ vpblendw xmm1, xmm1, xmm4, 224
+ vmovdqu OWORD PTR [rcx+280], xmm0
+ vmovdqu OWORD PTR [rcx+300], xmm1
+ vmovss DWORD PTR [rcx+296], xmm2
+ vmovss DWORD PTR [rcx+316], xmm4
+ add rcx, 320 ; Advance output by the 320 bytes just written.
+ add rdx, 512 ; Advance input by the 512 bytes just read.
+ sub r8d, 1 ; Counter is tested after the body, so the body runs at least once.
+ jg L_mlkem_compress_10_avx2_start
+ vzeroupper ; Clear the upper YMM halves before reloading the saved XMM registers and returning.
+ vmovdqu xmm6, OWORD PTR [rsp] ; Restore xmm6-xmm13 from the spill area.
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ add rsp, 128
+ ret
+mlkem_compress_10_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16 ; Tables are 32 bytes but only 16-byte aligned; mlkem_decompress_10_avx2 loads them with the unaligned vmovdqu.
+L_mlkem_decompress_10_avx2_mask DWORD 7fe01ff8h, 7fe01ff8h, 7fe01ff8h, 7fe01ff8h ; Word pairs (1ff8h, 7fe0h): 10-bit windows at bits 3-12 (even words) and 5-14 (odd words).
+ DWORD 7fe01ff8h, 7fe01ff8h, 7fe01ff8h, 7fe01ff8h
+ptr_L_mlkem_decompress_10_avx2_mask QWORD L_mlkem_decompress_10_avx2_mask ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_10_avx2_sllv QWORD 0000000000000004h, 0000000000000004h ; Dword pairs (4, 0): vpsllvd counts, so only even dwords are shifted left by 4.
+ QWORD 0000000000000004h, 0000000000000004h
+ptr_L_mlkem_decompress_10_avx2_sllv QWORD L_mlkem_decompress_10_avx2_sllv ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_10_avx2_q DWORD 0d013404h, 0d013404h, 0d013404h, 0d013404h ; Word pairs (3404h, 0d01h) = (13316, 3329) = (4*3329, 3329): vpmulhrsw multipliers matching the two mask windows.
+ DWORD 0d013404h, 0d013404h, 0d013404h, 0d013404h
+ptr_L_mlkem_decompress_10_avx2_q QWORD L_mlkem_decompress_10_avx2_q ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_10_avx2_shuf BYTE 00h, 01h, 01h, 02h, 02h, 03h, 03h, 04h ; vpshufb indices: each output word receives the two adjacent source bytes that contain one 10-bit field.
+ BYTE 05h, 06h, 06h, 07h, 07h, 08h, 08h, 09h
+ BYTE 02h, 03h, 03h, 04h, 04h, 05h, 05h, 06h ; High-lane indices start at 2 because that lane is loaded from source byte 8 (see vpermq 148 in the PROC).
+ BYTE 07h, 08h, 08h, 09h, 09h, 0ah, 0ah, 0bh
+ptr_L_mlkem_decompress_10_avx2_shuf QWORD L_mlkem_decompress_10_avx2_shuf ; Holds the address of the table above.
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_decompress_10_avx2 PROC ; rcx = output 16-bit words, rdx = packed input, r8d = iteration count; each iteration expands 320 bytes (256 10-bit fields) into 512 bytes.
+ sub rsp, 32 ; Reserve 2 x 16 bytes to spill xmm6-xmm7, which are restored before ret.
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu ymm4, YMMWORD PTR L_mlkem_decompress_10_avx2_mask ; ymm4 = field masks (1ff8h, 7fe0h).
+ vmovdqu ymm5, YMMWORD PTR L_mlkem_decompress_10_avx2_q ; ymm5 = multipliers (4*3329, 3329).
+ vmovdqu ymm6, YMMWORD PTR L_mlkem_decompress_10_avx2_shuf ; ymm6 = byte-spreading shuffle.
+ vmovdqu ymm7, YMMWORD PTR L_mlkem_decompress_10_avx2_sllv ; ymm7 = per-dword left-shift counts (4, 0).
+L_mlkem_decompress_10_avx2_start:
+ vpermq ymm0, [rdx], 148 ; Round 1 of 4. imm 148 (94h) selects qwords 0,1,1,2, so the high lane holds source bytes 8..23; each load reads 32 bytes.
+ vpermq ymm1, [rdx+20], 148
+ vpermq ymm2, [rdx+40], 148
+ vpermq ymm3, [rdx+60], 148
+ vpshufb ymm0, ymm0, ymm6 ; Place the two bytes holding each 10-bit field into its own word.
+ vpshufb ymm1, ymm1, ymm6
+ vpshufb ymm2, ymm2, ymm6
+ vpshufb ymm3, ymm3, ymm6
+ vpsllvd ymm0, ymm0, ymm7 ; Shift even dwords left by 4 so all even words share one bit offset and all odd words another.
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpsrlw ymm0, ymm0, 1 ; Move fields to bits 3-12 (even words) and 5-14 (odd words), matching the mask.
+ vpsrlw ymm1, ymm1, 1
+ vpsrlw ymm2, ymm2, 1
+ vpsrlw ymm3, ymm3, 1
+ vpand ymm0, ymm0, ymm4 ; Isolate the 10-bit field in each word.
+ vpand ymm1, ymm1, ymm4
+ vpand ymm2, ymm2, ymm4
+ vpand ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm5 ; Rounded multiply; with the field offsets above both word positions give round(field * 3329 / 1024).
+ vpmulhrsw ymm1, ymm1, ymm5
+ vpmulhrsw ymm2, ymm2, ymm5
+ vpmulhrsw ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rcx], ymm0
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vpermq ymm0, [rdx+80], 148 ; Round 2 of 4: input bytes 80..159 -> output bytes 128..255.
+ vpermq ymm1, [rdx+100], 148
+ vpermq ymm2, [rdx+120], 148
+ vpermq ymm3, [rdx+140], 148
+ vpshufb ymm0, ymm0, ymm6
+ vpshufb ymm1, ymm1, ymm6
+ vpshufb ymm2, ymm2, ymm6
+ vpshufb ymm3, ymm3, ymm6
+ vpsllvd ymm0, ymm0, ymm7
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpsrlw ymm0, ymm0, 1
+ vpsrlw ymm1, ymm1, 1
+ vpsrlw ymm2, ymm2, 1
+ vpsrlw ymm3, ymm3, 1
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpand ymm2, ymm2, ymm4
+ vpand ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm5
+ vpmulhrsw ymm1, ymm1, ymm5
+ vpmulhrsw ymm2, ymm2, ymm5
+ vpmulhrsw ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+160], ymm1
+ vmovdqu YMMWORD PTR [rcx+192], ymm2
+ vmovdqu YMMWORD PTR [rcx+224], ymm3
+ vpermq ymm0, [rdx+160], 148 ; Round 3 of 4: input bytes 160..239 -> output bytes 256..383.
+ vpermq ymm1, [rdx+180], 148
+ vpermq ymm2, [rdx+200], 148
+ vpermq ymm3, [rdx+220], 148
+ vpshufb ymm0, ymm0, ymm6
+ vpshufb ymm1, ymm1, ymm6
+ vpshufb ymm2, ymm2, ymm6
+ vpshufb ymm3, ymm3, ymm6
+ vpsllvd ymm0, ymm0, ymm7
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpsrlw ymm0, ymm0, 1
+ vpsrlw ymm1, ymm1, 1
+ vpsrlw ymm2, ymm2, 1
+ vpsrlw ymm3, ymm3, 1
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpand ymm2, ymm2, ymm4
+ vpand ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm5
+ vpmulhrsw ymm1, ymm1, ymm5
+ vpmulhrsw ymm2, ymm2, ymm5
+ vpmulhrsw ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vpermq ymm0, [rdx+240], 148 ; Round 4 of 4: input bytes 240..319 -> output bytes 384..511.
+ vpermq ymm1, [rdx+260], 148
+ vpermq ymm2, [rdx+280], 148
+ vpermq ymm3, [rdx+300], 148 ; NOTE(review): this 32-byte load covers rdx+300..rdx+331, 12 bytes past the 320 consumed per iteration - confirm the caller's buffer allows it on the final iteration.
+ vpshufb ymm0, ymm0, ymm6
+ vpshufb ymm1, ymm1, ymm6
+ vpshufb ymm2, ymm2, ymm6
+ vpshufb ymm3, ymm3, ymm6
+ vpsllvd ymm0, ymm0, ymm7
+ vpsllvd ymm1, ymm1, ymm7
+ vpsllvd ymm2, ymm2, ymm7
+ vpsllvd ymm3, ymm3, ymm7
+ vpsrlw ymm0, ymm0, 1
+ vpsrlw ymm1, ymm1, 1
+ vpsrlw ymm2, ymm2, 1
+ vpsrlw ymm3, ymm3, 1
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpand ymm2, ymm2, ymm4
+ vpand ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm5
+ vpmulhrsw ymm1, ymm1, ymm5
+ vpmulhrsw ymm2, ymm2, ymm5
+ vpmulhrsw ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rcx+384], ymm0
+ vmovdqu YMMWORD PTR [rcx+416], ymm1
+ vmovdqu YMMWORD PTR [rcx+448], ymm2
+ vmovdqu YMMWORD PTR [rcx+480], ymm3
+ add rdx, 320 ; Advance input by the 320 bytes just consumed.
+ add rcx, 512 ; Advance output by the 512 bytes just written.
+ sub r8d, 1 ; Counter is tested after the body, so the body runs at least once.
+ jg L_mlkem_decompress_10_avx2_start
+ vzeroupper ; Clear the upper YMM halves before reloading the saved XMM registers and returning.
+ vmovdqu xmm6, OWORD PTR [rsp] ; Restore xmm6-xmm7 from the spill area.
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ add rsp, 32
+ ret
+mlkem_decompress_10_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16 ; Tables are 32 bytes but only 16-byte aligned; mlkem_compress_11_avx2 loads them with the unaligned vmovdqu.
+L_mlkem_compress_11_avx2_v WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh ; 16 words of 4ebfh (20159), the multiplier. NOTE(review): presumably a fixed-point reciprocal of q = 3329 - confirm.
+ WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh
+ptr_L_mlkem_compress_11_avx2_v QWORD L_mlkem_compress_11_avx2_v ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_off WORD 0024h, 0024h, 0024h, 0024h, 0024h, 0024h, 0024h, 0024h ; 16 words of 0024h (36): added to each input word in the correction-bit computation.
+ WORD 0024h, 0024h, 0024h, 0024h, 0024h, 0024h, 0024h, 0024h
+ptr_L_mlkem_compress_11_avx2_off QWORD L_mlkem_compress_11_avx2_off ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_shift13 WORD 2000h, 2000h, 2000h, 2000h, 2000h, 2000h, 2000h, 2000h ; 16 words of 2000h (2^13): vpmulhrsw by this value yields (a + 2) >> 2.
+ WORD 2000h, 2000h, 2000h, 2000h, 2000h, 2000h, 2000h, 2000h
+ptr_L_mlkem_compress_11_avx2_shift13 QWORD L_mlkem_compress_11_avx2_shift13 ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_mask WORD 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh ; 16 words of 07ffh: keeps the low 11 bits of each word.
+ WORD 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh
+ptr_L_mlkem_compress_11_avx2_mask QWORD L_mlkem_compress_11_avx2_mask ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_shift QWORD 0800000108000001h, 0800000108000001h ; Word pairs (0001h, 0800h): vpmaddwd multipliers giving w0 + w1*2048 per dword.
+ QWORD 0800000108000001h, 0800000108000001h
+ptr_L_mlkem_compress_11_avx2_shift QWORD L_mlkem_compress_11_avx2_shift ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_sllvd DWORD 0000000ah, 00000000h, 0000000ah, 00000000h ; vpsllvd counts (10, 0, 10, 0): only even dwords are shifted left by 10.
+ DWORD 0000000ah, 00000000h, 0000000ah, 00000000h
+ptr_L_mlkem_compress_11_avx2_sllvd QWORD L_mlkem_compress_11_avx2_sllvd ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_srlvq QWORD 000000000000000ah, 000000000000001eh ; vpsrlvq counts (10, 30) for the low and high qword of each 128-bit lane.
+ QWORD 000000000000000ah, 000000000000001eh
+ptr_L_mlkem_compress_11_avx2_srlvq QWORD L_mlkem_compress_11_avx2_srlvq ; Holds the address of the table above.
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_shuf BYTE 00h, 01h, 02h, 03h, 04h, 05h, 06h, 07h ; Low lane: keep bytes 0-10, zero bytes 11-15. The low 16 bytes also serve as the vpblendvb selector (0ffh = take high-lane byte).
+ BYTE 08h, 09h, 0ah, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 05h, 06h, 07h, 08h, 09h, 0ah, 0ffh, 0ffh ; High lane: bytes 5-10 go to positions 0-5 (stored with vmovq), bytes 0-4 go to positions 11-15 (blended into the low lane).
+ BYTE 0ffh, 0ffh, 00h, 00h, 01h, 02h, 03h, 04h
+ptr_L_mlkem_compress_11_avx2_shuf QWORD L_mlkem_compress_11_avx2_shuf ; Holds the address of the table above.
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_compress_11_avx2 PROC ; rcx = output, rdx = input 16-bit words, r8d = iteration count; each iteration packs 256 words (512 bytes) into 256 11-bit fields (352 bytes).
+ sub rsp, 144 ; Reserve 9 x 16 bytes to spill xmm6-xmm14, which are restored before ret.
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu ymm0, YMMWORD PTR [rdx] ; This value is overwritten by the identical load at the top of the loop.
+ vmovdqu ymm7, YMMWORD PTR L_mlkem_compress_11_avx2_v ; ymm7 = v = 4ebfh per word.
+ vmovdqu ymm8, YMMWORD PTR L_mlkem_compress_11_avx2_off ; ymm8 = 36 per word.
+ vmovdqu ymm9, YMMWORD PTR L_mlkem_compress_11_avx2_shift13 ; ymm9 = 2000h per word.
+ vmovdqu ymm10, YMMWORD PTR L_mlkem_compress_11_avx2_mask ; ymm10 = 07ffh per word.
+ vmovdqu ymm11, YMMWORD PTR L_mlkem_compress_11_avx2_shift ; ymm11 = vpmaddwd multipliers (1, 2048).
+ vmovdqu ymm12, YMMWORD PTR L_mlkem_compress_11_avx2_sllvd ; ymm12 = per-dword left-shift counts (10, 0).
+ vmovdqu ymm13, YMMWORD PTR L_mlkem_compress_11_avx2_srlvq ; ymm13 = per-qword right-shift counts (10, 30).
+ vmovdqu ymm14, YMMWORD PTR L_mlkem_compress_11_avx2_shuf ; ymm14 = byte shuffle; xmm14 doubles as the blend selector.
+ vpsllw ymm6, ymm7, 3 ; ymm6 = v << 3 per word (truncated to 16 bits).
+L_mlkem_compress_11_avx2_start:
+ vmovdqu ymm0, YMMWORD PTR [rdx] ; Round 1 of 8: input bytes 0..63 -> output bytes 0..43.
+ vmovdqu ymm3, YMMWORD PTR [rdx+32]
+ vpmullw ymm1, ymm0, ymm6 ; ymm1 = low 16 bits of x * (v << 3).
+ vpmullw ymm4, ymm3, ymm6
+ vpaddw ymm2, ymm0, ymm8 ; ymm2 = x + 36.
+ vpaddw ymm5, ymm3, ymm8
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm3, ymm3, 3
+ vpmulhw ymm0, ymm0, ymm7 ; ymm0 = high 16 bits of the signed product (x << 3) * v.
+ vpmulhw ymm3, ymm3, ymm7
+ vpsubw ymm2, ymm1, ymm2 ; ymm2 = low product - (x + 36).
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm1, ymm1, ymm2 ; ymm1 = (NOT low product) AND ymm2; only its top bit is used below.
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm1, ymm1, 15 ; Correction bit: 1 when the low product has bit 15 clear and the difference has bit 15 set.
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm1 ; Subtract the correction bit from the high product.
+ vpsubw ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm9 ; Rounded scale by 2^13/2^15, i.e. (a + 2) >> 2.
+ vpmulhrsw ymm3, ymm3, ymm9
+ vpand ymm0, ymm0, ymm10 ; Keep 11 bits per word.
+ vpand ymm3, ymm3, ymm10
+ vpmaddwd ymm0, ymm0, ymm11 ; Merge each word pair into one 22-bit value per dword (w0 | w1 << 11).
+ vpmaddwd ymm3, ymm3, ymm11
+ vpsllvd ymm0, ymm0, ymm12 ; Shift even dwords left by 10 so each qword holds its two 22-bit values back to back from bit 10.
+ vpsllvd ymm3, ymm3, ymm12
+ vpsrldq ymm1, ymm0, 8 ; Copy each lane's high qword into the low qword position (upper 8 bytes zeroed).
+ vpsrldq ymm4, ymm3, 8
+ vpsrlvq ymm0, ymm0, ymm13 ; Low qword >> 10 leaves 44 bits; high qword >> 30 leaves the bits that do not fit in the low qword.
+ vpsrlvq ymm3, ymm3, ymm13
+ vpsllq ymm1, ymm1, 34 ; Position the low 20 bits of the high qword's payload at bit 44.
+ vpsllq ymm4, ymm4, 34
+ vpaddq ymm0, ymm0, ymm1 ; Each lane now holds 88 contiguous bits (11 bytes).
+ vpaddq ymm3, ymm3, ymm4
+ vpshufb ymm0, ymm0, ymm14 ; Arrange lane bytes for the blend and stores below.
+ vpshufb ymm3, ymm3, ymm14
+ vextracti128 xmm1, ymm0, 1
+ vextracti128 xmm4, ymm3, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm14 ; Bytes 11-15 come from the high lane: 16 of the 22 output bytes.
+ vpblendvb xmm3, xmm3, xmm4, xmm14
+ vmovdqu OWORD PTR [rcx], xmm0
+ vmovq QWORD PTR [rcx+16], xmm1 ; Writes 8 bytes, 6 of them payload; the next store at +22 overwrites the extra 2, so store order matters.
+ vmovdqu OWORD PTR [rcx+22], xmm3
+ vmovq QWORD PTR [rcx+38], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+64] ; Round 2 of 8: input bytes 64..127 -> output bytes 44..87.
+ vmovdqu ymm3, YMMWORD PTR [rdx+96]
+ vpmullw ymm1, ymm0, ymm6
+ vpmullw ymm4, ymm3, ymm6
+ vpaddw ymm2, ymm0, ymm8
+ vpaddw ymm5, ymm3, ymm8
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm3, ymm3, 3
+ vpmulhw ymm0, ymm0, ymm7
+ vpmulhw ymm3, ymm3, ymm7
+ vpsubw ymm2, ymm1, ymm2
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm1, ymm1, ymm2
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm1, ymm1, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm1
+ vpsubw ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm9
+ vpmulhrsw ymm3, ymm3, ymm9
+ vpand ymm0, ymm0, ymm10
+ vpand ymm3, ymm3, ymm10
+ vpmaddwd ymm0, ymm0, ymm11
+ vpmaddwd ymm3, ymm3, ymm11
+ vpsllvd ymm0, ymm0, ymm12
+ vpsllvd ymm3, ymm3, ymm12
+ vpsrldq ymm1, ymm0, 8
+ vpsrldq ymm4, ymm3, 8
+ vpsrlvq ymm0, ymm0, ymm13
+ vpsrlvq ymm3, ymm3, ymm13
+ vpsllq ymm1, ymm1, 34
+ vpsllq ymm4, ymm4, 34
+ vpaddq ymm0, ymm0, ymm1
+ vpaddq ymm3, ymm3, ymm4
+ vpshufb ymm0, ymm0, ymm14
+ vpshufb ymm3, ymm3, ymm14
+ vextracti128 xmm1, ymm0, 1
+ vextracti128 xmm4, ymm3, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm14
+ vpblendvb xmm3, xmm3, xmm4, xmm14
+ vmovdqu OWORD PTR [rcx+44], xmm0
+ vmovq QWORD PTR [rcx+60], xmm1
+ vmovdqu OWORD PTR [rcx+66], xmm3
+ vmovq QWORD PTR [rcx+82], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+128] ; Round 3 of 8: input bytes 128..191 -> output bytes 88..131.
+ vmovdqu ymm3, YMMWORD PTR [rdx+160]
+ vpmullw ymm1, ymm0, ymm6
+ vpmullw ymm4, ymm3, ymm6
+ vpaddw ymm2, ymm0, ymm8
+ vpaddw ymm5, ymm3, ymm8
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm3, ymm3, 3
+ vpmulhw ymm0, ymm0, ymm7
+ vpmulhw ymm3, ymm3, ymm7
+ vpsubw ymm2, ymm1, ymm2
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm1, ymm1, ymm2
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm1, ymm1, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm1
+ vpsubw ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm9
+ vpmulhrsw ymm3, ymm3, ymm9
+ vpand ymm0, ymm0, ymm10
+ vpand ymm3, ymm3, ymm10
+ vpmaddwd ymm0, ymm0, ymm11
+ vpmaddwd ymm3, ymm3, ymm11
+ vpsllvd ymm0, ymm0, ymm12
+ vpsllvd ymm3, ymm3, ymm12
+ vpsrldq ymm1, ymm0, 8
+ vpsrldq ymm4, ymm3, 8
+ vpsrlvq ymm0, ymm0, ymm13
+ vpsrlvq ymm3, ymm3, ymm13
+ vpsllq ymm1, ymm1, 34
+ vpsllq ymm4, ymm4, 34
+ vpaddq ymm0, ymm0, ymm1
+ vpaddq ymm3, ymm3, ymm4
+ vpshufb ymm0, ymm0, ymm14
+ vpshufb ymm3, ymm3, ymm14
+ vextracti128 xmm1, ymm0, 1
+ vextracti128 xmm4, ymm3, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm14
+ vpblendvb xmm3, xmm3, xmm4, xmm14
+ vmovdqu OWORD PTR [rcx+88], xmm0
+ vmovq QWORD PTR [rcx+104], xmm1
+ vmovdqu OWORD PTR [rcx+110], xmm3
+ vmovq QWORD PTR [rcx+126], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+192] ; Round 4 of 8: input bytes 192..255 -> output bytes 132..175.
+ vmovdqu ymm3, YMMWORD PTR [rdx+224]
+ vpmullw ymm1, ymm0, ymm6
+ vpmullw ymm4, ymm3, ymm6
+ vpaddw ymm2, ymm0, ymm8
+ vpaddw ymm5, ymm3, ymm8
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm3, ymm3, 3
+ vpmulhw ymm0, ymm0, ymm7
+ vpmulhw ymm3, ymm3, ymm7
+ vpsubw ymm2, ymm1, ymm2
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm1, ymm1, ymm2
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm1, ymm1, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm1
+ vpsubw ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm9
+ vpmulhrsw ymm3, ymm3, ymm9
+ vpand ymm0, ymm0, ymm10
+ vpand ymm3, ymm3, ymm10
+ vpmaddwd ymm0, ymm0, ymm11
+ vpmaddwd ymm3, ymm3, ymm11
+ vpsllvd ymm0, ymm0, ymm12
+ vpsllvd ymm3, ymm3, ymm12
+ vpsrldq ymm1, ymm0, 8
+ vpsrldq ymm4, ymm3, 8
+ vpsrlvq ymm0, ymm0, ymm13
+ vpsrlvq ymm3, ymm3, ymm13
+ vpsllq ymm1, ymm1, 34
+ vpsllq ymm4, ymm4, 34
+ vpaddq ymm0, ymm0, ymm1
+ vpaddq ymm3, ymm3, ymm4
+ vpshufb ymm0, ymm0, ymm14
+ vpshufb ymm3, ymm3, ymm14
+ vextracti128 xmm1, ymm0, 1
+ vextracti128 xmm4, ymm3, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm14
+ vpblendvb xmm3, xmm3, xmm4, xmm14
+ vmovdqu OWORD PTR [rcx+132], xmm0
+ vmovq QWORD PTR [rcx+148], xmm1
+ vmovdqu OWORD PTR [rcx+154], xmm3
+ vmovq QWORD PTR [rcx+170], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+256] ; Round 5 of 8: input bytes 256..319 -> output bytes 176..219.
+ vmovdqu ymm3, YMMWORD PTR [rdx+288]
+ vpmullw ymm1, ymm0, ymm6
+ vpmullw ymm4, ymm3, ymm6
+ vpaddw ymm2, ymm0, ymm8
+ vpaddw ymm5, ymm3, ymm8
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm3, ymm3, 3
+ vpmulhw ymm0, ymm0, ymm7
+ vpmulhw ymm3, ymm3, ymm7
+ vpsubw ymm2, ymm1, ymm2
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm1, ymm1, ymm2
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm1, ymm1, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm1
+ vpsubw ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm9
+ vpmulhrsw ymm3, ymm3, ymm9
+ vpand ymm0, ymm0, ymm10
+ vpand ymm3, ymm3, ymm10
+ vpmaddwd ymm0, ymm0, ymm11
+ vpmaddwd ymm3, ymm3, ymm11
+ vpsllvd ymm0, ymm0, ymm12
+ vpsllvd ymm3, ymm3, ymm12
+ vpsrldq ymm1, ymm0, 8
+ vpsrldq ymm4, ymm3, 8
+ vpsrlvq ymm0, ymm0, ymm13
+ vpsrlvq ymm3, ymm3, ymm13
+ vpsllq ymm1, ymm1, 34
+ vpsllq ymm4, ymm4, 34
+ vpaddq ymm0, ymm0, ymm1
+ vpaddq ymm3, ymm3, ymm4
+ vpshufb ymm0, ymm0, ymm14
+ vpshufb ymm3, ymm3, ymm14
+ vextracti128 xmm1, ymm0, 1
+ vextracti128 xmm4, ymm3, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm14
+ vpblendvb xmm3, xmm3, xmm4, xmm14
+ vmovdqu OWORD PTR [rcx+176], xmm0
+ vmovq QWORD PTR [rcx+192], xmm1
+ vmovdqu OWORD PTR [rcx+198], xmm3
+ vmovq QWORD PTR [rcx+214], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+320] ; Round 6 of 8: input bytes 320..383 -> output bytes 220..263.
+ vmovdqu ymm3, YMMWORD PTR [rdx+352]
+ vpmullw ymm1, ymm0, ymm6
+ vpmullw ymm4, ymm3, ymm6
+ vpaddw ymm2, ymm0, ymm8
+ vpaddw ymm5, ymm3, ymm8
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm3, ymm3, 3
+ vpmulhw ymm0, ymm0, ymm7
+ vpmulhw ymm3, ymm3, ymm7
+ vpsubw ymm2, ymm1, ymm2
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm1, ymm1, ymm2
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm1, ymm1, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm1
+ vpsubw ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm9
+ vpmulhrsw ymm3, ymm3, ymm9
+ vpand ymm0, ymm0, ymm10
+ vpand ymm3, ymm3, ymm10
+ vpmaddwd ymm0, ymm0, ymm11
+ vpmaddwd ymm3, ymm3, ymm11
+ vpsllvd ymm0, ymm0, ymm12
+ vpsllvd ymm3, ymm3, ymm12
+ vpsrldq ymm1, ymm0, 8
+ vpsrldq ymm4, ymm3, 8
+ vpsrlvq ymm0, ymm0, ymm13
+ vpsrlvq ymm3, ymm3, ymm13
+ vpsllq ymm1, ymm1, 34
+ vpsllq ymm4, ymm4, 34
+ vpaddq ymm0, ymm0, ymm1
+ vpaddq ymm3, ymm3, ymm4
+ vpshufb ymm0, ymm0, ymm14
+ vpshufb ymm3, ymm3, ymm14
+ vextracti128 xmm1, ymm0, 1
+ vextracti128 xmm4, ymm3, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm14
+ vpblendvb xmm3, xmm3, xmm4, xmm14
+ vmovdqu OWORD PTR [rcx+220], xmm0
+ vmovq QWORD PTR [rcx+236], xmm1
+ vmovdqu OWORD PTR [rcx+242], xmm3
+ vmovq QWORD PTR [rcx+258], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+384] ; Round 7 of 8: input bytes 384..447 -> output bytes 264..307.
+ vmovdqu ymm3, YMMWORD PTR [rdx+416]
+ vpmullw ymm1, ymm0, ymm6
+ vpmullw ymm4, ymm3, ymm6
+ vpaddw ymm2, ymm0, ymm8
+ vpaddw ymm5, ymm3, ymm8
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm3, ymm3, 3
+ vpmulhw ymm0, ymm0, ymm7
+ vpmulhw ymm3, ymm3, ymm7
+ vpsubw ymm2, ymm1, ymm2
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm1, ymm1, ymm2
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm1, ymm1, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm1
+ vpsubw ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm9
+ vpmulhrsw ymm3, ymm3, ymm9
+ vpand ymm0, ymm0, ymm10
+ vpand ymm3, ymm3, ymm10
+ vpmaddwd ymm0, ymm0, ymm11
+ vpmaddwd ymm3, ymm3, ymm11
+ vpsllvd ymm0, ymm0, ymm12
+ vpsllvd ymm3, ymm3, ymm12
+ vpsrldq ymm1, ymm0, 8
+ vpsrldq ymm4, ymm3, 8
+ vpsrlvq ymm0, ymm0, ymm13
+ vpsrlvq ymm3, ymm3, ymm13
+ vpsllq ymm1, ymm1, 34
+ vpsllq ymm4, ymm4, 34
+ vpaddq ymm0, ymm0, ymm1
+ vpaddq ymm3, ymm3, ymm4
+ vpshufb ymm0, ymm0, ymm14
+ vpshufb ymm3, ymm3, ymm14
+ vextracti128 xmm1, ymm0, 1
+ vextracti128 xmm4, ymm3, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm14
+ vpblendvb xmm3, xmm3, xmm4, xmm14
+ vmovdqu OWORD PTR [rcx+264], xmm0
+ vmovq QWORD PTR [rcx+280], xmm1
+ vmovdqu OWORD PTR [rcx+286], xmm3
+ vmovq QWORD PTR [rcx+302], xmm4
+ vmovdqu ymm0, YMMWORD PTR [rdx+448] ; Round 8 of 8: input bytes 448..511 -> output bytes 308..351.
+ vmovdqu ymm3, YMMWORD PTR [rdx+480]
+ vpmullw ymm1, ymm0, ymm6
+ vpmullw ymm4, ymm3, ymm6
+ vpaddw ymm2, ymm0, ymm8
+ vpaddw ymm5, ymm3, ymm8
+ vpsllw ymm0, ymm0, 3
+ vpsllw ymm3, ymm3, 3
+ vpmulhw ymm0, ymm0, ymm7
+ vpmulhw ymm3, ymm3, ymm7
+ vpsubw ymm2, ymm1, ymm2
+ vpsubw ymm5, ymm4, ymm5
+ vpandn ymm1, ymm1, ymm2
+ vpandn ymm4, ymm4, ymm5
+ vpsrlw ymm1, ymm1, 15
+ vpsrlw ymm4, ymm4, 15
+ vpsubw ymm0, ymm0, ymm1
+ vpsubw ymm3, ymm3, ymm4
+ vpmulhrsw ymm0, ymm0, ymm9
+ vpmulhrsw ymm3, ymm3, ymm9
+ vpand ymm0, ymm0, ymm10
+ vpand ymm3, ymm3, ymm10
+ vpmaddwd ymm0, ymm0, ymm11
+ vpmaddwd ymm3, ymm3, ymm11
+ vpsllvd ymm0, ymm0, ymm12
+ vpsllvd ymm3, ymm3, ymm12
+ vpsrldq ymm1, ymm0, 8
+ vpsrldq ymm4, ymm3, 8
+ vpsrlvq ymm0, ymm0, ymm13
+ vpsrlvq ymm3, ymm3, ymm13
+ vpsllq ymm1, ymm1, 34
+ vpsllq ymm4, ymm4, 34
+ vpaddq ymm0, ymm0, ymm1
+ vpaddq ymm3, ymm3, ymm4
+ vpshufb ymm0, ymm0, ymm14
+ vpshufb ymm3, ymm3, ymm14
+ vextracti128 xmm1, ymm0, 1
+ vextracti128 xmm4, ymm3, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm14
+ vpblendvb xmm3, xmm3, xmm4, xmm14
+ vmovdqu OWORD PTR [rcx+308], xmm0
+ vmovq QWORD PTR [rcx+324], xmm1
+ vmovdqu OWORD PTR [rcx+330], xmm3
+ vmovq QWORD PTR [rcx+346], xmm4 ; NOTE(review): this 8-byte store covers rcx+346..rcx+353, 2 bytes past the 352 produced per iteration - confirm the caller's buffer allows it on the final iteration.
+ add rcx, 352 ; Advance output by the 352 payload bytes of this iteration.
+ add rdx, 512 ; Advance input by the 512 bytes just read.
+ sub r8d, 1 ; Counter is tested after the body, so the body runs at least once.
+ jg L_mlkem_compress_11_avx2_start
+ vzeroupper ; Clear the upper YMM halves before reloading the saved XMM registers and returning.
+ vmovdqu xmm6, OWORD PTR [rsp] ; Restore xmm6-xmm14 from the spill area.
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ add rsp, 144
+ ret
+mlkem_compress_11_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; constant tables for mlkem_decompress_11_avx2; each table is 32 bytes and is loaded whole into a ymm register
+ALIGN 16
+L_mlkem_decompress_11_avx2_q WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h ; 16 words of 0d01h (3329): multiplier for the final vpmulhrsw
+ WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h
+ptr_L_mlkem_decompress_11_avx2_q QWORD L_mlkem_decompress_11_avx2_q ; address of the table above (not referenced by the procedure below)
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_11_avx2_shuf BYTE 00h, 01h, 01h, 02h, 02h, 03h, 04h, 05h ; vpshufb control: source byte indices, two per output word
+ BYTE 05h, 06h, 06h, 07h, 08h, 09h, 09h, 0ah
+ BYTE 03h, 04h, 04h, 05h, 05h, 06h, 07h, 08h ; high lane: indices are relative to that lane, which holds input bytes 8-23 after vpermq
+ BYTE 08h, 09h, 09h, 0ah, 0bh, 0ch, 0ch, 0dh
+ptr_L_mlkem_decompress_11_avx2_shuf QWORD L_mlkem_decompress_11_avx2_shuf
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_11_avx2_sllv DWORD 00000000h, 00000001h, 00000000h, 00000000h ; per-dword shift counts; used with vpsrlvd (a right shift) despite the "sllv" name
+ DWORD 00000000h, 00000001h, 00000000h, 00000000h
+ptr_L_mlkem_decompress_11_avx2_sllv QWORD L_mlkem_decompress_11_avx2_sllv
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_11_avx2_srlv QWORD 0000000000000000h, 0000000000000002h ; per-qword right-shift counts for vpsrlvq
+ QWORD 0000000000000000h, 0000000000000002h
+ptr_L_mlkem_decompress_11_avx2_srlv QWORD L_mlkem_decompress_11_avx2_srlv
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_11_avx2_shift WORD 0020h, 0004h, 0001h, 0020h, 0008h, 0001h, 0020h, 0004h ; per-word vpmullw multipliers (powers of two, i.e. left shifts by 5, 2, 0, 5, 3, 0, 5, 2)
+ WORD 0020h, 0004h, 0001h, 0020h, 0008h, 0001h, 0020h, 0004h
+ptr_L_mlkem_decompress_11_avx2_shift QWORD L_mlkem_decompress_11_avx2_shift
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_11_avx2_mask WORD 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h ; keeps bits 4-14 (11 bits) of every word
+ WORD 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h
+ptr_L_mlkem_decompress_11_avx2_mask QWORD L_mlkem_decompress_11_avx2_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_decompress_11_avx2 PROC ; expands packed 11-bit fields into 16-bit words: rcx = output, rdx = packed input, r8d = pass count; each pass reads 352 bytes and writes 512 bytes
+ sub rsp, 64 ; room for the four xmm registers saved below
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm9 are overwritten with constants below and restored before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu ymm4, YMMWORD PTR L_mlkem_decompress_11_avx2_q ; ymm4 = 3329 in every word
+ vmovdqu ymm5, YMMWORD PTR L_mlkem_decompress_11_avx2_shuf ; ymm5 = byte-shuffle control
+ vmovdqu ymm6, YMMWORD PTR L_mlkem_decompress_11_avx2_sllv ; ymm6 = per-dword right-shift counts
+ vmovdqu ymm7, YMMWORD PTR L_mlkem_decompress_11_avx2_srlv ; ymm7 = per-qword right-shift counts
+ vmovdqu ymm8, YMMWORD PTR L_mlkem_decompress_11_avx2_shift ; ymm8 = per-word multipliers
+ vmovdqu ymm9, YMMWORD PTR L_mlkem_decompress_11_avx2_mask ; ymm9 = 7ff0h in every word
+L_mlkem_decompress_11_avx2_start: ; one pass = four unrolled groups of 4 registers x 16 words
+ vpermq ymm0, [rdx], 148 ; imm 148 picks qwords 0,1,1,2: low lane = loaded bytes 0-15, high lane = loaded bytes 8-23
+ vpermq ymm1, [rdx+22], 148 ; input advances 22 bytes per register (16 fields x 11 bits)
+ vpermq ymm2, [rdx+44], 148
+ vpermq ymm3, [rdx+66], 148
+ vpshufb ymm0, ymm0, ymm5 ; give every output word the pair of input bytes selected by the shuf table
+ vpshufb ymm1, ymm1, ymm5
+ vpshufb ymm2, ymm2, ymm5
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm0, ymm0, ymm6 ; right shift each dword by 0 or 1
+ vpsrlvd ymm1, ymm1, ymm6
+ vpsrlvd ymm2, ymm2, ymm6
+ vpsrlvd ymm3, ymm3, ymm6
+ vpsrlvq ymm0, ymm0, ymm7 ; right shift each qword by 0 or 2
+ vpsrlvq ymm1, ymm1, ymm7
+ vpsrlvq ymm2, ymm2, ymm7
+ vpsrlvq ymm3, ymm3, ymm7
+ vpmullw ymm0, ymm0, ymm8 ; per-word left shift via multiply by 1, 4, 8 or 32
+ vpmullw ymm1, ymm1, ymm8
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsrlw ymm0, ymm0, 1 ; shift every word right by one before masking
+ vpsrlw ymm1, ymm1, 1
+ vpsrlw ymm2, ymm2, 1
+ vpsrlw ymm3, ymm3, 1
+ vpand ymm0, ymm0, ymm9 ; keep only bits 4-14, so each word is (11-bit field) << 4
+ vpand ymm1, ymm1, ymm9
+ vpand ymm2, ymm2, ymm9
+ vpand ymm3, ymm3, ymm9
+ vpmulhrsw ymm0, ymm0, ymm4 ; (w * 3329 + 4000h) >> 15; with w = x << 4 this equals (x * 3329 + 1024) >> 11
+ vpmulhrsw ymm1, ymm1, ymm4
+ vpmulhrsw ymm2, ymm2, ymm4
+ vpmulhrsw ymm3, ymm3, ymm4
+ vmovdqu YMMWORD PTR [rcx], ymm0 ; store 4 x 16 result words
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vpermq ymm0, [rdx+88], 148 ; second group: same sequence on input bytes 88-175
+ vpermq ymm1, [rdx+110], 148
+ vpermq ymm2, [rdx+132], 148
+ vpermq ymm3, [rdx+154], 148
+ vpshufb ymm0, ymm0, ymm5
+ vpshufb ymm1, ymm1, ymm5
+ vpshufb ymm2, ymm2, ymm5
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm0, ymm0, ymm6
+ vpsrlvd ymm1, ymm1, ymm6
+ vpsrlvd ymm2, ymm2, ymm6
+ vpsrlvd ymm3, ymm3, ymm6
+ vpsrlvq ymm0, ymm0, ymm7
+ vpsrlvq ymm1, ymm1, ymm7
+ vpsrlvq ymm2, ymm2, ymm7
+ vpsrlvq ymm3, ymm3, ymm7
+ vpmullw ymm0, ymm0, ymm8
+ vpmullw ymm1, ymm1, ymm8
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsrlw ymm0, ymm0, 1
+ vpsrlw ymm1, ymm1, 1
+ vpsrlw ymm2, ymm2, 1
+ vpsrlw ymm3, ymm3, 1
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpand ymm2, ymm2, ymm9
+ vpand ymm3, ymm3, ymm9
+ vpmulhrsw ymm0, ymm0, ymm4
+ vpmulhrsw ymm1, ymm1, ymm4
+ vpmulhrsw ymm2, ymm2, ymm4
+ vpmulhrsw ymm3, ymm3, ymm4
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+160], ymm1
+ vmovdqu YMMWORD PTR [rcx+192], ymm2
+ vmovdqu YMMWORD PTR [rcx+224], ymm3
+ vpermq ymm0, [rdx+176], 148 ; third group: input bytes 176-263
+ vpermq ymm1, [rdx+198], 148
+ vpermq ymm2, [rdx+220], 148
+ vpermq ymm3, [rdx+242], 148
+ vpshufb ymm0, ymm0, ymm5
+ vpshufb ymm1, ymm1, ymm5
+ vpshufb ymm2, ymm2, ymm5
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm0, ymm0, ymm6
+ vpsrlvd ymm1, ymm1, ymm6
+ vpsrlvd ymm2, ymm2, ymm6
+ vpsrlvd ymm3, ymm3, ymm6
+ vpsrlvq ymm0, ymm0, ymm7
+ vpsrlvq ymm1, ymm1, ymm7
+ vpsrlvq ymm2, ymm2, ymm7
+ vpsrlvq ymm3, ymm3, ymm7
+ vpmullw ymm0, ymm0, ymm8
+ vpmullw ymm1, ymm1, ymm8
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsrlw ymm0, ymm0, 1
+ vpsrlw ymm1, ymm1, 1
+ vpsrlw ymm2, ymm2, 1
+ vpsrlw ymm3, ymm3, 1
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpand ymm2, ymm2, ymm9
+ vpand ymm3, ymm3, ymm9
+ vpmulhrsw ymm0, ymm0, ymm4
+ vpmulhrsw ymm1, ymm1, ymm4
+ vpmulhrsw ymm2, ymm2, ymm4
+ vpmulhrsw ymm3, ymm3, ymm4
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vpermq ymm0, [rdx+264], 148 ; fourth group: input bytes 264-351
+ vpermq ymm1, [rdx+286], 148
+ vpermq ymm2, [rdx+308], 148
+ vpermq ymm3, [rdx+330], 148 ; NOTE(review): this 32-byte load spans bytes 330-361, 10 bytes past the 352 consumed per pass - confirm the input stays readable there on the final pass
+ vpshufb ymm0, ymm0, ymm5
+ vpshufb ymm1, ymm1, ymm5
+ vpshufb ymm2, ymm2, ymm5
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm0, ymm0, ymm6
+ vpsrlvd ymm1, ymm1, ymm6
+ vpsrlvd ymm2, ymm2, ymm6
+ vpsrlvd ymm3, ymm3, ymm6
+ vpsrlvq ymm0, ymm0, ymm7
+ vpsrlvq ymm1, ymm1, ymm7
+ vpsrlvq ymm2, ymm2, ymm7
+ vpsrlvq ymm3, ymm3, ymm7
+ vpmullw ymm0, ymm0, ymm8
+ vpmullw ymm1, ymm1, ymm8
+ vpmullw ymm2, ymm2, ymm8
+ vpmullw ymm3, ymm3, ymm8
+ vpsrlw ymm0, ymm0, 1
+ vpsrlw ymm1, ymm1, 1
+ vpsrlw ymm2, ymm2, 1
+ vpsrlw ymm3, ymm3, 1
+ vpand ymm0, ymm0, ymm9
+ vpand ymm1, ymm1, ymm9
+ vpand ymm2, ymm2, ymm9
+ vpand ymm3, ymm3, ymm9
+ vpmulhrsw ymm0, ymm0, ymm4
+ vpmulhrsw ymm1, ymm1, ymm4
+ vpmulhrsw ymm2, ymm2, ymm4
+ vpmulhrsw ymm3, ymm3, ymm4
+ vmovdqu YMMWORD PTR [rcx+384], ymm0
+ vmovdqu YMMWORD PTR [rcx+416], ymm1
+ vmovdqu YMMWORD PTR [rcx+448], ymm2
+ vmovdqu YMMWORD PTR [rcx+480], ymm3
+ add rdx, 352 ; advance input by the 352 bytes consumed (256 fields x 11 bits)
+ add rcx, 512 ; advance output by the 256 words written
+ sub r8d, 1 ; count down; the body always runs at least once
+ jg L_mlkem_decompress_11_avx2_start ; repeat while the remaining count is positive
+ vzeroupper ; clear upper ymm halves before the xmm restores and return
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the registers saved on entry
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ add rsp, 64
+ ret
+mlkem_decompress_11_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; constant tables for mlkem_compress_4_avx2; each table is 32 bytes and is loaded whole into a ymm register
+ALIGN 16
+L_mlkem_compress_4_avx2_mask WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh ; keeps the low 4 bits of every word
+ WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh
+ptr_L_mlkem_compress_4_avx2_mask QWORD L_mlkem_compress_4_avx2_mask ; address of the table above (not referenced by the procedure below)
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_4_avx2_shift WORD 0200h, 0200h, 0200h, 0200h, 0200h, 0200h, 0200h, 0200h ; vpmulhrsw multiplier 200h: yields (w + 32) >> 6
+ WORD 0200h, 0200h, 0200h, 0200h, 0200h, 0200h, 0200h, 0200h
+ptr_L_mlkem_compress_4_avx2_shift QWORD L_mlkem_compress_4_avx2_shift
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_4_avx2_perm DWORD 00000000h, 00000004h, 00000001h, 00000005h ; vpermd indices: interleave the dwords of the low and high lanes
+ DWORD 00000002h, 00000006h, 00000003h, 00000007h
+ptr_L_mlkem_compress_4_avx2_perm QWORD L_mlkem_compress_4_avx2_perm
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_4_avx2_v WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh ; 4ebfh (20159): vpmulhw multiplier applied to every input word
+ WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh
+ptr_L_mlkem_compress_4_avx2_v QWORD L_mlkem_compress_4_avx2_v
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_4_avx2_shift12 WORD 1001h, 1001h, 1001h, 1001h, 1001h, 1001h, 1001h, 1001h ; byte pair (01h, 10h) for vpmaddubsw: b0 + 16 * b1
+ WORD 1001h, 1001h, 1001h, 1001h, 1001h, 1001h, 1001h, 1001h
+ptr_L_mlkem_compress_4_avx2_shift12 QWORD L_mlkem_compress_4_avx2_shift12
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_compress_4_avx2 PROC ; packs 256 16-bit words into 4-bit values, two per byte: rcx = output (128 bytes written), rdx = input (512 bytes read); straight-line, no loop
+ sub rsp, 112 ; room for the seven xmm registers saved below
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm12 are overwritten below and restored before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu ymm8, YMMWORD PTR L_mlkem_compress_4_avx2_mask ; ymm8 = 000fh in every word
+ vmovdqu ymm9, YMMWORD PTR L_mlkem_compress_4_avx2_shift ; ymm9 = 0200h in every word
+ vmovdqu ymm10, YMMWORD PTR L_mlkem_compress_4_avx2_perm ; ymm10 = dword permutation indices
+ vmovdqu ymm11, YMMWORD PTR L_mlkem_compress_4_avx2_v ; ymm11 = 20159 in every word
+ vmovdqu ymm12, YMMWORD PTR L_mlkem_compress_4_avx2_shift12 ; ymm12 = bytes 01h, 10h repeated
+ vpmulhw ymm0, ymm11, [rdx] ; high 16 bits of the signed product 20159 * input word
+ vpmulhw ymm1, ymm11, [rdx+32]
+ vpmulhw ymm2, ymm11, [rdx+64]
+ vpmulhw ymm3, ymm11, [rdx+96]
+ vpmulhrsw ymm0, ymm0, ymm9 ; (w * 200h + 4000h) >> 15, i.e. (w + 32) >> 6
+ vpmulhrsw ymm1, ymm1, ymm9
+ vpmulhrsw ymm2, ymm2, ymm9
+ vpmulhrsw ymm3, ymm3, ymm9
+ vpand ymm0, ymm0, ymm8 ; reduce each word to its low 4 bits
+ vpand ymm1, ymm1, ymm8
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpackuswb ymm0, ymm0, ymm1 ; words -> bytes; packing is done per 128-bit lane
+ vpackuswb ymm2, ymm2, ymm3
+ vpmaddubsw ymm0, ymm0, ymm12 ; merge adjacent bytes: b0 + 16 * b1 in each word
+ vpmaddubsw ymm2, ymm2, ymm12
+ vpackuswb ymm0, ymm0, ymm2 ; words -> bytes again, still lane-interleaved
+ vpmulhw ymm4, ymm11, [rdx+128] ; same sequence for the next 64 input words, into ymm4
+ vpmulhw ymm5, ymm11, [rdx+160]
+ vpmulhw ymm6, ymm11, [rdx+192]
+ vpmulhw ymm7, ymm11, [rdx+224]
+ vpmulhrsw ymm4, ymm4, ymm9
+ vpmulhrsw ymm5, ymm5, ymm9
+ vpmulhrsw ymm6, ymm6, ymm9
+ vpmulhrsw ymm7, ymm7, ymm9
+ vpand ymm4, ymm4, ymm8
+ vpand ymm5, ymm5, ymm8
+ vpand ymm6, ymm6, ymm8
+ vpand ymm7, ymm7, ymm8
+ vpackuswb ymm4, ymm4, ymm5
+ vpackuswb ymm6, ymm6, ymm7
+ vpmaddubsw ymm4, ymm4, ymm12
+ vpmaddubsw ymm6, ymm6, ymm12
+ vpackuswb ymm4, ymm4, ymm6
+ vpermd ymm0, ymm10, ymm0 ; reorder dwords with the perm table, interleaving the two lanes
+ vpermd ymm4, ymm10, ymm4
+ vmovdqu YMMWORD PTR [rcx], ymm0 ; 32 output bytes per register
+ vmovdqu YMMWORD PTR [rcx+32], ymm4
+ vpmulhw ymm0, ymm11, [rdx+256] ; second half: input bytes 256-511, same sequence
+ vpmulhw ymm1, ymm11, [rdx+288]
+ vpmulhw ymm2, ymm11, [rdx+320]
+ vpmulhw ymm3, ymm11, [rdx+352]
+ vpmulhrsw ymm0, ymm0, ymm9
+ vpmulhrsw ymm1, ymm1, ymm9
+ vpmulhrsw ymm2, ymm2, ymm9
+ vpmulhrsw ymm3, ymm3, ymm9
+ vpand ymm0, ymm0, ymm8
+ vpand ymm1, ymm1, ymm8
+ vpand ymm2, ymm2, ymm8
+ vpand ymm3, ymm3, ymm8
+ vpackuswb ymm0, ymm0, ymm1
+ vpackuswb ymm2, ymm2, ymm3
+ vpmaddubsw ymm0, ymm0, ymm12
+ vpmaddubsw ymm2, ymm2, ymm12
+ vpackuswb ymm0, ymm0, ymm2
+ vpmulhw ymm4, ymm11, [rdx+384]
+ vpmulhw ymm5, ymm11, [rdx+416]
+ vpmulhw ymm6, ymm11, [rdx+448]
+ vpmulhw ymm7, ymm11, [rdx+480]
+ vpmulhrsw ymm4, ymm4, ymm9
+ vpmulhrsw ymm5, ymm5, ymm9
+ vpmulhrsw ymm6, ymm6, ymm9
+ vpmulhrsw ymm7, ymm7, ymm9
+ vpand ymm4, ymm4, ymm8
+ vpand ymm5, ymm5, ymm8
+ vpand ymm6, ymm6, ymm8
+ vpand ymm7, ymm7, ymm8
+ vpackuswb ymm4, ymm4, ymm5
+ vpackuswb ymm6, ymm6, ymm7
+ vpmaddubsw ymm4, ymm4, ymm12
+ vpmaddubsw ymm6, ymm6, ymm12
+ vpackuswb ymm4, ymm4, ymm6
+ vpermd ymm0, ymm10, ymm0
+ vpermd ymm4, ymm10, ymm4
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vmovdqu YMMWORD PTR [rcx+96], ymm4
+ vzeroupper ; clear upper ymm halves before the xmm restores and return
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the registers saved on entry
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ add rsp, 112
+ ret
+mlkem_compress_4_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; constant tables for mlkem_decompress_4_avx2; each table is 32 bytes and is loaded whole into a ymm register
+ALIGN 16
+L_mlkem_decompress_4_avx2_mask DWORD 00f0000fh, 00f0000fh, 00f0000fh, 00f0000fh ; word pair (000fh, 00f0h): low nibble in even words, high nibble in odd words
+ DWORD 00f0000fh, 00f0000fh, 00f0000fh, 00f0000fh
+ptr_L_mlkem_decompress_4_avx2_mask QWORD L_mlkem_decompress_4_avx2_mask ; address of the table above (not referenced by the procedure below)
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_4_avx2_shift DWORD 00800800h, 00800800h, 00800800h, 00800800h ; word pair (0800h, 0080h): vpmullw multipliers that move both nibbles to bits 11-14
+ DWORD 00800800h, 00800800h, 00800800h, 00800800h
+ptr_L_mlkem_decompress_4_avx2_shift QWORD L_mlkem_decompress_4_avx2_shift
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_4_avx2_q WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h ; 16 words of 0d01h (3329): multiplier for the final vpmulhrsw
+ WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h
+ptr_L_mlkem_decompress_4_avx2_q QWORD L_mlkem_decompress_4_avx2_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_4_avx2_shuf BYTE 00h, 00h, 00h, 00h, 01h, 01h, 01h, 01h ; vpshufb control: replicate each of source bytes 0-7 four times (one dword per input byte)
+ BYTE 02h, 02h, 02h, 02h, 03h, 03h, 03h, 03h
+ BYTE 04h, 04h, 04h, 04h, 05h, 05h, 05h, 05h
+ BYTE 06h, 06h, 06h, 06h, 07h, 07h, 07h, 07h
+ptr_L_mlkem_decompress_4_avx2_shuf QWORD L_mlkem_decompress_4_avx2_shuf
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_decompress_4_avx2 PROC ; expands packed 4-bit values into 16-bit words: rcx = output (512 bytes written), rdx = input (128 bytes read); straight-line, no loop
+ sub rsp, 32 ; room for the two xmm registers saved below
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6/xmm7 are overwritten with constants below and restored before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu ymm4, YMMWORD PTR L_mlkem_decompress_4_avx2_mask ; ymm4 = nibble masks
+ vmovdqu ymm5, YMMWORD PTR L_mlkem_decompress_4_avx2_shift ; ymm5 = per-word multipliers
+ vmovdqu ymm6, YMMWORD PTR L_mlkem_decompress_4_avx2_shuf ; ymm6 = byte-replication shuffle control
+ vmovdqu ymm7, YMMWORD PTR L_mlkem_decompress_4_avx2_q ; ymm7 = 3329 in every word
+ vpbroadcastq ymm0, QWORD PTR [rdx] ; 8 input bytes copied to every qword, so both lanes can shuffle from them
+ vpbroadcastq ymm1, QWORD PTR [rdx+8]
+ vpbroadcastq ymm2, QWORD PTR [rdx+16]
+ vpbroadcastq ymm3, QWORD PTR [rdx+24]
+ vpshufb ymm0, ymm0, ymm6 ; each input byte now fills one dword (two words)
+ vpshufb ymm1, ymm1, ymm6
+ vpshufb ymm2, ymm2, ymm6
+ vpshufb ymm3, ymm3, ymm6
+ vpand ymm0, ymm0, ymm4 ; even word keeps the low nibble, odd word keeps the high nibble (still at bits 4-7)
+ vpand ymm1, ymm1, ymm4
+ vpand ymm2, ymm2, ymm4
+ vpand ymm3, ymm3, ymm4
+ vpmullw ymm0, ymm0, ymm5 ; x800h / x80h: both nibbles end up as n << 11
+ vpmullw ymm1, ymm1, ymm5
+ vpmullw ymm2, ymm2, ymm5
+ vpmullw ymm3, ymm3, ymm5
+ vpmulhrsw ymm0, ymm0, ymm7 ; (w * 3329 + 4000h) >> 15; with w = n << 11 this equals (n * 3329 + 8) >> 4
+ vpmulhrsw ymm1, ymm1, ymm7
+ vpmulhrsw ymm2, ymm2, ymm7
+ vpmulhrsw ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx], ymm0 ; 16 result words per register
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vpbroadcastq ymm0, QWORD PTR [rdx+32] ; second group: input bytes 32-63, same sequence
+ vpbroadcastq ymm1, QWORD PTR [rdx+40]
+ vpbroadcastq ymm2, QWORD PTR [rdx+48]
+ vpbroadcastq ymm3, QWORD PTR [rdx+56]
+ vpshufb ymm0, ymm0, ymm6
+ vpshufb ymm1, ymm1, ymm6
+ vpshufb ymm2, ymm2, ymm6
+ vpshufb ymm3, ymm3, ymm6
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpand ymm2, ymm2, ymm4
+ vpand ymm3, ymm3, ymm4
+ vpmullw ymm0, ymm0, ymm5
+ vpmullw ymm1, ymm1, ymm5
+ vpmullw ymm2, ymm2, ymm5
+ vpmullw ymm3, ymm3, ymm5
+ vpmulhrsw ymm0, ymm0, ymm7
+ vpmulhrsw ymm1, ymm1, ymm7
+ vpmulhrsw ymm2, ymm2, ymm7
+ vpmulhrsw ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vmovdqu YMMWORD PTR [rcx+160], ymm1
+ vmovdqu YMMWORD PTR [rcx+192], ymm2
+ vmovdqu YMMWORD PTR [rcx+224], ymm3
+ vpbroadcastq ymm0, QWORD PTR [rdx+64] ; third group: input bytes 64-95
+ vpbroadcastq ymm1, QWORD PTR [rdx+72]
+ vpbroadcastq ymm2, QWORD PTR [rdx+80]
+ vpbroadcastq ymm3, QWORD PTR [rdx+88]
+ vpshufb ymm0, ymm0, ymm6
+ vpshufb ymm1, ymm1, ymm6
+ vpshufb ymm2, ymm2, ymm6
+ vpshufb ymm3, ymm3, ymm6
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpand ymm2, ymm2, ymm4
+ vpand ymm3, ymm3, ymm4
+ vpmullw ymm0, ymm0, ymm5
+ vpmullw ymm1, ymm1, ymm5
+ vpmullw ymm2, ymm2, ymm5
+ vpmullw ymm3, ymm3, ymm5
+ vpmulhrsw ymm0, ymm0, ymm7
+ vpmulhrsw ymm1, ymm1, ymm7
+ vpmulhrsw ymm2, ymm2, ymm7
+ vpmulhrsw ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vpbroadcastq ymm0, QWORD PTR [rdx+96] ; fourth group: input bytes 96-127
+ vpbroadcastq ymm1, QWORD PTR [rdx+104]
+ vpbroadcastq ymm2, QWORD PTR [rdx+112]
+ vpbroadcastq ymm3, QWORD PTR [rdx+120]
+ vpshufb ymm0, ymm0, ymm6
+ vpshufb ymm1, ymm1, ymm6
+ vpshufb ymm2, ymm2, ymm6
+ vpshufb ymm3, ymm3, ymm6
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpand ymm2, ymm2, ymm4
+ vpand ymm3, ymm3, ymm4
+ vpmullw ymm0, ymm0, ymm5
+ vpmullw ymm1, ymm1, ymm5
+ vpmullw ymm2, ymm2, ymm5
+ vpmullw ymm3, ymm3, ymm5
+ vpmulhrsw ymm0, ymm0, ymm7
+ vpmulhrsw ymm1, ymm1, ymm7
+ vpmulhrsw ymm2, ymm2, ymm7
+ vpmulhrsw ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rcx+384], ymm0
+ vmovdqu YMMWORD PTR [rcx+416], ymm1
+ vmovdqu YMMWORD PTR [rcx+448], ymm2
+ vmovdqu YMMWORD PTR [rcx+480], ymm3
+ vzeroupper ; clear upper ymm halves before the xmm restores and return
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the registers saved on entry
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ add rsp, 32
+ ret
+mlkem_decompress_4_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; constant tables for mlkem_compress_5_avx2; each table is 32 bytes and is loaded whole into a ymm register
+ALIGN 16
+L_mlkem_compress_5_avx2_v WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh ; 4ebfh (20159): vpmulhw multiplier applied to every input word
+ WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh
+ptr_L_mlkem_compress_5_avx2_v QWORD L_mlkem_compress_5_avx2_v ; address of the table above (not referenced by the procedure below)
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_shift WORD 0400h, 0400h, 0400h, 0400h, 0400h, 0400h, 0400h, 0400h ; vpmulhrsw multiplier 400h: yields (w + 16) >> 5
+ WORD 0400h, 0400h, 0400h, 0400h, 0400h, 0400h, 0400h, 0400h
+ptr_L_mlkem_compress_5_avx2_shift QWORD L_mlkem_compress_5_avx2_shift
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_mask WORD 001fh, 001fh, 001fh, 001fh, 001fh, 001fh, 001fh, 001fh ; keeps the low 5 bits of every word
+ WORD 001fh, 001fh, 001fh, 001fh, 001fh, 001fh, 001fh, 001fh
+ptr_L_mlkem_compress_5_avx2_mask QWORD L_mlkem_compress_5_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_shift1 WORD 2001h, 2001h, 2001h, 2001h, 2001h, 2001h, 2001h, 2001h ; byte pair (01h, 20h) for vpmaddubsw: b0 + 32 * b1 (10 bits)
+ WORD 2001h, 2001h, 2001h, 2001h, 2001h, 2001h, 2001h, 2001h
+ptr_L_mlkem_compress_5_avx2_shift1 QWORD L_mlkem_compress_5_avx2_shift1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_shift2 DWORD 04000001h, 04000001h, 04000001h, 04000001h ; word pair (0001h, 0400h) for vpmaddwd: w0 + 1024 * w1 (20 bits)
+ DWORD 04000001h, 04000001h, 04000001h, 04000001h
+ptr_L_mlkem_compress_5_avx2_shift2 QWORD L_mlkem_compress_5_avx2_shift2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_shlv QWORD 000000000000000ch, 000000000000000ch ; read as dwords (12, 0) by vpsllvd and as qwords (12) by vpsrlvq
+ QWORD 000000000000000ch, 000000000000000ch
+ptr_L_mlkem_compress_5_avx2_shlv QWORD L_mlkem_compress_5_avx2_shlv
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_shuffle BYTE 00h, 01h, 02h, 03h, 04h, 0ffh, 0ffh, 0ffh ; vpshufb control (0ffh zeroes the byte); its low 16 bytes double as the vpblendvb selector
+ BYTE 0ffh, 0ffh, 08h, 09h, 0ah, 0bh, 0ch, 0ffh
+ BYTE 09h, 0ah, 0bh, 0ch, 0ffh, 00h, 01h, 02h ; high lane: bytes placed where the low-lane control has 0ffh, plus 4 spill bytes at positions 0-3
+ BYTE 03h, 04h, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 08h
+ptr_L_mlkem_compress_5_avx2_shuffle QWORD L_mlkem_compress_5_avx2_shuffle
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_compress_5_avx2 PROC ; packs 256 16-bit words into 5-bit values: rcx = output (160 bytes written), rdx = input (512 bytes read); straight-line, 8 groups of 32 words -> 20 bytes
+ sub rsp, 48 ; room for the three xmm registers saved below
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm8 are overwritten with constants below and restored before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu ymm0, YMMWORD PTR [rdx] ; redundant: ymm0 is overwritten by the first vpmulhw below before it is read
+ vmovdqu ymm2, YMMWORD PTR L_mlkem_compress_5_avx2_v ; ymm2 = 20159 in every word
+ vmovdqu ymm3, YMMWORD PTR L_mlkem_compress_5_avx2_shift ; ymm3 = 0400h in every word
+ vmovdqu ymm4, YMMWORD PTR L_mlkem_compress_5_avx2_mask ; ymm4 = 001fh in every word
+ vmovdqu ymm5, YMMWORD PTR L_mlkem_compress_5_avx2_shift1 ; ymm5 = bytes 01h, 20h repeated
+ vmovdqu ymm6, YMMWORD PTR L_mlkem_compress_5_avx2_shift2 ; ymm6 = words 0001h, 0400h repeated
+ vmovdqu ymm7, YMMWORD PTR L_mlkem_compress_5_avx2_shlv ; ymm7 = shift counts (12)
+ vmovdqu ymm8, YMMWORD PTR L_mlkem_compress_5_avx2_shuffle ; ymm8 = byte shuffle / blend control
+ vpmulhw ymm0, ymm2, [rdx] ; high 16 bits of the signed product 20159 * input word
+ vpmulhw ymm1, ymm2, [rdx+32]
+ vpmulhrsw ymm0, ymm0, ymm3 ; (w * 400h + 4000h) >> 15, i.e. (w + 16) >> 5
+ vpmulhrsw ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm4 ; reduce each word to its low 5 bits
+ vpand ymm1, ymm1, ymm4
+ vpackuswb ymm0, ymm0, ymm1 ; words -> bytes; packing is done per 128-bit lane
+ vpmaddubsw ymm0, ymm0, ymm5 ; merge byte pairs into 10-bit words: b0 + 32 * b1
+ vpmaddwd ymm0, ymm0, ymm6 ; merge word pairs into 20-bit dwords: w0 + 1024 * w1
+ vpsllvd ymm0, ymm0, ymm7 ; even dwords << 12, odd dwords unchanged
+ vpsrlvq ymm0, ymm0, ymm7 ; qwords >> 12: each qword now holds 40 contiguous bits (5 bytes)
+ vpshufb ymm0, ymm0, ymm8 ; position the 5-byte groups of each lane, zeroing the gaps
+ vextracti128 xmm1, ymm0, 1 ; xmm1 = high lane
+ vpblendvb xmm0, xmm0, xmm1, xmm8 ; take high-lane bytes wherever the low-lane control byte is 0ffh
+ vmovdqu OWORD PTR [rcx], xmm0 ; first 16 of the group's 20 output bytes
+ vmovss DWORD PTR [rcx+16], xmm1 ; last 4 bytes; VEX form (was legacy movss) so no legacy-SSE op runs while upper ymm state is live
+ vpmulhw ymm0, ymm2, [rdx+64] ; group 2: input bytes 64-127 -> output bytes 20-39
+ vpmulhw ymm1, ymm2, [rdx+96]
+ vpmulhrsw ymm0, ymm0, ymm3
+ vpmulhrsw ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpackuswb ymm0, ymm0, ymm1
+ vpmaddubsw ymm0, ymm0, ymm5
+ vpmaddwd ymm0, ymm0, ymm6
+ vpsllvd ymm0, ymm0, ymm7
+ vpsrlvq ymm0, ymm0, ymm7
+ vpshufb ymm0, ymm0, ymm8
+ vextracti128 xmm1, ymm0, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm8
+ vmovdqu OWORD PTR [rcx+20], xmm0
+ vmovss DWORD PTR [rcx+36], xmm1
+ vpmulhw ymm0, ymm2, [rdx+128] ; group 3: input bytes 128-191 -> output bytes 40-59
+ vpmulhw ymm1, ymm2, [rdx+160]
+ vpmulhrsw ymm0, ymm0, ymm3
+ vpmulhrsw ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpackuswb ymm0, ymm0, ymm1
+ vpmaddubsw ymm0, ymm0, ymm5
+ vpmaddwd ymm0, ymm0, ymm6
+ vpsllvd ymm0, ymm0, ymm7
+ vpsrlvq ymm0, ymm0, ymm7
+ vpshufb ymm0, ymm0, ymm8
+ vextracti128 xmm1, ymm0, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm8
+ vmovdqu OWORD PTR [rcx+40], xmm0
+ vmovss DWORD PTR [rcx+56], xmm1
+ vpmulhw ymm0, ymm2, [rdx+192] ; group 4: input bytes 192-255 -> output bytes 60-79
+ vpmulhw ymm1, ymm2, [rdx+224]
+ vpmulhrsw ymm0, ymm0, ymm3
+ vpmulhrsw ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpackuswb ymm0, ymm0, ymm1
+ vpmaddubsw ymm0, ymm0, ymm5
+ vpmaddwd ymm0, ymm0, ymm6
+ vpsllvd ymm0, ymm0, ymm7
+ vpsrlvq ymm0, ymm0, ymm7
+ vpshufb ymm0, ymm0, ymm8
+ vextracti128 xmm1, ymm0, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm8
+ vmovdqu OWORD PTR [rcx+60], xmm0
+ vmovss DWORD PTR [rcx+76], xmm1
+ vpmulhw ymm0, ymm2, [rdx+256] ; group 5: input bytes 256-319 -> output bytes 80-99
+ vpmulhw ymm1, ymm2, [rdx+288]
+ vpmulhrsw ymm0, ymm0, ymm3
+ vpmulhrsw ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpackuswb ymm0, ymm0, ymm1
+ vpmaddubsw ymm0, ymm0, ymm5
+ vpmaddwd ymm0, ymm0, ymm6
+ vpsllvd ymm0, ymm0, ymm7
+ vpsrlvq ymm0, ymm0, ymm7
+ vpshufb ymm0, ymm0, ymm8
+ vextracti128 xmm1, ymm0, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm8
+ vmovdqu OWORD PTR [rcx+80], xmm0
+ vmovss DWORD PTR [rcx+96], xmm1
+ vpmulhw ymm0, ymm2, [rdx+320] ; group 6: input bytes 320-383 -> output bytes 100-119
+ vpmulhw ymm1, ymm2, [rdx+352]
+ vpmulhrsw ymm0, ymm0, ymm3
+ vpmulhrsw ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpackuswb ymm0, ymm0, ymm1
+ vpmaddubsw ymm0, ymm0, ymm5
+ vpmaddwd ymm0, ymm0, ymm6
+ vpsllvd ymm0, ymm0, ymm7
+ vpsrlvq ymm0, ymm0, ymm7
+ vpshufb ymm0, ymm0, ymm8
+ vextracti128 xmm1, ymm0, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm8
+ vmovdqu OWORD PTR [rcx+100], xmm0
+ vmovss DWORD PTR [rcx+116], xmm1
+ vpmulhw ymm0, ymm2, [rdx+384] ; group 7: input bytes 384-447 -> output bytes 120-139
+ vpmulhw ymm1, ymm2, [rdx+416]
+ vpmulhrsw ymm0, ymm0, ymm3
+ vpmulhrsw ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpackuswb ymm0, ymm0, ymm1
+ vpmaddubsw ymm0, ymm0, ymm5
+ vpmaddwd ymm0, ymm0, ymm6
+ vpsllvd ymm0, ymm0, ymm7
+ vpsrlvq ymm0, ymm0, ymm7
+ vpshufb ymm0, ymm0, ymm8
+ vextracti128 xmm1, ymm0, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm8
+ vmovdqu OWORD PTR [rcx+120], xmm0
+ vmovss DWORD PTR [rcx+136], xmm1
+ vpmulhw ymm0, ymm2, [rdx+448] ; group 8: input bytes 448-511 -> output bytes 140-159
+ vpmulhw ymm1, ymm2, [rdx+480]
+ vpmulhrsw ymm0, ymm0, ymm3
+ vpmulhrsw ymm1, ymm1, ymm3
+ vpand ymm0, ymm0, ymm4
+ vpand ymm1, ymm1, ymm4
+ vpackuswb ymm0, ymm0, ymm1
+ vpmaddubsw ymm0, ymm0, ymm5
+ vpmaddwd ymm0, ymm0, ymm6
+ vpsllvd ymm0, ymm0, ymm7
+ vpsrlvq ymm0, ymm0, ymm7
+ vpshufb ymm0, ymm0, ymm8
+ vextracti128 xmm1, ymm0, 1
+ vpblendvb xmm0, xmm0, xmm1, xmm8
+ vmovdqu OWORD PTR [rcx+140], xmm0
+ vmovss DWORD PTR [rcx+156], xmm1
+ vzeroupper ; clear upper ymm halves before the xmm restores and return
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the registers saved on entry
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ add rsp, 48
+ ret
+mlkem_compress_5_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; constant tables for mlkem_decompress_5_avx2; each table is 32 bytes and is loaded whole into a ymm register
+ALIGN 16
+L_mlkem_decompress_5_avx2_q WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h ; 16 words of 0d01h (3329): multiplier for the final vpmulhrsw
+ WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h
+ptr_L_mlkem_decompress_5_avx2_q QWORD L_mlkem_decompress_5_avx2_q ; address of the table above (not referenced by the procedure below)
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_5_avx2_shuf BYTE 00h, 00h, 00h, 01h, 01h, 01h, 01h, 02h ; vpshufb control: source byte indices, two per output word
+ BYTE 02h, 03h, 03h, 03h, 03h, 04h, 04h, 04h
+ BYTE 05h, 05h, 05h, 06h, 06h, 06h, 06h, 07h ; high lane draws from source bytes 5-9
+ BYTE 07h, 08h, 08h, 08h, 08h, 09h, 09h, 09h
+ptr_L_mlkem_decompress_5_avx2_shuf QWORD L_mlkem_decompress_5_avx2_shuf
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_5_avx2_mask WORD 001fh, 03e0h, 007ch, 0f80h, 01f0h, 003eh, 07c0h, 00f8h ; per-word 5-bit field masks, each at a different bit offset
+ WORD 001fh, 03e0h, 007ch, 0f80h, 01f0h, 003eh, 07c0h, 00f8h
+ptr_L_mlkem_decompress_5_avx2_mask QWORD L_mlkem_decompress_5_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_5_avx2_shift WORD 0400h, 0020h, 0100h, 0008h, 0040h, 0200h, 0010h, 0080h ; per-word multipliers; paired with the masks above, every field lands at bits 10-14
+ WORD 0400h, 0020h, 0100h, 0008h, 0040h, 0200h, 0010h, 0080h
+ptr_L_mlkem_decompress_5_avx2_shift QWORD L_mlkem_decompress_5_avx2_shift
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_decompress_5_avx2 PROC ; expands packed 5-bit values into 16-bit words: rcx = output (512 bytes written), rdx = input (160 bytes read); uses only ymm0-ymm4 and rax, so nothing is saved on the stack
+ vmovdqu ymm1, YMMWORD PTR L_mlkem_decompress_5_avx2_q ; ymm1 = 3329 in every word
+ vmovdqu ymm2, YMMWORD PTR L_mlkem_decompress_5_avx2_shuf ; ymm2 = byte-shuffle control
+ vmovdqu ymm3, YMMWORD PTR L_mlkem_decompress_5_avx2_mask ; ymm3 = per-word field masks
+ vmovdqu ymm4, YMMWORD PTR L_mlkem_decompress_5_avx2_shift ; ymm4 = per-word multipliers
+ vbroadcasti128 ymm0, OWORD PTR [rdx] ; 16 input bytes copied to both lanes (the shuffle uses only the first 10)
+ vpshufb ymm0, ymm0, ymm2 ; give every output word the pair of input bytes selected by the shuf table
+ vpand ymm0, ymm0, ymm3 ; isolate one 5-bit field per word
+ vpmullw ymm0, ymm0, ymm4 ; move every field to bits 10-14 (w = x << 10)
+ vpmulhrsw ymm0, ymm0, ymm1 ; (w * 3329 + 4000h) >> 15; with w = x << 10 this equals (x * 3329 + 16) >> 5
+ vmovdqu YMMWORD PTR [rcx], ymm0 ; 16 result words
+ vbroadcasti128 ymm0, OWORD PTR [rdx+10] ; input advances 10 bytes per group (16 fields x 5 bits)
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+32], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+20]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+30]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+40]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+128], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+50]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+160], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+60]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+192], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+70]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+224], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+80]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+256], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+90]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+288], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+100]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+110]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+352], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+120]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+384], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+130]
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+416], ymm0
+ vbroadcasti128 ymm0, OWORD PTR [rdx+140] ; last 16-byte load: covers input bytes 140-155
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+448], ymm0
+ vmovq xmm0, QWORD PTR [rdx+150] ; final group is assembled from exactly 10 bytes: 8 here...
+ movzx rax, WORD PTR [rdx+158] ; ...plus 2 here, so nothing past input byte 159 is read (presumably to avoid over-reading the buffer)
+ vpinsrq xmm0, xmm0, rax, 1 ; xmm0 = bytes 150-159 followed by six zero bytes
+ vinserti128 ymm0, ymm0, xmm0, 1 ; duplicate into the high lane, matching what vbroadcasti128 produced above
+ vpshufb ymm0, ymm0, ymm2
+ vpand ymm0, ymm0, ymm3
+ vpmullw ymm0, ymm0, ymm4
+ vpmulhrsw ymm0, ymm0, ymm1
+ vmovdqu YMMWORD PTR [rcx+480], ymm0
+ vzeroupper ; clear upper ymm halves before returning
+ ret
+mlkem_decompress_5_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; constant tables for mlkem_from_msg_avx2; each table is 32 bytes and is loaded whole into a ymm register
+ALIGN 16
+L_mlkem_from_msg_avx2_shift DWORD 00000003h, 00000002h, 00000001h, 00000000h ; per-dword left-shift counts for vpsllvd
+ DWORD 00000003h, 00000002h, 00000001h, 00000000h
+ptr_L_mlkem_from_msg_avx2_shift QWORD L_mlkem_from_msg_avx2_shift ; address of the table above (not referenced by the procedure below)
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_from_msg_avx2_shuf BYTE 00h, 01h, 04h, 05h, 08h, 09h, 0ch, 0dh ; vpshufb control: the low words of the four dwords first...
+ BYTE 02h, 03h, 06h, 07h, 0ah, 0bh, 0eh, 0fh ; ...then their high words
+ BYTE 00h, 01h, 04h, 05h, 08h, 09h, 0ch, 0dh
+ BYTE 02h, 03h, 06h, 07h, 0ah, 0bh, 0eh, 0fh
+ptr_L_mlkem_from_msg_avx2_shuf QWORD L_mlkem_from_msg_avx2_shuf
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_from_msg_avx2_hqs WORD 0681h, 0681h, 0681h, 0681h, 0681h, 0681h, 0681h, 0681h ; 0681h (1665): the value written for every set message bit
+ WORD 0681h, 0681h, 0681h, 0681h, 0681h, 0681h, 0681h, 0681h
+ptr_L_mlkem_from_msg_avx2_hqs QWORD L_mlkem_from_msg_avx2_hqs
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_from_msg_avx2 PROC ; expands message bits into 16-bit words holding 0 or 1665: rcx = output (512 bytes written), rdx = input (32 bytes read); straight-line, one pass per dword index 0-3
+ sub rsp, 96 ; room for the six xmm registers saved below
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm11 are overwritten below and restored before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu ymm0, YMMWORD PTR [rdx] ; ymm0 = the whole 32-byte input, kept for all four passes
+ vmovdqu ymm9, YMMWORD PTR L_mlkem_from_msg_avx2_shift ; ymm9 = per-dword shift counts 3, 2, 1, 0
+ vmovdqu ymm10, YMMWORD PTR L_mlkem_from_msg_avx2_shuf ; ymm10 = word-gathering shuffle control
+ vmovdqu ymm11, YMMWORD PTR L_mlkem_from_msg_avx2_hqs ; ymm11 = 1665 in every word
+ vpshufd ymm4, ymm0, 0 ; pass 0: replicate dword 0 of each 128-bit lane across that lane
+ vpsllvd ymm4, ymm4, ymm9 ; four copies shifted left by 3, 2, 1 and 0 bits
+ vpshufb ymm4, ymm4, ymm10 ; regroup: low words of the four copies, then high words
+ vpsllw ymm1, ymm4, 12 ; move bit 3 of each word to the sign position
+ vpsllw ymm2, ymm4, 8 ; move bit 7 to the sign position
+ vpsllw ymm3, ymm4, 4 ; move bit 11 to the sign position (ymm4 itself already has bit 15 there)
+ vpsraw ymm1, ymm1, 15 ; arithmetic shift spreads the sign bit: word becomes 0 or ffffh
+ vpsraw ymm2, ymm2, 15
+ vpsraw ymm3, ymm3, 15
+ vpsraw ymm4, ymm4, 15
+ vpand ymm1, ymm1, ymm11 ; all-ones words become 1665, zero words stay 0
+ vpand ymm2, ymm2, ymm11
+ vpand ymm3, ymm3, ymm11
+ vpand ymm4, ymm4, ymm11
+ vpunpcklqdq ymm5, ymm1, ymm2 ; interleave the low qwords of each lane
+ vpunpckhqdq ymm7, ymm1, ymm2 ; interleave the high qwords of each lane
+ vpunpcklqdq ymm6, ymm3, ymm4
+ vpunpckhqdq ymm8, ymm3, ymm4
+ vperm2i128 ymm1, ymm5, ymm6, 32 ; imm 32: low lanes of both sources
+ vperm2i128 ymm3, ymm5, ymm6, 49 ; imm 49: high lanes of both sources
+ vperm2i128 ymm2, ymm7, ymm8, 32
+ vperm2i128 ymm4, ymm7, ymm8, 49
+ vmovdqu YMMWORD PTR [rcx], ymm1 ; results derived from the input's low lane go to the first 256 output bytes
+ vmovdqu YMMWORD PTR [rcx+32], ymm2
+ vmovdqu YMMWORD PTR [rcx+256], ymm3 ; results derived from the input's high lane go to the second 256 output bytes
+ vmovdqu YMMWORD PTR [rcx+288], ymm4
+ vpshufd ymm4, ymm0, 85 ; pass 1: replicate dword 1 of each lane; same sequence as pass 0
+ vpsllvd ymm4, ymm4, ymm9
+ vpshufb ymm4, ymm4, ymm10
+ vpsllw ymm1, ymm4, 12
+ vpsllw ymm2, ymm4, 8
+ vpsllw ymm3, ymm4, 4
+ vpsraw ymm1, ymm1, 15
+ vpsraw ymm2, ymm2, 15
+ vpsraw ymm3, ymm3, 15
+ vpsraw ymm4, ymm4, 15
+ vpand ymm1, ymm1, ymm11
+ vpand ymm2, ymm2, ymm11
+ vpand ymm3, ymm3, ymm11
+ vpand ymm4, ymm4, ymm11
+ vpunpcklqdq ymm5, ymm1, ymm2
+ vpunpckhqdq ymm7, ymm1, ymm2
+ vpunpcklqdq ymm6, ymm3, ymm4
+ vpunpckhqdq ymm8, ymm3, ymm4
+ vperm2i128 ymm1, ymm5, ymm6, 32
+ vperm2i128 ymm3, ymm5, ymm6, 49
+ vperm2i128 ymm2, ymm7, ymm8, 32
+ vperm2i128 ymm4, ymm7, ymm8, 49
+ vmovdqu YMMWORD PTR [rcx+64], ymm1
+ vmovdqu YMMWORD PTR [rcx+96], ymm2
+ vmovdqu YMMWORD PTR [rcx+320], ymm3
+ vmovdqu YMMWORD PTR [rcx+352], ymm4
+ vpshufd ymm4, ymm0, 170 ; pass 2: replicate dword 2 of each lane
+ vpsllvd ymm4, ymm4, ymm9
+ vpshufb ymm4, ymm4, ymm10
+ vpsllw ymm1, ymm4, 12
+ vpsllw ymm2, ymm4, 8
+ vpsllw ymm3, ymm4, 4
+ vpsraw ymm1, ymm1, 15
+ vpsraw ymm2, ymm2, 15
+ vpsraw ymm3, ymm3, 15
+ vpsraw ymm4, ymm4, 15
+ vpand ymm1, ymm1, ymm11
+ vpand ymm2, ymm2, ymm11
+ vpand ymm3, ymm3, ymm11
+ vpand ymm4, ymm4, ymm11
+ vpunpcklqdq ymm5, ymm1, ymm2
+ vpunpckhqdq ymm7, ymm1, ymm2
+ vpunpcklqdq ymm6, ymm3, ymm4
+ vpunpckhqdq ymm8, ymm3, ymm4
+ vperm2i128 ymm1, ymm5, ymm6, 32
+ vperm2i128 ymm3, ymm5, ymm6, 49
+ vperm2i128 ymm2, ymm7, ymm8, 32
+ vperm2i128 ymm4, ymm7, ymm8, 49
+ vmovdqu YMMWORD PTR [rcx+128], ymm1
+ vmovdqu YMMWORD PTR [rcx+160], ymm2
+ vmovdqu YMMWORD PTR [rcx+384], ymm3
+ vmovdqu YMMWORD PTR [rcx+416], ymm4
+ vpshufd ymm4, ymm0, 255 ; pass 3: replicate dword 3 of each lane
+ vpsllvd ymm4, ymm4, ymm9
+ vpshufb ymm4, ymm4, ymm10
+ vpsllw ymm1, ymm4, 12
+ vpsllw ymm2, ymm4, 8
+ vpsllw ymm3, ymm4, 4
+ vpsraw ymm1, ymm1, 15
+ vpsraw ymm2, ymm2, 15
+ vpsraw ymm3, ymm3, 15
+ vpsraw ymm4, ymm4, 15
+ vpand ymm1, ymm1, ymm11
+ vpand ymm2, ymm2, ymm11
+ vpand ymm3, ymm3, ymm11
+ vpand ymm4, ymm4, ymm11
+ vpunpcklqdq ymm5, ymm1, ymm2
+ vpunpckhqdq ymm7, ymm1, ymm2
+ vpunpcklqdq ymm6, ymm3, ymm4
+ vpunpckhqdq ymm8, ymm3, ymm4
+ vperm2i128 ymm1, ymm5, ymm6, 32
+ vperm2i128 ymm3, ymm5, ymm6, 49
+ vperm2i128 ymm2, ymm7, ymm8, 32
+ vperm2i128 ymm4, ymm7, ymm8, 49
+ vmovdqu YMMWORD PTR [rcx+192], ymm1
+ vmovdqu YMMWORD PTR [rcx+224], ymm2
+ vmovdqu YMMWORD PTR [rcx+448], ymm3
+ vmovdqu YMMWORD PTR [rcx+480], ymm4
+ vzeroupper ; clear upper ymm halves before the xmm restores and return
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the registers saved on entry
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96
+ ret
+mlkem_from_msg_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_to_msg_avx2_hqs WORD 0680h, 0680h, 0680h, 0680h, 0680h, 0680h, 0680h, 0680h  ; sixteen 16-bit lanes of 0680h (1664); minuend of the vpsubw steps below
+ WORD 0680h, 0680h, 0680h, 0680h, 0680h, 0680h, 0680h, 0680h
+ptr_L_mlkem_to_msg_avx2_hqs QWORD L_mlkem_to_msg_avx2_hqs  ; address of the table; not referenced by the procedure below
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_to_msg_avx2_hhqs WORD 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h  ; sixteen lanes of 0fcc1h (-831 as a signed word); added after the sign fold
+ WORD 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h
+ptr_L_mlkem_to_msg_avx2_hhqs QWORD L_mlkem_to_msg_avx2_hhqs  ; address of the table; not referenced by the procedure below
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_to_msg_avx2 PROC  ; Reduce 256 16-bit words at [rdx] (512 bytes) to 256 bits (32 bytes) at [rcx], one bit per word. Presumably the ML-KEM polynomial-to-message step - confirm against the C caller.
+ sub rsp, 64  ; scratch for xmm6-xmm9, which are overwritten below and restored before ret
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu ymm8, YMMWORD PTR L_mlkem_to_msg_avx2_hqs  ; ymm8 = 0680h in every word (unaligned load: table is only 16-byte aligned)
+ vmovdqu ymm9, YMMWORD PTR L_mlkem_to_msg_avx2_hhqs  ; ymm9 = 0fcc1h in every word
+ vpsubw ymm0, ymm8, [rdx]  ; words 0..63: t = 0680h - word
+ vpsubw ymm1, ymm8, [rdx+32]
+ vpsubw ymm2, ymm8, [rdx+64]
+ vpsubw ymm3, ymm8, [rdx+96]
+ vpsraw ymm4, ymm0, 15  ; s = 0 or -1, the sign of t replicated across the word
+ vpsraw ymm5, ymm1, 15
+ vpsraw ymm6, ymm2, 15
+ vpsraw ymm7, ymm3, 15
+ vpxor ymm0, ymm0, ymm4  ; t ^= s: negative values are bit-inverted, non-negative values unchanged
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm2, ymm2, ymm6
+ vpxor ymm3, ymm3, ymm7
+ vpaddw ymm0, ymm0, ymm9  ; t += 0fcc1h; the sign of the result is the output bit
+ vpaddw ymm1, ymm1, ymm9
+ vpaddw ymm2, ymm2, ymm9
+ vpaddw ymm3, ymm3, ymm9
+ vpacksswb ymm0, ymm0, ymm1  ; narrow words to bytes with signed saturation, so the sign survives
+ vpacksswb ymm2, ymm2, ymm3
+ vpermq ymm0, ymm0, 216  ; qword order 0,2,1,3: undo the per-128-bit-lane interleave of the pack
+ vpermq ymm2, ymm2, 216
+ vpmovmskb eax, ymm0  ; gather the top bit of each of the 32 bytes
+ vpmovmskb r8d, ymm2
+ mov DWORD PTR [rcx], eax  ; output bits 0..31
+ mov DWORD PTR [rcx+4], r8d  ; output bits 32..63
+ vpsubw ymm0, ymm8, [rdx+128]  ; words 64..127: same sequence as above
+ vpsubw ymm1, ymm8, [rdx+160]
+ vpsubw ymm2, ymm8, [rdx+192]
+ vpsubw ymm3, ymm8, [rdx+224]
+ vpsraw ymm4, ymm0, 15
+ vpsraw ymm5, ymm1, 15
+ vpsraw ymm6, ymm2, 15
+ vpsraw ymm7, ymm3, 15
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm2, ymm2, ymm6
+ vpxor ymm3, ymm3, ymm7
+ vpaddw ymm0, ymm0, ymm9
+ vpaddw ymm1, ymm1, ymm9
+ vpaddw ymm2, ymm2, ymm9
+ vpaddw ymm3, ymm3, ymm9
+ vpacksswb ymm0, ymm0, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm2, ymm2, 216
+ vpmovmskb eax, ymm0
+ vpmovmskb r8d, ymm2
+ mov DWORD PTR [rcx+8], eax  ; output bits 64..95
+ mov DWORD PTR [rcx+12], r8d  ; output bits 96..127
+ vpsubw ymm0, ymm8, [rdx+256]  ; words 128..191: same sequence as above
+ vpsubw ymm1, ymm8, [rdx+288]
+ vpsubw ymm2, ymm8, [rdx+320]
+ vpsubw ymm3, ymm8, [rdx+352]
+ vpsraw ymm4, ymm0, 15
+ vpsraw ymm5, ymm1, 15
+ vpsraw ymm6, ymm2, 15
+ vpsraw ymm7, ymm3, 15
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm2, ymm2, ymm6
+ vpxor ymm3, ymm3, ymm7
+ vpaddw ymm0, ymm0, ymm9
+ vpaddw ymm1, ymm1, ymm9
+ vpaddw ymm2, ymm2, ymm9
+ vpaddw ymm3, ymm3, ymm9
+ vpacksswb ymm0, ymm0, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm2, ymm2, 216
+ vpmovmskb eax, ymm0
+ vpmovmskb r8d, ymm2
+ mov DWORD PTR [rcx+16], eax  ; output bits 128..159
+ mov DWORD PTR [rcx+20], r8d  ; output bits 160..191
+ vpsubw ymm0, ymm8, [rdx+384]  ; words 192..255: same sequence as above
+ vpsubw ymm1, ymm8, [rdx+416]
+ vpsubw ymm2, ymm8, [rdx+448]
+ vpsubw ymm3, ymm8, [rdx+480]
+ vpsraw ymm4, ymm0, 15
+ vpsraw ymm5, ymm1, 15
+ vpsraw ymm6, ymm2, 15
+ vpsraw ymm7, ymm3, 15
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm2, ymm2, ymm6
+ vpxor ymm3, ymm3, ymm7
+ vpaddw ymm0, ymm0, ymm9
+ vpaddw ymm1, ymm1, ymm9
+ vpaddw ymm2, ymm2, ymm9
+ vpaddw ymm3, ymm3, ymm9
+ vpacksswb ymm0, ymm0, ymm1
+ vpacksswb ymm2, ymm2, ymm3
+ vpermq ymm0, ymm0, 216
+ vpermq ymm2, ymm2, 216
+ vpmovmskb eax, ymm0
+ vpmovmskb r8d, ymm2
+ mov DWORD PTR [rcx+24], eax  ; output bits 192..223
+ mov DWORD PTR [rcx+28], r8d  ; output bits 224..255
+ vzeroupper  ; clear the upper halves of the ymm registers before returning
+ vmovdqu xmm6, OWORD PTR [rsp]  ; restore xmm6-xmm9 from the scratch area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ add rsp, 64
+ ret
+mlkem_to_msg_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_from_bytes_avx2_shuf BYTE 00h, 01h, 02h, 0ffh, 03h, 04h, 05h, 0ffh  ; vpshufb control: three source bytes per dword, fourth byte zeroed (0ffh index)
+ BYTE 06h, 07h, 08h, 0ffh, 09h, 0ah, 0bh, 0ffh
+ BYTE 04h, 05h, 06h, 0ffh, 07h, 08h, 09h, 0ffh  ; upper 128-bit lane starts at byte 4 of that lane
+ BYTE 0ah, 0bh, 0ch, 0ffh, 0dh, 0eh, 0fh, 0ffh
+ptr_L_mlkem_from_bytes_avx2_shuf QWORD L_mlkem_from_bytes_avx2_shuf  ; address of the table; not referenced by the procedure below
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_from_bytes_avx2_mask DWORD 00000fffh, 00000fffh, 00000fffh, 00000fffh  ; low 12 bits of every dword
+ DWORD 00000fffh, 00000fffh, 00000fffh, 00000fffh
+ptr_L_mlkem_from_bytes_avx2_mask QWORD L_mlkem_from_bytes_avx2_mask  ; address of the table; not referenced by the procedure below
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_from_bytes_avx2 PROC  ; Expand 384 packed bytes at [rdx] into 256 16-bit words (512 bytes) at [rcx]; every 3 input bytes yield two 12-bit values.
+ sub rsp, 128  ; scratch for xmm6-xmm13, which are overwritten below and restored before ret
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu ymm0, YMMWORD PTR [rdx]  ; NOTE(review): the same load is repeated three lines below with ymm0 untouched in between, so this one looks redundant
+ vmovdqu ymm12, YMMWORD PTR L_mlkem_from_bytes_avx2_shuf  ; ymm12 = byte-shuffle control
+ vmovdqu ymm13, YMMWORD PTR L_mlkem_from_bytes_avx2_mask  ; ymm13 = 00000fffh in every dword
+ vmovdqu ymm0, YMMWORD PTR [rdx]  ; first half: input bytes 0..191
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vmovdqu ymm2, YMMWORD PTR [rdx+64]
+ vmovdqu ymm3, YMMWORD PTR [rdx+96]
+ vmovdqu ymm4, YMMWORD PTR [rdx+128]
+ vmovdqu ymm5, YMMWORD PTR [rdx+160]
+ vpermq ymm7, ymm5, 233  ; spread the six 32-byte inputs over ymm0..ymm7 so each gets the 24 bytes its vpshufb will consume
+ vpermq ymm8, ymm5, 0
+ vpermq ymm6, ymm4, 62
+ vpermq ymm9, ymm4, 64
+ vpermq ymm5, ymm3, 3
+ vpermq ymm4, ymm3, 148
+ vpermq ymm3, ymm2, 233
+ vpermq ymm10, ymm2, 0
+ vpermq ymm2, ymm1, 62
+ vpermq ymm11, ymm1, 64
+ vpermq ymm1, ymm0, 3
+ vpermq ymm0, ymm0, 148
+ vpblendd ymm6, ymm6, ymm8, 192  ; stitch together registers whose 24 bytes straddle two inputs
+ vpblendd ymm5, ymm5, ymm9, 252
+ vpblendd ymm2, ymm2, ymm10, 192
+ vpblendd ymm1, ymm1, ymm11, 252
+ vpshufb ymm0, ymm0, ymm12  ; each dword now holds 3 consecutive input bytes (24 bits) with a zero top byte
+ vpshufb ymm1, ymm1, ymm12
+ vpshufb ymm2, ymm2, ymm12
+ vpshufb ymm3, ymm3, ymm12
+ vpshufb ymm4, ymm4, ymm12
+ vpshufb ymm5, ymm5, ymm12
+ vpshufb ymm6, ymm6, ymm12
+ vpshufb ymm7, ymm7, ymm12
+ vpandn ymm8, ymm13, ymm0  ; bits 12..23 of each dword (everything above the low 12 bits)
+ vpandn ymm9, ymm13, ymm1
+ vpandn ymm10, ymm13, ymm2
+ vpandn ymm11, ymm13, ymm3
+ vpand ymm0, ymm13, ymm0  ; bits 0..11 of each dword
+ vpand ymm1, ymm13, ymm1
+ vpand ymm2, ymm13, ymm2
+ vpand ymm3, ymm13, ymm3
+ vpslld ymm8, ymm8, 4  ; move the upper 12-bit value into the high 16-bit half of the dword
+ vpslld ymm9, ymm9, 4
+ vpslld ymm10, ymm10, 4
+ vpslld ymm11, ymm11, 4
+ vpor ymm0, ymm0, ymm8  ; dword = two 16-bit words, each carrying one 12-bit value
+ vpor ymm1, ymm1, ymm9
+ vpor ymm2, ymm2, ymm10
+ vpor ymm3, ymm3, ymm11
+ vpandn ymm8, ymm13, ymm4  ; same split/shift/merge for ymm4..ymm7
+ vpandn ymm9, ymm13, ymm5
+ vpandn ymm10, ymm13, ymm6
+ vpandn ymm11, ymm13, ymm7
+ vpand ymm4, ymm13, ymm4
+ vpand ymm5, ymm13, ymm5
+ vpand ymm6, ymm13, ymm6
+ vpand ymm7, ymm13, ymm7
+ vpslld ymm8, ymm8, 4
+ vpslld ymm9, ymm9, 4
+ vpslld ymm10, ymm10, 4
+ vpslld ymm11, ymm11, 4
+ vpor ymm4, ymm4, ymm8
+ vpor ymm5, ymm5, ymm9
+ vpor ymm6, ymm6, ymm10
+ vpor ymm7, ymm7, ymm11
+ vmovdqu YMMWORD PTR [rcx], ymm0  ; output words 0..127
+ vmovdqu YMMWORD PTR [rcx+32], ymm1
+ vmovdqu YMMWORD PTR [rcx+64], ymm2
+ vmovdqu YMMWORD PTR [rcx+96], ymm3
+ vmovdqu YMMWORD PTR [rcx+128], ymm4
+ vmovdqu YMMWORD PTR [rcx+160], ymm5
+ vmovdqu YMMWORD PTR [rcx+192], ymm6
+ vmovdqu YMMWORD PTR [rcx+224], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rdx+192]  ; second half: input bytes 192..383, same sequence as above
+ vmovdqu ymm1, YMMWORD PTR [rdx+224]
+ vmovdqu ymm2, YMMWORD PTR [rdx+256]
+ vmovdqu ymm3, YMMWORD PTR [rdx+288]
+ vmovdqu ymm4, YMMWORD PTR [rdx+320]
+ vmovdqu ymm5, YMMWORD PTR [rdx+352]
+ vpermq ymm7, ymm5, 233
+ vpermq ymm8, ymm5, 0
+ vpermq ymm6, ymm4, 62
+ vpermq ymm9, ymm4, 64
+ vpermq ymm5, ymm3, 3
+ vpermq ymm4, ymm3, 148
+ vpermq ymm3, ymm2, 233
+ vpermq ymm10, ymm2, 0
+ vpermq ymm2, ymm1, 62
+ vpermq ymm11, ymm1, 64
+ vpermq ymm1, ymm0, 3
+ vpermq ymm0, ymm0, 148
+ vpblendd ymm6, ymm6, ymm8, 192
+ vpblendd ymm5, ymm5, ymm9, 252
+ vpblendd ymm2, ymm2, ymm10, 192
+ vpblendd ymm1, ymm1, ymm11, 252
+ vpshufb ymm0, ymm0, ymm12
+ vpshufb ymm1, ymm1, ymm12
+ vpshufb ymm2, ymm2, ymm12
+ vpshufb ymm3, ymm3, ymm12
+ vpshufb ymm4, ymm4, ymm12
+ vpshufb ymm5, ymm5, ymm12
+ vpshufb ymm6, ymm6, ymm12
+ vpshufb ymm7, ymm7, ymm12
+ vpandn ymm8, ymm13, ymm0
+ vpandn ymm9, ymm13, ymm1
+ vpandn ymm10, ymm13, ymm2
+ vpandn ymm11, ymm13, ymm3
+ vpand ymm0, ymm13, ymm0
+ vpand ymm1, ymm13, ymm1
+ vpand ymm2, ymm13, ymm2
+ vpand ymm3, ymm13, ymm3
+ vpslld ymm8, ymm8, 4
+ vpslld ymm9, ymm9, 4
+ vpslld ymm10, ymm10, 4
+ vpslld ymm11, ymm11, 4
+ vpor ymm0, ymm0, ymm8
+ vpor ymm1, ymm1, ymm9
+ vpor ymm2, ymm2, ymm10
+ vpor ymm3, ymm3, ymm11
+ vpandn ymm8, ymm13, ymm4
+ vpandn ymm9, ymm13, ymm5
+ vpandn ymm10, ymm13, ymm6
+ vpandn ymm11, ymm13, ymm7
+ vpand ymm4, ymm13, ymm4
+ vpand ymm5, ymm13, ymm5
+ vpand ymm6, ymm13, ymm6
+ vpand ymm7, ymm13, ymm7
+ vpslld ymm8, ymm8, 4
+ vpslld ymm9, ymm9, 4
+ vpslld ymm10, ymm10, 4
+ vpslld ymm11, ymm11, 4
+ vpor ymm4, ymm4, ymm8
+ vpor ymm5, ymm5, ymm9
+ vpor ymm6, ymm6, ymm10
+ vpor ymm7, ymm7, ymm11
+ vmovdqu YMMWORD PTR [rcx+256], ymm0  ; output words 128..255
+ vmovdqu YMMWORD PTR [rcx+288], ymm1
+ vmovdqu YMMWORD PTR [rcx+320], ymm2
+ vmovdqu YMMWORD PTR [rcx+352], ymm3
+ vmovdqu YMMWORD PTR [rcx+384], ymm4
+ vmovdqu YMMWORD PTR [rcx+416], ymm5
+ vmovdqu YMMWORD PTR [rcx+448], ymm6
+ vmovdqu YMMWORD PTR [rcx+480], ymm7
+ vzeroupper  ; clear the upper halves of the ymm registers before returning
+ vmovdqu xmm6, OWORD PTR [rsp]  ; restore xmm6-xmm13 from the scratch area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ add rsp, 128
+ ret
+mlkem_from_bytes_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_to_bytes_avx2_mask DWORD 00000fffh, 00000fffh, 00000fffh, 00000fffh  ; low 12 bits of every dword
+ DWORD 00000fffh, 00000fffh, 00000fffh, 00000fffh
+ptr_L_mlkem_to_bytes_avx2_mask QWORD L_mlkem_to_bytes_avx2_mask  ; address of the table; not referenced by the procedure below
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_to_bytes_avx2_shuf BYTE 00h, 01h, 02h, 04h, 05h, 06h, 08h, 09h  ; vpshufb control: keep bytes 0..2 of each dword, drop byte 3; 0ffh entries produce zero
+ BYTE 0ah, 0ch, 0dh, 0eh, 0ffh, 0ffh, 0ffh, 0ffh
+ BYTE 05h, 06h, 08h, 09h, 0ah, 0ch, 0dh, 0eh
+ BYTE 0ffh, 0ffh, 0ffh, 0ffh, 00h, 01h, 02h, 04h
+ptr_L_mlkem_to_bytes_avx2_shuf QWORD L_mlkem_to_bytes_avx2_shuf  ; address of the table; not referenced by the procedure below
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_to_bytes_avx2_perm DWORD 00000000h, 00000001h, 00000002h, 00000007h  ; vpermd dword indices applied after the byte shuffle
+ DWORD 00000004h, 00000005h, 00000003h, 00000006h
+ptr_L_mlkem_to_bytes_avx2_perm QWORD L_mlkem_to_bytes_avx2_perm  ; address of the table; not referenced by the procedure below
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_to_bytes_avx2 PROC  ; Pack 256 16-bit words at [rdx] (512 bytes) into 384 bytes at [rcx], 12 bits per word, after one conditional subtraction of mlkem_q.
+ sub rsp, 160  ; scratch for xmm6-xmm15, which are overwritten below and restored before ret
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vmovdqu ymm12, YMMWORD PTR mlkem_q  ; ymm12 = mlkem_q table (defined outside this block; presumably the modulus in every word - confirm)
+ vmovdqu ymm13, YMMWORD PTR L_mlkem_to_bytes_avx2_mask  ; ymm13 = 00000fffh in every dword
+ vmovdqu ymm14, YMMWORD PTR L_mlkem_to_bytes_avx2_shuf  ; ymm14 = byte-shuffle control
+ vmovdqu ymm15, YMMWORD PTR L_mlkem_to_bytes_avx2_perm  ; ymm15 = dword-permute indices
+ vmovdqu ymm0, YMMWORD PTR [rdx]  ; first half: input words 0..127
+ vmovdqu ymm1, YMMWORD PTR [rdx+32]
+ vmovdqu ymm2, YMMWORD PTR [rdx+64]
+ vmovdqu ymm3, YMMWORD PTR [rdx+96]
+ vmovdqu ymm4, YMMWORD PTR [rdx+128]
+ vmovdqu ymm5, YMMWORD PTR [rdx+160]
+ vmovdqu ymm6, YMMWORD PTR [rdx+192]
+ vmovdqu ymm7, YMMWORD PTR [rdx+224]
+ vpsubw ymm8, ymm0, ymm12  ; t = x - q
+ vpsubw ymm9, ymm1, ymm12
+ vpsubw ymm10, ymm2, ymm12
+ vpsubw ymm11, ymm3, ymm12
+ vpsraw ymm0, ymm8, 15  ; m = -1 where t is negative, else 0
+ vpsraw ymm1, ymm9, 15
+ vpsraw ymm2, ymm10, 15
+ vpsraw ymm3, ymm11, 15
+ vpand ymm0, ymm0, ymm12  ; m & q: q where t went negative, else 0
+ vpand ymm1, ymm1, ymm12
+ vpand ymm2, ymm2, ymm12
+ vpand ymm3, ymm3, ymm12
+ vpaddw ymm0, ymm0, ymm8  ; result = t + (m & q): branch-free "subtract q only if x >= q"
+ vpaddw ymm1, ymm1, ymm9
+ vpaddw ymm2, ymm2, ymm10
+ vpaddw ymm3, ymm3, ymm11
+ vpsubw ymm8, ymm4, ymm12  ; same conditional subtraction for ymm4..ymm7
+ vpsubw ymm9, ymm5, ymm12
+ vpsubw ymm10, ymm6, ymm12
+ vpsubw ymm11, ymm7, ymm12
+ vpsraw ymm4, ymm8, 15
+ vpsraw ymm5, ymm9, 15
+ vpsraw ymm6, ymm10, 15
+ vpsraw ymm7, ymm11, 15
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vpand ymm6, ymm6, ymm12
+ vpand ymm7, ymm7, ymm12
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ vpaddw ymm6, ymm6, ymm10
+ vpaddw ymm7, ymm7, ymm11
+ vpsrld ymm8, ymm0, 16  ; odd word of each dword pair
+ vpsrld ymm9, ymm1, 16
+ vpsrld ymm10, ymm2, 16
+ vpsrld ymm11, ymm3, 16
+ vpand ymm0, ymm13, ymm0  ; even word, limited to 12 bits
+ vpand ymm1, ymm13, ymm1
+ vpand ymm2, ymm13, ymm2
+ vpand ymm3, ymm13, ymm3
+ vpslld ymm8, ymm8, 12  ; place the odd word directly above the even word's 12 bits
+ vpslld ymm9, ymm9, 12
+ vpslld ymm10, ymm10, 12
+ vpslld ymm11, ymm11, 12
+ vpor ymm0, ymm0, ymm8  ; each dword now carries two values packed into its low 24 bits
+ vpor ymm1, ymm1, ymm9
+ vpor ymm2, ymm2, ymm10
+ vpor ymm3, ymm3, ymm11
+ vpsrld ymm8, ymm4, 16  ; same 2x12-bit merge for ymm4..ymm7
+ vpsrld ymm9, ymm5, 16
+ vpsrld ymm10, ymm6, 16
+ vpsrld ymm11, ymm7, 16
+ vpand ymm4, ymm13, ymm4
+ vpand ymm5, ymm13, ymm5
+ vpand ymm6, ymm13, ymm6
+ vpand ymm7, ymm13, ymm7
+ vpslld ymm8, ymm8, 12
+ vpslld ymm9, ymm9, 12
+ vpslld ymm10, ymm10, 12
+ vpslld ymm11, ymm11, 12
+ vpor ymm4, ymm4, ymm8
+ vpor ymm5, ymm5, ymm9
+ vpor ymm6, ymm6, ymm10
+ vpor ymm7, ymm7, ymm11
+ vpshufb ymm0, ymm0, ymm14  ; squeeze out the unused top byte of every dword within each 128-bit lane
+ vpshufb ymm1, ymm1, ymm14
+ vpshufb ymm2, ymm2, ymm14
+ vpshufb ymm3, ymm3, ymm14
+ vpshufb ymm4, ymm4, ymm14
+ vpshufb ymm5, ymm5, ymm14
+ vpshufb ymm6, ymm6, ymm14
+ vpshufb ymm7, ymm7, ymm14
+ vpermd ymm0, ymm15, ymm0  ; reorder dwords across lanes using the perm table
+ vpermd ymm1, ymm15, ymm1
+ vpermd ymm2, ymm15, ymm2
+ vpermd ymm3, ymm15, ymm3
+ vpermd ymm4, ymm15, ymm4
+ vpermd ymm5, ymm15, ymm5
+ vpermd ymm6, ymm15, ymm6
+ vpermd ymm7, ymm15, ymm7
+ vpermq ymm8, ymm6, 2  ; merge the eight partially filled registers into six full 32-byte outputs (ymm2..ymm7)
+ vpermq ymm7, ymm7, 144
+ vpermq ymm9, ymm5, 9
+ vpermq ymm6, ymm6, 64
+ vpermq ymm5, ymm5, 0
+ vpblendd ymm5, ymm5, ymm4, 63
+ vpermq ymm10, ymm2, 2
+ vpermq ymm4, ymm3, 144
+ vpermq ymm11, ymm1, 9
+ vpermq ymm3, ymm2, 64
+ vpermq ymm2, ymm1, 0
+ vpblendd ymm2, ymm2, ymm0, 63
+ vpblendd ymm7, ymm7, ymm8, 3
+ vpblendd ymm6, ymm6, ymm9, 15
+ vpblendd ymm4, ymm4, ymm10, 3
+ vpblendd ymm3, ymm3, ymm11, 15
+ vmovdqu YMMWORD PTR [rcx], ymm2  ; output bytes 0..191
+ vmovdqu YMMWORD PTR [rcx+32], ymm3
+ vmovdqu YMMWORD PTR [rcx+64], ymm4
+ vmovdqu YMMWORD PTR [rcx+96], ymm5
+ vmovdqu YMMWORD PTR [rcx+128], ymm6
+ vmovdqu YMMWORD PTR [rcx+160], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rdx+256]  ; second half: input words 128..255, same sequence as above
+ vmovdqu ymm1, YMMWORD PTR [rdx+288]
+ vmovdqu ymm2, YMMWORD PTR [rdx+320]
+ vmovdqu ymm3, YMMWORD PTR [rdx+352]
+ vmovdqu ymm4, YMMWORD PTR [rdx+384]
+ vmovdqu ymm5, YMMWORD PTR [rdx+416]
+ vmovdqu ymm6, YMMWORD PTR [rdx+448]
+ vmovdqu ymm7, YMMWORD PTR [rdx+480]
+ vpsubw ymm8, ymm0, ymm12
+ vpsubw ymm9, ymm1, ymm12
+ vpsubw ymm10, ymm2, ymm12
+ vpsubw ymm11, ymm3, ymm12
+ vpsraw ymm0, ymm8, 15
+ vpsraw ymm1, ymm9, 15
+ vpsraw ymm2, ymm10, 15
+ vpsraw ymm3, ymm11, 15
+ vpand ymm0, ymm0, ymm12
+ vpand ymm1, ymm1, ymm12
+ vpand ymm2, ymm2, ymm12
+ vpand ymm3, ymm3, ymm12
+ vpaddw ymm0, ymm0, ymm8
+ vpaddw ymm1, ymm1, ymm9
+ vpaddw ymm2, ymm2, ymm10
+ vpaddw ymm3, ymm3, ymm11
+ vpsubw ymm8, ymm4, ymm12
+ vpsubw ymm9, ymm5, ymm12
+ vpsubw ymm10, ymm6, ymm12
+ vpsubw ymm11, ymm7, ymm12
+ vpsraw ymm4, ymm8, 15
+ vpsraw ymm5, ymm9, 15
+ vpsraw ymm6, ymm10, 15
+ vpsraw ymm7, ymm11, 15
+ vpand ymm4, ymm4, ymm12
+ vpand ymm5, ymm5, ymm12
+ vpand ymm6, ymm6, ymm12
+ vpand ymm7, ymm7, ymm12
+ vpaddw ymm4, ymm4, ymm8
+ vpaddw ymm5, ymm5, ymm9
+ vpaddw ymm6, ymm6, ymm10
+ vpaddw ymm7, ymm7, ymm11
+ vpsrld ymm8, ymm0, 16
+ vpsrld ymm9, ymm1, 16
+ vpsrld ymm10, ymm2, 16
+ vpsrld ymm11, ymm3, 16
+ vpand ymm0, ymm13, ymm0
+ vpand ymm1, ymm13, ymm1
+ vpand ymm2, ymm13, ymm2
+ vpand ymm3, ymm13, ymm3
+ vpslld ymm8, ymm8, 12
+ vpslld ymm9, ymm9, 12
+ vpslld ymm10, ymm10, 12
+ vpslld ymm11, ymm11, 12
+ vpor ymm0, ymm0, ymm8
+ vpor ymm1, ymm1, ymm9
+ vpor ymm2, ymm2, ymm10
+ vpor ymm3, ymm3, ymm11
+ vpsrld ymm8, ymm4, 16
+ vpsrld ymm9, ymm5, 16
+ vpsrld ymm10, ymm6, 16
+ vpsrld ymm11, ymm7, 16
+ vpand ymm4, ymm13, ymm4
+ vpand ymm5, ymm13, ymm5
+ vpand ymm6, ymm13, ymm6
+ vpand ymm7, ymm13, ymm7
+ vpslld ymm8, ymm8, 12
+ vpslld ymm9, ymm9, 12
+ vpslld ymm10, ymm10, 12
+ vpslld ymm11, ymm11, 12
+ vpor ymm4, ymm4, ymm8
+ vpor ymm5, ymm5, ymm9
+ vpor ymm6, ymm6, ymm10
+ vpor ymm7, ymm7, ymm11
+ vpshufb ymm0, ymm0, ymm14
+ vpshufb ymm1, ymm1, ymm14
+ vpshufb ymm2, ymm2, ymm14
+ vpshufb ymm3, ymm3, ymm14
+ vpshufb ymm4, ymm4, ymm14
+ vpshufb ymm5, ymm5, ymm14
+ vpshufb ymm6, ymm6, ymm14
+ vpshufb ymm7, ymm7, ymm14
+ vpermd ymm0, ymm15, ymm0
+ vpermd ymm1, ymm15, ymm1
+ vpermd ymm2, ymm15, ymm2
+ vpermd ymm3, ymm15, ymm3
+ vpermd ymm4, ymm15, ymm4
+ vpermd ymm5, ymm15, ymm5
+ vpermd ymm6, ymm15, ymm6
+ vpermd ymm7, ymm15, ymm7
+ vpermq ymm8, ymm6, 2
+ vpermq ymm7, ymm7, 144
+ vpermq ymm9, ymm5, 9
+ vpermq ymm6, ymm6, 64
+ vpermq ymm5, ymm5, 0
+ vpblendd ymm5, ymm5, ymm4, 63
+ vpermq ymm10, ymm2, 2
+ vpermq ymm4, ymm3, 144
+ vpermq ymm11, ymm1, 9
+ vpermq ymm3, ymm2, 64
+ vpermq ymm2, ymm1, 0
+ vpblendd ymm2, ymm2, ymm0, 63
+ vpblendd ymm7, ymm7, ymm8, 3
+ vpblendd ymm6, ymm6, ymm9, 15
+ vpblendd ymm4, ymm4, ymm10, 3
+ vpblendd ymm3, ymm3, ymm11, 15
+ vmovdqu YMMWORD PTR [rcx+192], ymm2  ; output bytes 192..383
+ vmovdqu YMMWORD PTR [rcx+224], ymm3
+ vmovdqu YMMWORD PTR [rcx+256], ymm4
+ vmovdqu YMMWORD PTR [rcx+288], ymm5
+ vmovdqu YMMWORD PTR [rcx+320], ymm6
+ vmovdqu YMMWORD PTR [rcx+352], ymm7
+ vzeroupper  ; clear the upper halves of the ymm registers before returning
+ vmovdqu xmm6, OWORD PTR [rsp]  ; restore xmm6-xmm15 from the scratch area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+mlkem_to_bytes_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_cmp_avx2 PROC  ; Compare the buffers at [rcx] and [rdx]; r8d = length. Returns eax = 0 when equal, -1 otherwise. Compares 768, 1088 or 1568 bytes: any r8d other than 768 or 1088 falls through to the full 1568.
+ vpxor ymm2, ymm2, ymm2  ; ymm2/ymm3 accumulate the OR of all byte differences
+ vpxor ymm3, ymm3, ymm3
+ mov r9d, 0  ; value returned when the buffers match
+ mov r10d, -1  ; value returned when they differ
+ vmovdqu ymm0, YMMWORD PTR [rcx]  ; bytes 0..767 are always compared
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vpxor ymm0, ymm0, [rdx]  ; XOR is non-zero exactly where the two buffers differ
+ vpxor ymm1, ymm1, [rdx+32]
+ vpor ymm2, ymm2, ymm0  ; fold into the accumulators; no early exit on a mismatch
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+64]
+ vmovdqu ymm1, YMMWORD PTR [rcx+96]
+ vpxor ymm0, ymm0, [rdx+64]
+ vpxor ymm1, ymm1, [rdx+96]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+128]
+ vmovdqu ymm1, YMMWORD PTR [rcx+160]
+ vpxor ymm0, ymm0, [rdx+128]
+ vpxor ymm1, ymm1, [rdx+160]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+192]
+ vmovdqu ymm1, YMMWORD PTR [rcx+224]
+ vpxor ymm0, ymm0, [rdx+192]
+ vpxor ymm1, ymm1, [rdx+224]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vpxor ymm0, ymm0, [rdx+256]
+ vpxor ymm1, ymm1, [rdx+288]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+320]
+ vmovdqu ymm1, YMMWORD PTR [rcx+352]
+ vpxor ymm0, ymm0, [rdx+320]
+ vpxor ymm1, ymm1, [rdx+352]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+384]
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vpxor ymm0, ymm0, [rdx+384]
+ vpxor ymm1, ymm1, [rdx+416]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+448]
+ vmovdqu ymm1, YMMWORD PTR [rcx+480]
+ vpxor ymm0, ymm0, [rdx+448]
+ vpxor ymm1, ymm1, [rdx+480]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+512]
+ vmovdqu ymm1, YMMWORD PTR [rcx+544]
+ vpxor ymm0, ymm0, [rdx+512]
+ vpxor ymm1, ymm1, [rdx+544]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+576]
+ vmovdqu ymm1, YMMWORD PTR [rcx+608]
+ vpxor ymm0, ymm0, [rdx+576]
+ vpxor ymm1, ymm1, [rdx+608]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+640]
+ vmovdqu ymm1, YMMWORD PTR [rcx+672]
+ vpxor ymm0, ymm0, [rdx+640]
+ vpxor ymm1, ymm1, [rdx+672]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+704]
+ vmovdqu ymm1, YMMWORD PTR [rcx+736]
+ vpxor ymm0, ymm0, [rdx+704]
+ vpxor ymm1, ymm1, [rdx+736]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ sub r8d, 768  ; length was exactly 768: nothing more to compare
+ jz L_mlkem_cmp_avx2_done  ; branch depends only on the length, never on the compared data
+ vmovdqu ymm0, YMMWORD PTR [rcx+768]  ; bytes 768..1087
+ vmovdqu ymm1, YMMWORD PTR [rcx+800]
+ vpxor ymm0, ymm0, [rdx+768]
+ vpxor ymm1, ymm1, [rdx+800]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+832]
+ vmovdqu ymm1, YMMWORD PTR [rcx+864]
+ vpxor ymm0, ymm0, [rdx+832]
+ vpxor ymm1, ymm1, [rdx+864]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+896]
+ vmovdqu ymm1, YMMWORD PTR [rcx+928]
+ vpxor ymm0, ymm0, [rdx+896]
+ vpxor ymm1, ymm1, [rdx+928]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+960]
+ vmovdqu ymm1, YMMWORD PTR [rcx+992]
+ vpxor ymm0, ymm0, [rdx+960]
+ vpxor ymm1, ymm1, [rdx+992]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+1024]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1056]
+ vpxor ymm0, ymm0, [rdx+1024]
+ vpxor ymm1, ymm1, [rdx+1056]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ sub r8d, 320  ; length was exactly 1088 (768 + 320): done
+ jz L_mlkem_cmp_avx2_done
+ vmovdqu ymm0, YMMWORD PTR [rcx+1088]  ; bytes 1088..1567
+ vmovdqu ymm1, YMMWORD PTR [rcx+1120]
+ vpxor ymm0, ymm0, [rdx+1088]
+ vpxor ymm1, ymm1, [rdx+1120]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+1152]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1184]
+ vpxor ymm0, ymm0, [rdx+1152]
+ vpxor ymm1, ymm1, [rdx+1184]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+1216]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1248]
+ vpxor ymm0, ymm0, [rdx+1216]
+ vpxor ymm1, ymm1, [rdx+1248]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+1280]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1312]
+ vpxor ymm0, ymm0, [rdx+1280]
+ vpxor ymm1, ymm1, [rdx+1312]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+1344]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1376]
+ vpxor ymm0, ymm0, [rdx+1344]
+ vpxor ymm1, ymm1, [rdx+1376]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+1408]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1440]
+ vpxor ymm0, ymm0, [rdx+1408]
+ vpxor ymm1, ymm1, [rdx+1440]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+1472]
+ vmovdqu ymm1, YMMWORD PTR [rcx+1504]
+ vpxor ymm0, ymm0, [rdx+1472]
+ vpxor ymm1, ymm1, [rdx+1504]
+ vpor ymm2, ymm2, ymm0
+ vpor ymm3, ymm3, ymm1
+ vmovdqu ymm0, YMMWORD PTR [rcx+1536]  ; final 32 bytes, 1536..1567
+ vpxor ymm0, ymm0, [rdx+1536]
+ vpor ymm2, ymm2, ymm0
+L_mlkem_cmp_avx2_done:
+ vpor ymm2, ymm2, ymm3  ; combine the two accumulators
+ vptest ymm2, ymm2  ; ZF = 1 when no bit differed anywhere
+ cmovz eax, r9d  ; equal -> 0
+ cmovnz eax, r10d  ; different -> -1
+ vzeroupper  ; clear the upper halves of the ymm registers before returning
+ ret
+mlkem_cmp_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_redistribute_21_rand_avx2 PROC  ; De-interleave 672 bytes at [rcx], stored as 4-way interleaved 64-bit words, into four 168-byte (21-qword) buffers: rdx, r8, r9 and the fifth (stack) argument.
+ push r12  ; r12/r13 carry the last two tail qwords below
+ push r13
+ mov rax, QWORD PTR [rsp+56]  ; rax = fifth argument: 16 bytes of pushes + 8-byte return address + 32-byte home space
+ sub rsp, 160  ; scratch for xmm6-xmm15, which are overwritten below and restored before ret
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vmovdqu ymm0, YMMWORD PTR [rcx]  ; source bytes 0..383 (qwords 0..47)
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ vmovdqu ymm8, YMMWORD PTR [rcx+256]
+ vmovdqu ymm9, YMMWORD PTR [rcx+288]
+ vmovdqu ymm10, YMMWORD PTR [rcx+320]
+ vmovdqu ymm11, YMMWORD PTR [rcx+352]
+ vpunpcklqdq ymm12, ymm0, ymm1  ; 4x4 transpose of qwords: after it, register k of each group holds qwords k, k+4, k+8, k+12
+ vpunpckhqdq ymm13, ymm0, ymm1
+ vpunpcklqdq ymm14, ymm2, ymm3
+ vpunpckhqdq ymm15, ymm2, ymm3
+ vperm2i128 ymm0, ymm12, ymm14, 32  ; 32 = low 128-bit halves of both sources
+ vperm2i128 ymm1, ymm13, ymm15, 32
+ vperm2i128 ymm2, ymm12, ymm14, 49  ; 49 = high 128-bit halves of both sources
+ vperm2i128 ymm3, ymm13, ymm15, 49
+ vpunpcklqdq ymm12, ymm4, ymm5  ; same transpose for the second group of four registers
+ vpunpckhqdq ymm13, ymm4, ymm5
+ vpunpcklqdq ymm14, ymm6, ymm7
+ vpunpckhqdq ymm15, ymm6, ymm7
+ vperm2i128 ymm4, ymm12, ymm14, 32
+ vperm2i128 ymm5, ymm13, ymm15, 32
+ vperm2i128 ymm6, ymm12, ymm14, 49
+ vperm2i128 ymm7, ymm13, ymm15, 49
+ vpunpcklqdq ymm12, ymm8, ymm9  ; and for the third group
+ vpunpckhqdq ymm13, ymm8, ymm9
+ vpunpcklqdq ymm14, ymm10, ymm11
+ vpunpckhqdq ymm15, ymm10, ymm11
+ vperm2i128 ymm8, ymm12, ymm14, 32
+ vperm2i128 ymm9, ymm13, ymm15, 32
+ vperm2i128 ymm10, ymm12, ymm14, 49
+ vperm2i128 ymm11, ymm13, ymm15, 49
+ vmovdqu YMMWORD PTR [rdx], ymm0  ; stream 0 -> rdx, bytes 0..95
+ vmovdqu YMMWORD PTR [rdx+32], ymm4
+ vmovdqu YMMWORD PTR [rdx+64], ymm8
+ vmovdqu YMMWORD PTR [r8], ymm1  ; stream 1 -> r8
+ vmovdqu YMMWORD PTR [r8+32], ymm5
+ vmovdqu YMMWORD PTR [r8+64], ymm9
+ vmovdqu YMMWORD PTR [r9], ymm2  ; stream 2 -> r9
+ vmovdqu YMMWORD PTR [r9+32], ymm6
+ vmovdqu YMMWORD PTR [r9+64], ymm10
+ vmovdqu YMMWORD PTR [rax], ymm3  ; stream 3 -> fifth argument
+ vmovdqu YMMWORD PTR [rax+32], ymm7
+ vmovdqu YMMWORD PTR [rax+64], ymm11
+ vmovdqu ymm0, YMMWORD PTR [rcx+384]  ; source bytes 384..639 (qwords 48..79)
+ vmovdqu ymm1, YMMWORD PTR [rcx+416]
+ vmovdqu ymm2, YMMWORD PTR [rcx+448]
+ vmovdqu ymm3, YMMWORD PTR [rcx+480]
+ vmovdqu ymm4, YMMWORD PTR [rcx+512]
+ vmovdqu ymm5, YMMWORD PTR [rcx+544]
+ vmovdqu ymm6, YMMWORD PTR [rcx+576]
+ vmovdqu ymm7, YMMWORD PTR [rcx+608]
+ mov r10, QWORD PTR [rcx+640]  ; last four qwords: one per stream, moved through general-purpose registers
+ mov r11, QWORD PTR [rcx+648]
+ mov r12, QWORD PTR [rcx+656]
+ mov r13, QWORD PTR [rcx+664]
+ vpunpcklqdq ymm12, ymm0, ymm1  ; same 4x4 qword transpose as above
+ vpunpckhqdq ymm13, ymm0, ymm1
+ vpunpcklqdq ymm14, ymm2, ymm3
+ vpunpckhqdq ymm15, ymm2, ymm3
+ vperm2i128 ymm0, ymm12, ymm14, 32
+ vperm2i128 ymm1, ymm13, ymm15, 32
+ vperm2i128 ymm2, ymm12, ymm14, 49
+ vperm2i128 ymm3, ymm13, ymm15, 49
+ vpunpcklqdq ymm12, ymm4, ymm5
+ vpunpckhqdq ymm13, ymm4, ymm5
+ vpunpcklqdq ymm14, ymm6, ymm7
+ vpunpckhqdq ymm15, ymm6, ymm7
+ vperm2i128 ymm4, ymm12, ymm14, 32
+ vperm2i128 ymm5, ymm13, ymm15, 32
+ vperm2i128 ymm6, ymm12, ymm14, 49
+ vperm2i128 ymm7, ymm13, ymm15, 49
+ vmovdqu YMMWORD PTR [rdx+96], ymm0  ; stream 0, bytes 96..159
+ vmovdqu YMMWORD PTR [rdx+128], ymm4
+ mov QWORD PTR [rdx+160], r10  ; stream 0, 21st qword
+ vmovdqu YMMWORD PTR [r8+96], ymm1  ; stream 1
+ vmovdqu YMMWORD PTR [r8+128], ymm5
+ mov QWORD PTR [r8+160], r11
+ vmovdqu YMMWORD PTR [r9+96], ymm2  ; stream 2
+ vmovdqu YMMWORD PTR [r9+128], ymm6
+ mov QWORD PTR [r9+160], r12
+ vmovdqu YMMWORD PTR [rax+96], ymm3  ; stream 3
+ vmovdqu YMMWORD PTR [rax+128], ymm7
+ mov QWORD PTR [rax+160], r13
+ vzeroupper  ; clear the upper halves of the ymm registers before returning
+ vmovdqu xmm6, OWORD PTR [rsp]  ; restore xmm6-xmm15 from the scratch area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ pop r13
+ pop r12
+ ret
+mlkem_redistribute_21_rand_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_redistribute_17_rand_avx2 PROC  ; De-interleave 544 bytes at [rcx], stored as 4-way interleaved 64-bit words, into four 136-byte (17-qword) buffers: rdx, r8, r9 and the fifth (stack) argument.
+ push r12  ; r12/r13 carry the last two tail qwords below
+ push r13
+ mov rax, QWORD PTR [rsp+56]  ; rax = fifth argument: 16 bytes of pushes + 8-byte return address + 32-byte home space
+ sub rsp, 96  ; scratch for xmm6-xmm11, which are overwritten below and restored before ret
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu ymm0, YMMWORD PTR [rcx]  ; source bytes 0..255 (qwords 0..31)
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ vpunpcklqdq ymm8, ymm0, ymm1  ; 4x4 transpose of qwords: after it, register k of each group holds qwords k, k+4, k+8, k+12
+ vpunpckhqdq ymm9, ymm0, ymm1
+ vpunpcklqdq ymm10, ymm2, ymm3
+ vpunpckhqdq ymm11, ymm2, ymm3
+ vperm2i128 ymm0, ymm8, ymm10, 32  ; 32 = low 128-bit halves of both sources
+ vperm2i128 ymm1, ymm9, ymm11, 32
+ vperm2i128 ymm2, ymm8, ymm10, 49  ; 49 = high 128-bit halves of both sources
+ vperm2i128 ymm3, ymm9, ymm11, 49
+ vpunpcklqdq ymm8, ymm4, ymm5  ; same transpose for the second group of four registers
+ vpunpckhqdq ymm9, ymm4, ymm5
+ vpunpcklqdq ymm10, ymm6, ymm7
+ vpunpckhqdq ymm11, ymm6, ymm7
+ vperm2i128 ymm4, ymm8, ymm10, 32
+ vperm2i128 ymm5, ymm9, ymm11, 32
+ vperm2i128 ymm6, ymm8, ymm10, 49
+ vperm2i128 ymm7, ymm9, ymm11, 49
+ vmovdqu YMMWORD PTR [rdx], ymm0  ; stream 0 -> rdx, bytes 0..63
+ vmovdqu YMMWORD PTR [rdx+32], ymm4
+ vmovdqu YMMWORD PTR [r8], ymm1  ; stream 1 -> r8
+ vmovdqu YMMWORD PTR [r8+32], ymm5
+ vmovdqu YMMWORD PTR [r9], ymm2  ; stream 2 -> r9
+ vmovdqu YMMWORD PTR [r9+32], ymm6
+ vmovdqu YMMWORD PTR [rax], ymm3  ; stream 3 -> fifth argument
+ vmovdqu YMMWORD PTR [rax+32], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]  ; source bytes 256..511 (qwords 32..63)
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ mov r10, QWORD PTR [rcx+512]  ; last four qwords: one per stream, moved through general-purpose registers
+ mov r11, QWORD PTR [rcx+520]
+ mov r12, QWORD PTR [rcx+528]
+ mov r13, QWORD PTR [rcx+536]
+ vpunpcklqdq ymm8, ymm0, ymm1  ; same 4x4 qword transpose as above
+ vpunpckhqdq ymm9, ymm0, ymm1
+ vpunpcklqdq ymm10, ymm2, ymm3
+ vpunpckhqdq ymm11, ymm2, ymm3
+ vperm2i128 ymm0, ymm8, ymm10, 32
+ vperm2i128 ymm1, ymm9, ymm11, 32
+ vperm2i128 ymm2, ymm8, ymm10, 49
+ vperm2i128 ymm3, ymm9, ymm11, 49
+ vpunpcklqdq ymm8, ymm4, ymm5
+ vpunpckhqdq ymm9, ymm4, ymm5
+ vpunpcklqdq ymm10, ymm6, ymm7
+ vpunpckhqdq ymm11, ymm6, ymm7
+ vperm2i128 ymm4, ymm8, ymm10, 32
+ vperm2i128 ymm5, ymm9, ymm11, 32
+ vperm2i128 ymm6, ymm8, ymm10, 49
+ vperm2i128 ymm7, ymm9, ymm11, 49
+ vmovdqu YMMWORD PTR [rdx+64], ymm0  ; stream 0, bytes 64..127
+ vmovdqu YMMWORD PTR [rdx+96], ymm4
+ mov QWORD PTR [rdx+128], r10  ; stream 0, 17th qword
+ vmovdqu YMMWORD PTR [r8+64], ymm1  ; stream 1
+ vmovdqu YMMWORD PTR [r8+96], ymm5
+ mov QWORD PTR [r8+128], r11
+ vmovdqu YMMWORD PTR [r9+64], ymm2  ; stream 2
+ vmovdqu YMMWORD PTR [r9+96], ymm6
+ mov QWORD PTR [r9+128], r12
+ vmovdqu YMMWORD PTR [rax+64], ymm3  ; stream 3
+ vmovdqu YMMWORD PTR [rax+96], ymm7
+ mov QWORD PTR [rax+128], r13
+ vzeroupper  ; clear the upper halves of the ymm registers before returning
+ vmovdqu xmm6, OWORD PTR [rsp]  ; restore xmm6-xmm11 from the scratch area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96
+ pop r13
+ pop r12
+ ret
+mlkem_redistribute_17_rand_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_redistribute_16_rand_avx2 PROC  ; De-interleave 512 bytes at [rcx], stored as 4-way interleaved 64-bit words, into four 128-byte (16-qword) buffers: rdx, r8, r9 and the fifth (stack) argument.
+ push r12  ; NOTE(review): r12/r13 are saved and restored but never used in this procedure
+ push r13
+ mov rax, QWORD PTR [rsp+56]  ; rax = fifth argument: 16 bytes of pushes + 8-byte return address + 32-byte home space
+ sub rsp, 96  ; scratch for xmm6-xmm11, which are overwritten below and restored before ret
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu ymm0, YMMWORD PTR [rcx]  ; source bytes 0..255 (qwords 0..31)
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ vpunpcklqdq ymm8, ymm0, ymm1  ; 4x4 transpose of qwords: after it, register k of each group holds qwords k, k+4, k+8, k+12
+ vpunpckhqdq ymm9, ymm0, ymm1
+ vpunpcklqdq ymm10, ymm2, ymm3
+ vpunpckhqdq ymm11, ymm2, ymm3
+ vperm2i128 ymm0, ymm8, ymm10, 32  ; 32 = low 128-bit halves of both sources
+ vperm2i128 ymm1, ymm9, ymm11, 32
+ vperm2i128 ymm2, ymm8, ymm10, 49  ; 49 = high 128-bit halves of both sources
+ vperm2i128 ymm3, ymm9, ymm11, 49
+ vpunpcklqdq ymm8, ymm4, ymm5  ; same transpose for the second group of four registers
+ vpunpckhqdq ymm9, ymm4, ymm5
+ vpunpcklqdq ymm10, ymm6, ymm7
+ vpunpckhqdq ymm11, ymm6, ymm7
+ vperm2i128 ymm4, ymm8, ymm10, 32
+ vperm2i128 ymm5, ymm9, ymm11, 32
+ vperm2i128 ymm6, ymm8, ymm10, 49
+ vperm2i128 ymm7, ymm9, ymm11, 49
+ vmovdqu YMMWORD PTR [rdx], ymm0  ; stream 0 -> rdx, bytes 0..63
+ vmovdqu YMMWORD PTR [rdx+32], ymm4
+ vmovdqu YMMWORD PTR [r8], ymm1  ; stream 1 -> r8
+ vmovdqu YMMWORD PTR [r8+32], ymm5
+ vmovdqu YMMWORD PTR [r9], ymm2  ; stream 2 -> r9
+ vmovdqu YMMWORD PTR [r9+32], ymm6
+ vmovdqu YMMWORD PTR [rax], ymm3  ; stream 3 -> fifth argument
+ vmovdqu YMMWORD PTR [rax+32], ymm7
+ vmovdqu ymm0, YMMWORD PTR [rcx+256]  ; source bytes 256..511 (qwords 32..63)
+ vmovdqu ymm1, YMMWORD PTR [rcx+288]
+ vmovdqu ymm2, YMMWORD PTR [rcx+320]
+ vmovdqu ymm3, YMMWORD PTR [rcx+352]
+ vmovdqu ymm4, YMMWORD PTR [rcx+384]
+ vmovdqu ymm5, YMMWORD PTR [rcx+416]
+ vmovdqu ymm6, YMMWORD PTR [rcx+448]
+ vmovdqu ymm7, YMMWORD PTR [rcx+480]
+ vpunpcklqdq ymm8, ymm0, ymm1  ; same 4x4 qword transpose as above
+ vpunpckhqdq ymm9, ymm0, ymm1
+ vpunpcklqdq ymm10, ymm2, ymm3
+ vpunpckhqdq ymm11, ymm2, ymm3
+ vperm2i128 ymm0, ymm8, ymm10, 32
+ vperm2i128 ymm1, ymm9, ymm11, 32
+ vperm2i128 ymm2, ymm8, ymm10, 49
+ vperm2i128 ymm3, ymm9, ymm11, 49
+ vpunpcklqdq ymm8, ymm4, ymm5
+ vpunpckhqdq ymm9, ymm4, ymm5
+ vpunpcklqdq ymm10, ymm6, ymm7
+ vpunpckhqdq ymm11, ymm6, ymm7
+ vperm2i128 ymm4, ymm8, ymm10, 32
+ vperm2i128 ymm5, ymm9, ymm11, 32
+ vperm2i128 ymm6, ymm8, ymm10, 49
+ vperm2i128 ymm7, ymm9, ymm11, 49
+ vmovdqu YMMWORD PTR [rdx+64], ymm0  ; stream 0, bytes 64..127
+ vmovdqu YMMWORD PTR [rdx+96], ymm4
+ vmovdqu YMMWORD PTR [r8+64], ymm1  ; stream 1
+ vmovdqu YMMWORD PTR [r8+96], ymm5
+ vmovdqu YMMWORD PTR [r9+64], ymm2  ; stream 2
+ vmovdqu YMMWORD PTR [r9+96], ymm6
+ vmovdqu YMMWORD PTR [rax+64], ymm3  ; stream 3
+ vmovdqu YMMWORD PTR [rax+96], ymm7
+ vzeroupper  ; clear the upper halves of the ymm registers before returning
+ vmovdqu xmm6, OWORD PTR [rsp]  ; restore xmm6-xmm11 from the scratch area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96
+ pop r13
+ pop r12
+ ret
+mlkem_redistribute_16_rand_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_redistribute_8_rand_avx2 PROC  ; De-interleave 256 bytes at [rcx], stored as 4-way interleaved 64-bit words, into four 64-byte (8-qword) buffers: rdx, r8, r9 and the fifth (stack) argument.
+ push r12  ; NOTE(review): r12/r13 are saved and restored but never used in this procedure
+ push r13
+ mov rax, QWORD PTR [rsp+56]  ; rax = fifth argument: 16 bytes of pushes + 8-byte return address + 32-byte home space
+ sub rsp, 96  ; scratch for xmm6-xmm11, which are overwritten below and restored before ret
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu ymm0, YMMWORD PTR [rcx]  ; source bytes 0..255 (qwords 0..31)
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ vmovdqu ymm4, YMMWORD PTR [rcx+128]
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vmovdqu ymm6, YMMWORD PTR [rcx+192]
+ vmovdqu ymm7, YMMWORD PTR [rcx+224]
+ vpunpcklqdq ymm8, ymm0, ymm1  ; 4x4 transpose of qwords: after it, register k of each group holds qwords k, k+4, k+8, k+12
+ vpunpckhqdq ymm9, ymm0, ymm1
+ vpunpcklqdq ymm10, ymm2, ymm3
+ vpunpckhqdq ymm11, ymm2, ymm3
+ vperm2i128 ymm0, ymm8, ymm10, 32  ; 32 = low 128-bit halves of both sources
+ vperm2i128 ymm1, ymm9, ymm11, 32
+ vperm2i128 ymm2, ymm8, ymm10, 49  ; 49 = high 128-bit halves of both sources
+ vperm2i128 ymm3, ymm9, ymm11, 49
+ vpunpcklqdq ymm8, ymm4, ymm5  ; same transpose for the second group of four registers
+ vpunpckhqdq ymm9, ymm4, ymm5
+ vpunpcklqdq ymm10, ymm6, ymm7
+ vpunpckhqdq ymm11, ymm6, ymm7
+ vperm2i128 ymm4, ymm8, ymm10, 32
+ vperm2i128 ymm5, ymm9, ymm11, 32
+ vperm2i128 ymm6, ymm8, ymm10, 49
+ vperm2i128 ymm7, ymm9, ymm11, 49
+ vmovdqu YMMWORD PTR [rdx], ymm0  ; stream 0 -> rdx
+ vmovdqu YMMWORD PTR [rdx+32], ymm4
+ vmovdqu YMMWORD PTR [r8], ymm1  ; stream 1 -> r8
+ vmovdqu YMMWORD PTR [r8+32], ymm5
+ vmovdqu YMMWORD PTR [r9], ymm2  ; stream 2 -> r9
+ vmovdqu YMMWORD PTR [r9+32], ymm6
+ vmovdqu YMMWORD PTR [rax], ymm3  ; stream 3 -> fifth argument
+ vmovdqu YMMWORD PTR [rax+32], ymm7
+ vzeroupper  ; clear the upper halves of the ymm registers before returning
+ vmovdqu xmm6, OWORD PTR [rsp]  ; restore xmm6-xmm11 from the scratch area
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96
+ pop r13
+ pop r12
+ ret
+mlkem_redistribute_8_rand_avx2 ENDP
+_TEXT ENDS
+ENDIF
+ENDIF
+END
diff --git a/wolfssl-VS2022.vcxproj b/wolfssl-VS2022.vcxproj
index 81d32758e91..f15f6e9a598 100644
--- a/wolfssl-VS2022.vcxproj
+++ b/wolfssl-VS2022.vcxproj
@@ -57,6 +57,13 @@
wolfssl
+
+
+ false
+
StaticLibrary
v143
@@ -517,6 +524,28 @@
$(OutDir)%(Filename).obj
$(IntDir)%(Filename).obj
+
+ false
+ false
+ ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ true
+ true
+ true
+ true
+ true
+ true
+ true
+ true
+
false
false
@@ -573,6 +602,42 @@
$(OutDir)%(Filename).obj
$(IntDir)%(Filename).obj
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
@@ -585,6 +650,11 @@
true
+
+
+ USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions)
+
+
diff --git a/wolfssl.vcxproj b/wolfssl.vcxproj
index 44c23ab74ee..e00b6e6122a 100644
--- a/wolfssl.vcxproj
+++ b/wolfssl.vcxproj
@@ -56,6 +56,13 @@
Win32Proj
+
+
+ false
+
StaticLibrary
v110
@@ -517,6 +524,28 @@
$(OutDir)%(Filename).obj
$(IntDir)%(Filename).obj
+
+ false
+ false
+ ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ true
+ true
+ true
+ true
+ true
+ true
+ true
+ true
+
false
false
@@ -573,6 +602,42 @@
$(OutDir)%(Filename).obj
$(IntDir)%(Filename).obj
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
@@ -585,6 +650,11 @@
true
+
+
+ USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions)
+
+
diff --git a/wrapper/CSharp/wolfssl.vcxproj b/wrapper/CSharp/wolfssl.vcxproj
index 7a963cbd913..396ff1c1dce 100644
--- a/wrapper/CSharp/wolfssl.vcxproj
+++ b/wrapper/CSharp/wolfssl.vcxproj
@@ -40,6 +40,13 @@
Win32Proj
+
+
+ false
+
StaticLibrary
v143
@@ -399,6 +406,24 @@
$(OutDir)%(Filename).obj
$(IntDir)%(Filename).obj
+
+ false
+ false
+ ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ true
+ true
+ true
+ true
+
false
false
@@ -455,6 +480,42 @@
$(OutDir)%(Filename).obj
$(IntDir)%(Filename).obj
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
+
+ true
+ false
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(IntDir)%(Filename).obj
+
@@ -465,6 +526,11 @@
true
+
+
+ USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions)
+
+