diff --git a/.github/workflows/aesgcm-siv.yml b/.github/workflows/aesgcm-siv.yml new file mode 100644 index 0000000000..9ff7ad1e8f --- /dev/null +++ b/.github/workflows/aesgcm-siv.yml @@ -0,0 +1,148 @@ +name: AES-GCM-SIV (RFC 8452) tests + +# START OF COMMON SECTION +on: + push: + branches: [ 'release/**' ] + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + branches: [ '*' ] + # Weekday-morning cron (10:40 UTC) seeds the master-scoped ccache that PR runs + # restore (cross job only); re-runs --build-only on the default branch. + schedule: + - cron: '40 10 * * 1-5' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true +# END OF COMMON SECTION + +jobs: + # Native x86_64 'make check'. These are --enable-cryptonly (WOLFCRYPT_ONLY) + # builds, so check runs testwolfcrypt - which includes aesgcm_siv_test (the + # RFC 8452 KATs) - but not the TLS-only tests/unit.test (the tests/api group, + # test_wc_AesGcmSivEncryptDecrypt, needs a non-cryptonly build). One runner + # per config: + # - siv-c-only : no asm, exercises the software (table) POLYVAL + C CTR. + # - siv-intelasm : PCLMUL/AVX/VAES/AVX512 POLYVAL + pipelined CTR, whichever + # the runner CPU selects at runtime. + # - siv-all : SIV alongside --enable-all to catch integration regressions. + # - siv-smallstack : SIV's key schedules / derived keys live on the stack. 
+ make_check: + strategy: + fail-fast: false + matrix: + config: + - '--enable-cryptonly --enable-aesgcm-siv' + - '--enable-cryptonly --enable-intelasm --enable-sp-asm --enable-aesgcm-siv' + - '--enable-cryptonly --enable-all-crypto --enable-intelasm --enable-sp-asm --enable-aesgcm-siv' + - '--enable-cryptonly --enable-aesgcm-siv --enable-smallstack' + name: make check (${{ matrix.config }}) + if: ${{ (github.repository_owner == 'wolfssl') && (github.event_name != 'pull_request' || github.event.pull_request.draft == false) }} + runs-on: ubuntu-24.04 + timeout-minutes: 12 + steps: + - uses: actions/checkout@v5 + name: Checkout wolfSSL + + - name: Build and test AES-GCM-SIV + run: | + ./autogen.sh + ./configure ${{ matrix.config }} + make -j 4 + make check + + - name: Print errors + if: ${{ failure() }} + run: | + for file in scripts/*.log test-suite.log + do + if [ -f "$file" ]; then + echo "${file}:" + cat "$file" + echo "========================================================================" + fi + done + + # Cross-compiled AES-GCM-SIV asm paths, built out-of-tree in parallel and run + # under qemu-user (binfmt). Covers: + # - arm64-pmull : AArch64 PMULL POLYVAL (gcm_siv_arm64_crypto). + # - arm64-no-hw-crypto : AArch64 NEON 8-bit-pmul + table POLYVAL + # (gcm_siv_arm64_neon / _base) via WOLFSSL_ARMASM_NO_HW_CRYPTO. + # - armhf-crypto : ARMv8-A 32-bit vmull.p64 POLYVAL (gcm_siv_arm32_crypto); + # QEMU_CPU=max enables the crypto extensions. + # Thumb2 (gcm_siv_thumb2) targets armv7-m, which qemu-user cannot run, so it + # is covered by the bare-metal / membrowse builds, not here. 
+ cross_check: + name: Cross-arch test + if: ${{ (github.repository_owner == 'wolfssl') && (github.event_name != 'pull_request' || github.event.pull_request.draft == false) }} + runs-on: ubuntu-22.04 + timeout-minutes: 25 + steps: + - uses: actions/checkout@v5 + name: Checkout wolfSSL + + - name: Install dependencies + uses: ./.github/actions/install-apt-deps + with: + packages: autoconf automake libtool build-essential crossbuild-essential-arm64 crossbuild-essential-armhf qemu-user + ghcr-debs-tag: ubuntu-22.04-minimal + + - name: Set up ccache + uses: ./.github/actions/ccache-setup + with: + workflow-id: aesgcm-siv + read-only: ${{ github.event_name == 'pull_request' }} + max-size: 300M + + - name: Build all configs (parallel, out-of-tree) + run: | + cat > "$RUNNER_TEMP/aesgcm-siv-configs.json" <<'EOF' + [ + {"name": "arm64-pmull", "minutes": 6, + "cc": "ccache aarch64-linux-gnu-gcc", + "configure": ["--host=aarch64-linux-gnu", "--enable-cryptonly", + "--enable-all-crypto", "--disable-examples", "--enable-armasm", + "--enable-aesgcm-siv", "CFLAGS=-O2"], + "check": false, + "run": [["env", "QEMU_LD_PREFIX=/usr/aarch64-linux-gnu", "QEMU_CPU=max", + "./wolfcrypt/test/testwolfcrypt"]]}, + {"name": "arm64-no-hw-crypto", "minutes": 6, + "cc": "ccache aarch64-linux-gnu-gcc", + "configure": ["--host=aarch64-linux-gnu", "--enable-cryptonly", + "--enable-all-crypto", "--disable-examples", "--enable-armasm", + "--enable-aesgcm-siv", "CPPFLAGS=-DWOLFSSL_ARMASM_NO_HW_CRYPTO", + "CFLAGS=-O2"], + "check": false, + "run": [["env", "QEMU_LD_PREFIX=/usr/aarch64-linux-gnu", "QEMU_CPU=max", + "./wolfcrypt/test/testwolfcrypt"]]}, + {"name": "armhf-crypto", "minutes": 6, + "cc": "ccache arm-linux-gnueabihf-gcc", + "comment": "--disable-aesgcm-stream: WOLFSSL_AESGCM_STREAM's software GHASH only defines its macros for __aarch64__ armasm, not 32-bit __arm__ armasm, so all-crypto + armasm otherwise fails to build aes.c (pre-existing, unrelated to SIV).", + "configure": 
["--host=arm-linux-gnueabihf", "--enable-cryptonly", + "--enable-all-crypto", "--disable-examples", "--enable-armasm", + "--enable-aesgcm-siv", "--disable-aesgcm-stream", "CFLAGS=-O2"], + "check": false, + "run": [["env", "QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf", "QEMU_CPU=max", + "./wolfcrypt/test/testwolfcrypt"]]} + ] + EOF + .github/scripts/parallel-make-check.py \ + ${{ github.event_name == 'schedule' && '--build-only' || '' }} \ + "$RUNNER_TEMP/aesgcm-siv-configs.json" + + - name: ccache stats + if: always() + run: ccache -s || true + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v6 + with: + retention-days: 7 + name: aesgcm-siv-cross-logs + path: | + build-*/make-check.log + build-*/test-suite.log + build-*/config.log + if-no-files-found: ignore diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ecbf16332..648c6dbdca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -930,6 +930,18 @@ if(WOLFSSL_AESSIV) list(APPEND WOLFSSL_DEFINITIONS "-DWOLFSSL_AES_SIV") endif() +# AES-GCM-SIV +add_option("WOLFSSL_AESGCMSIV" + "Enable AES-GCM-SIV (RFC 8452) support (default: disabled)" + "no" "yes;no") + +if(WOLFSSL_AESGCMSIV) + if(NOT WOLFSSL_AESGCM) + message(FATAL_ERROR "AES-GCM-SIV requires AES-GCM. Please enable WOLFSSL_AESGCM.") + endif() + list(APPEND WOLFSSL_DEFINITIONS "-DWOLFSSL_AESGCM_SIV") +endif() + # AES-CTR add_option("WOLFSSL_AESCTR" "Enable wolfSSL AES-CTR support (default: disabled)" diff --git a/configure.ac b/configure.ac index 2a58545fcc..6a09f79832 100644 --- a/configure.ac +++ b/configure.ac @@ -3400,6 +3400,25 @@ then ENABLED_AESSIV=yes fi +# AES-GCM-SIV (RFC 8452) +AC_ARG_ENABLE([aesgcm-siv], + [AS_HELP_STRING([--enable-aesgcm-siv],[Enable AES-GCM-SIV (RFC 8452) (default: disabled)])], + [ ENABLED_AESGCMSIV=$enableval ], + [ ENABLED_AESGCMSIV=no ] + ) + +if test "$ENABLED_AESGCMSIV" = "yes" +then + if test "$ENABLED_AESGCM" = "no" + then + AC_MSG_ERROR([AES-GCM-SIV requires AES-GCM. 
Please enable it (--enable-aesgcm).]) + fi + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_AESGCM_SIV" + # The generated AES-GCM-SIV assembly (aes_gcm_asm.S) is guarded by + # WOLFSSL_AESGCM_SIV, so the assembler needs the define too. + AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_AESGCM_SIV" +fi + # AES-CTR AC_ARG_ENABLE([aesctr], [AS_HELP_STRING([--enable-aesctr],[Enable wolfSSL AES-CTR support (default: disabled)])], @@ -12786,6 +12805,7 @@ echo " * AES-OFB: $ENABLED_AESOFB" echo " * AES-XTS: $ENABLED_AESXTS" echo " * AES-XTS streaming: $ENABLED_AESXTS_STREAM" echo " * AES-SIV: $ENABLED_AESSIV" +echo " * AES-GCM-SIV: $ENABLED_AESGCMSIV" echo " * AES-EAX: $ENABLED_AESEAX" echo " * AES Bitspliced: $ENABLED_AESBS" echo " * AES Key Wrap: $ENABLED_AESKEYWRAP" diff --git a/tests/api/test_aes.c b/tests/api/test_aes.c index 72221cd04a..f703770afb 100644 --- a/tests/api/test_aes.c +++ b/tests/api/test_aes.c @@ -3705,6 +3705,242 @@ int test_wc_AesGcmMixedEncDecLongIV(void) * AES-GCM non-standard nonce lengths ******************************************************************************/ +#if defined(WOLFSSL_AESGCM_SIV) && !defined(NO_AES) && defined(HAVE_AESGCM) && defined(WOLFSSL_AES_128) +/* Decode a hex string into 'out' (spaces ignored, NULL -> length 0). + * Returns the byte length, or -1 on a malformed string / overflow. */ +static int gcmsiv_hex(const char* hex, byte* out, word32 max) +{ + word32 n = 0; + int hi, lo; + + if (hex == NULL) + return 0; + while (*hex != '\0') { + if (*hex == ' ') { hex++; continue; } + if (hex[1] == '\0') return -1; + hi = (*hex >= '0' && *hex <= '9') ? *hex - '0' : + (*hex >= 'a' && *hex <= 'f') ? *hex - 'a' + 10 : + (*hex >= 'A' && *hex <= 'F') ? *hex - 'A' + 10 : -1; + hex++; + lo = (*hex >= '0' && *hex <= '9') ? *hex - '0' : + (*hex >= 'a' && *hex <= 'f') ? *hex - 'a' + 10 : + (*hex >= 'A' && *hex <= 'F') ? 
*hex - 'A' + 10 : -1; + hex++; + if (hi < 0 || lo < 0 || n >= max) return -1; + out[n++] = (byte)((hi << 4) | lo); + } + return (int)n; +} + +/* Run one RFC 8452 known-answer vector (expH = ciphertext || 16-byte tag): + * encrypt and check ciphertext+tag, then decrypt and check the recovered + * plaintext and that authentication succeeds. Returns 0 on a full match, + * or a negative step code on the first failure. */ +static int gcmsiv_kat(const char* keyH, const char* nonceH, const char* aadH, + const char* ptH, const char* expH) +{ + byte key[32], nonce[12], aad[64], pt[64], exp[96], ct[80], tag[16], dec[64]; + int keySz, nonceSz, aadSz, ptSz, expSz; + + keySz = gcmsiv_hex(keyH, key, (word32)sizeof(key)); + nonceSz = gcmsiv_hex(nonceH, nonce, (word32)sizeof(nonce)); + aadSz = gcmsiv_hex(aadH, aad, (word32)sizeof(aad)); + ptSz = gcmsiv_hex(ptH, pt, (word32)sizeof(pt)); + expSz = gcmsiv_hex(expH, exp, (word32)sizeof(exp)); + if (keySz < 0 || nonceSz < 0 || aadSz < 0 || ptSz < 0 || expSz < 0) + return -1; + if (expSz != ptSz + 16) + return -2; + + if (wc_AesGcmSivEncrypt(key, (word32)keySz, nonce, (word32)nonceSz, + aad, (word32)aadSz, pt, (word32)ptSz, ct, tag, 16) != 0) + return -3; + if (ptSz > 0 && XMEMCMP(ct, exp, (size_t)ptSz) != 0) + return -4; + if (XMEMCMP(tag, exp + ptSz, 16) != 0) + return -5; + + if (wc_AesGcmSivDecrypt(key, (word32)keySz, nonce, (word32)nonceSz, + aad, (word32)aadSz, ct, (word32)ptSz, dec, tag, 16) != 0) + return -6; + if (ptSz > 0 && XMEMCMP(dec, pt, (size_t)ptSz) != 0) + return -7; + + return 0; +} +#endif /* WOLFSSL_AESGCM_SIV ... */ + +/* + * AES-GCM-SIV (RFC 8452): RFC 8452 Appendix C known-answer vectors (encrypt and + * decrypt), a full encrypt/decrypt round trip, authentication-failure handling + * (corrupted tag / AAD / ciphertext), edge cases (empty plaintext, empty AAD, + * AES-128 and AES-256), and an exhaustive invalid-parameter matrix. 
+ */ +int test_wc_AesGcmSivEncryptDecrypt(void) +{ + EXPECT_DECLS; +#if defined(WOLFSSL_AESGCM_SIV) && !defined(NO_AES) && defined(HAVE_AESGCM) && \ + defined(WOLFSSL_AES_128) + byte key[32]; + byte nonce[12]; + byte aad[20]; + byte pt[32]; + byte ct[32]; + byte tag[16]; + byte dec[32]; + word32 i; + + /* --- RFC 8452 Appendix C.1 known-answer vectors (AES-128) --- */ + ExpectIntEQ(gcmsiv_kat("01000000000000000000000000000000", + "030000000000000000000000", NULL, NULL, + "dc20e2d83f25705bb49e439eca56de25"), 0); + ExpectIntEQ(gcmsiv_kat("01000000000000000000000000000000", + "030000000000000000000000", NULL, "0100000000000000", + "b5d839330ac7b786578782fff6013b815b287c22493a364c"), 0); + ExpectIntEQ(gcmsiv_kat("01000000000000000000000000000000", + "030000000000000000000000", NULL, "010000000000000000000000", + "7323ea61d05932260047d942a4978db357391a0bc4fdec8b0d106639"), 0); + ExpectIntEQ(gcmsiv_kat("01000000000000000000000000000000", + "030000000000000000000000", "01", "0200000000000000", + "1e6daba35669f4273b0a1a2560969cdf790d99759abd1508"), 0); + ExpectIntEQ(gcmsiv_kat("01000000000000000000000000000000", + "030000000000000000000000", "01", + "02000000000000000000000000000000", + "e2b0c5da79a901c1745f700525cb335b8f8936ec039e4e4bb97ebd8c4457441f"), 0); + /* Non-block-aligned AAD (18 B) and plaintext (20 B). */ + ExpectIntEQ(gcmsiv_kat("01000000000000000000000000000000", + "030000000000000000000000", + "010000000000000000000000000000000200", + "030000000000000000000000000000000400 0000", + "6bb0fecf5ded9b77f902c7d5da236a4391dd029724afc9805e976f451e6d87f6" + "fe106514"), 0); + /* Non-block-aligned AAD (20 B) and plaintext (18 B). 
*/ + ExpectIntEQ(gcmsiv_kat("01000000000000000000000000000000", + "030000000000000000000000", + "0100000000000000000000000000000002000000", + "030000000000000000000000000000000400", + "44d0aaf6fb2f1f34add5e8064e83e12a2adabff9b2ef00fb47920cc72a0c0f13" + "b9fd"), 0); + +#ifdef WOLFSSL_AES_256 + /* --- RFC 8452 Appendix C.2 known-answer vectors (AES-256) --- */ + ExpectIntEQ(gcmsiv_kat( + "0100000000000000000000000000000000000000000000000000000000000000", + "030000000000000000000000", NULL, NULL, + "07f5f4169bbf55a8400cd47ea6fd400f"), 0); + ExpectIntEQ(gcmsiv_kat( + "0100000000000000000000000000000000000000000000000000000000000000", + "030000000000000000000000", NULL, "0100000000000000", + "c2ef328e5c71c83b843122130f7364b761e0b97427e3df28"), 0); + ExpectIntEQ(gcmsiv_kat( + "0100000000000000000000000000000000000000000000000000000000000000", + "030000000000000000000000", NULL, "010000000000000000000000", + "9aab2aeb3faa0a34aea8e2b18ca50da9ae6559e48fd10f6e5c9ca17e"), 0); +#endif /* WOLFSSL_AES_256 */ + + /* --- Round trip + authentication-failure handling (AES-128) --- */ + for (i = 0; i < (word32)sizeof(key); i++) key[i] = (byte)i; + for (i = 0; i < (word32)sizeof(nonce); i++) nonce[i] = (byte)(i + 1); + for (i = 0; i < (word32)sizeof(aad); i++) aad[i] = (byte)(i + 2); + for (i = 0; i < (word32)sizeof(pt); i++) pt[i] = (byte)(i + 3); + + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, nonce, 12, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), 0); + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, nonce, 12, aad, sizeof(aad), + ct, sizeof(pt), dec, tag, 16), 0); + ExpectIntEQ(XMEMCMP(pt, dec, sizeof(pt)), 0); + + /* Corrupted tag -> authentication failure. */ + tag[0] ^= 0xff; + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, nonce, 12, aad, sizeof(aad), + ct, sizeof(pt), dec, tag, 16), WC_NO_ERR_TRACE(AES_GCM_AUTH_E)); + tag[0] ^= 0xff; + /* Corrupted AAD -> authentication failure. 
*/ + aad[0] ^= 0xff; + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, nonce, 12, aad, sizeof(aad), + ct, sizeof(pt), dec, tag, 16), WC_NO_ERR_TRACE(AES_GCM_AUTH_E)); + aad[0] ^= 0xff; + /* Corrupted ciphertext -> authentication failure. */ + ct[0] ^= 0xff; + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, nonce, 12, aad, sizeof(aad), + ct, sizeof(pt), dec, tag, 16), WC_NO_ERR_TRACE(AES_GCM_AUTH_E)); + ct[0] ^= 0xff; + + /* Edge cases: empty plaintext (tag only) and empty AAD. */ + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, nonce, 12, NULL, 0, NULL, 0, + NULL, tag, 16), 0); + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, nonce, 12, NULL, 0, NULL, 0, + NULL, tag, 16), 0); + +#ifdef WOLFSSL_AES_256 + /* AES-256 round trip. */ + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 32, nonce, 12, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), 0); + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 32, nonce, 12, aad, sizeof(aad), + ct, sizeof(pt), dec, tag, 16), 0); + ExpectIntEQ(XMEMCMP(pt, dec, sizeof(pt)), 0); +#endif + + /* --- Invalid parameters: encrypt --- */ + ExpectIntEQ(wc_AesGcmSivEncrypt(NULL, 16, nonce, 12, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, NULL, 12, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, nonce, 12, aad, sizeof(aad), + pt, sizeof(pt), ct, NULL, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + /* in/out NULL while inSz != 0 */ + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, nonce, 12, aad, sizeof(aad), + NULL, sizeof(pt), ct, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, nonce, 12, aad, sizeof(aad), + pt, sizeof(pt), NULL, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + /* aad NULL while aadSz != 0 */ + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, nonce, 12, NULL, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + /* invalid key sizes (only 16 and 32 allowed; 24/AES-192 is rejected) */ 
+ ExpectIntEQ(wc_AesGcmSivEncrypt(key, 0, nonce, 12, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 15, nonce, 12, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 24, nonce, 12, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 33, nonce, 12, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + /* invalid nonce sizes (must be exactly 12) */ + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, nonce, 0, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, nonce, 11, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, nonce, 13, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + /* invalid tag sizes (must be exactly 16) */ + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, nonce, 12, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 15), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivEncrypt(key, 16, nonce, 12, aad, sizeof(aad), + pt, sizeof(pt), ct, tag, 17), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + + /* --- Invalid parameters: decrypt --- */ + ExpectIntEQ(wc_AesGcmSivDecrypt(NULL, 16, nonce, 12, aad, sizeof(aad), + ct, sizeof(pt), dec, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, NULL, 12, aad, sizeof(aad), + ct, sizeof(pt), dec, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, nonce, 12, aad, sizeof(aad), + ct, sizeof(pt), dec, NULL, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, nonce, 12, aad, sizeof(aad), + NULL, sizeof(pt), dec, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, nonce, 12, aad, sizeof(aad), + ct, 
+ sizeof(pt), NULL, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, nonce, 12, NULL, sizeof(aad), + ct, sizeof(pt), dec, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 24, nonce, 12, aad, sizeof(aad), + ct, sizeof(pt), dec, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, nonce, 13, aad, sizeof(aad), + ct, sizeof(pt), dec, tag, 16), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesGcmSivDecrypt(key, 16, nonce, 12, aad, sizeof(aad), + ct, sizeof(pt), dec, tag, 17), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); +#endif /* WOLFSSL_AESGCM_SIV && !NO_AES && HAVE_AESGCM && WOLFSSL_AES_128 */ + return EXPECT_RESULT(); +} + /* * Non-standard (non-96-bit) nonce tests for AES-GCM. * diff --git a/tests/api/test_aes.h b/tests/api/test_aes.h index 73e4b715ac..db31edd242 100644 --- a/tests/api/test_aes.h +++ b/tests/api/test_aes.h @@ -51,6 +51,7 @@ int test_wc_AesGcmEncryptDecrypt_UnalignedBuffers(void); int test_wc_AesGcm_CrossCipher(void); int test_wc_AesGcmMixedEncDecLongIV(void); int test_wc_AesGcmNonStdNonce(void); +int test_wc_AesGcmSivEncryptDecrypt(void); int test_wc_AesGcmStream(void); int test_wc_AesGcmStream_MidStreamState(void); int test_wc_AesGcmStream_ReinitAfterFinal(void); @@ -149,6 +150,7 @@ int test_wc_CryptoCb_Tls13_Key_No_Zero_Without_Offload(void); TEST_DECL_GROUP("aes", test_wc_AesGcm_CrossCipher), \ TEST_DECL_GROUP("aes", test_wc_AesGcmMixedEncDecLongIV), \ TEST_DECL_GROUP("aes", test_wc_AesGcmNonStdNonce), \ + TEST_DECL_GROUP("aes", test_wc_AesGcmSivEncryptDecrypt), \ TEST_DECL_GROUP("aes", test_wc_AesGcmStream), \ TEST_DECL_GROUP("aes", test_wc_AesGcmStream_MidStreamState), \ TEST_DECL_GROUP("aes", test_wc_AesGcmStream_ReinitAfterFinal), \ diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index a58fd300a3..3a045ee422 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -829,6 +829,7 @@ static WC_INLINE 
void bench_append_memory_info(char* buffer, size_t size, #define BENCH_SM4_GCM 0x00100000 #define BENCH_SM4_CCM 0x00200000 #define BENCH_SM4 (BENCH_SM4_CBC | BENCH_SM4_GCM | BENCH_SM4_CCM) +#define BENCH_AESGCM_SIV 0x00800000 /* Digest algorithms. */ #define BENCH_MD5 0x00000001 #define BENCH_POLY1305 0x00000002 @@ -1066,6 +1067,9 @@ static const bench_alg bench_cipher_opt[] = { #ifdef WOLFSSL_AES_SIV { "-aes-siv", BENCH_AES_SIV }, #endif +#ifdef WOLFSSL_AESGCM_SIV + { "-aesgcm-siv", BENCH_AESGCM_SIV }, +#endif #ifdef HAVE_CAMELLIA { "-camellia", BENCH_CAMELLIA }, #endif @@ -4033,6 +4037,10 @@ static void* benchmarks_do(void* args) if (bench_all || (bench_cipher_algs & BENCH_AES_SIV)) bench_aessiv(); #endif +#ifdef WOLFSSL_AESGCM_SIV + if (bench_all || (bench_cipher_algs & BENCH_AESGCM_SIV)) + bench_aesgcmsiv(); +#endif #endif /* !NO_AES */ #ifdef HAVE_CAMELLIA @@ -6904,6 +6912,86 @@ void bench_aessiv(void) bench_aessiv_internal(bench_key, 64, "AES-512-SIV-enc", "AES-512-SIV-dec"); } #endif /* WOLFSSL_AES_SIV */ + +#ifdef WOLFSSL_AESGCM_SIV +static void bench_aesgcmsiv_internal(const byte* key, word32 keySz, const char* + encLabel, const char* decLabel) +{ + int i; + int ret = 0; + byte nonce[12]; + byte additional[AES_AUTH_ADD_SZ]; + byte tag[WC_AES_BLOCK_SIZE]; + int count = 0; + double start; + DECLARE_MULTI_VALUE_STATS_VARS() + + XMEMSET(nonce, 0, sizeof(nonce)); + XMEMSET(additional, 0, sizeof(additional)); + + bench_stats_prepare(); + + bench_stats_start(&count, &start); + do { + for (i = 0; i < numBlocks; i++) { + ret = wc_AesGcmSivEncrypt(key, keySz, nonce, sizeof(nonce), + additional, aesAuthAddSz, + bench_plain, bench_size, bench_cipher, + tag, sizeof(tag)); + if (ret != 0) { + printf("wc_AesGcmSivEncrypt failed (%d)\n", ret); + return; + } + RECORD_MULTI_VALUE_STATS(); + } + count += i; + } while (bench_stats_check(start) +#ifdef MULTI_VALUE_STATISTICS + || runs < minimum_runs +#endif + ); + + bench_stats_sym_finish(encLabel, 0, count, bench_size, start, 
ret); +#ifdef MULTI_VALUE_STATISTICS + bench_multi_value_stats(max, min, sum, squareSum, runs); +#endif + + RESET_MULTI_VALUE_STATS_VARS(); + + bench_stats_start(&count, &start); + do { + for (i = 0; i < numBlocks; i++) { + ret = wc_AesGcmSivDecrypt(key, keySz, nonce, sizeof(nonce), + additional, aesAuthAddSz, + bench_cipher, bench_size, bench_plain, + tag, sizeof(tag)); + if (ret != 0) { + printf("wc_AesGcmSivDecrypt failed (%d)\n", ret); + return; + } + RECORD_MULTI_VALUE_STATS(); + } + count += i; + } while (bench_stats_check(start) +#ifdef MULTI_VALUE_STATISTICS + || runs < minimum_runs +#endif + ); + + bench_stats_sym_finish(decLabel, 0, count, bench_size, start, ret); +#ifdef MULTI_VALUE_STATISTICS + bench_multi_value_stats(max, min, sum, squareSum, runs); +#endif +} + +void bench_aesgcmsiv(void) +{ + bench_aesgcmsiv_internal(bench_key, 16, "AES-128-GCM-SIV-enc", + "AES-128-GCM-SIV-dec"); + bench_aesgcmsiv_internal(bench_key, 32, "AES-256-GCM-SIV-enc", + "AES-256-GCM-SIV-dec"); +} +#endif /* WOLFSSL_AESGCM_SIV */ #endif /* !NO_AES */ diff --git a/wolfcrypt/benchmark/benchmark.h b/wolfcrypt/benchmark/benchmark.h index 067dd61749..272e657843 100644 --- a/wolfcrypt/benchmark/benchmark.h +++ b/wolfcrypt/benchmark/benchmark.h @@ -59,6 +59,7 @@ void bench_aesctr(int useDeviceID); void bench_aescfb(void); void bench_aesofb(void); void bench_aessiv(void); +void bench_aesgcmsiv(void); void bench_poly1305(void); void bench_camellia(void); void bench_sm4_cbc(void); diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 62b7d8ab84..d402385bed 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -51,6 +51,7 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits * WOLFSSL_AES_XTS: Enable AES-XTS mode default: off * WOLFSSL_AES_CTS: Enable AES-CTS (ciphertext stealing) default: off * WOLFSSL_AES_SIV: Enable AES-SIV (synthetic IV) mode default: off + * WOLFSSL_AESGCM_SIV: Enable AES-GCM-SIV (RFC 8452) mode default: off * 
WOLFSSL_AES_EAX: Enable AES-EAX AEAD mode default: off * WOLFSSL_CMAC: Enable AES-CMAC (RFC 4493) default: off * HAVE_AESCCM: Enable AES-CCM mode default: off @@ -1126,7 +1127,7 @@ static void Check_CPU_support_HwCrypto(Aes* aes) #endif /* __aarch64__ && !WOLFSSL_ARMASM_NO_HW_CRYPTO */ #if defined(WOLFSSL_AES_DIRECT) || defined(HAVE_AESCCM) || \ - defined(WOLFSSL_AESGCM_STREAM) + defined(WOLFSSL_AESGCM_STREAM) || defined(WOLFSSL_AESGCM_SIV) static WARN_UNUSED_RESULT int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock) { @@ -18027,6 +18028,1122 @@ int wc_AesSivDecrypt_ex(const byte* key, word32 keySz, const AesSivAssoc* assoc, #endif /* WOLFSSL_AES_SIV */ +#ifdef WOLFSSL_AESGCM_SIV + +/* AES-GCM-SIV - a nonce misuse-resistant AEAD. See RFC 8452. + * + * The implementation here is portable C. AES block operations reuse the + * internal wc_AesEncrypt(), so HAVE_AESGCM is required for that to be built. + */ +#ifndef HAVE_AESGCM + #error "WOLFSSL_AESGCM_SIV requires HAVE_AESGCM" +#endif + +#define AES_GCM_SIV_NONCE_SZ 12 +#define AES_GCM_SIV_TAG_SZ WC_AES_BLOCK_SIZE + +#ifndef GCM_SMALL +/* GF(2^128) reduction table used by the table-based software multiplies; not + * needed by the table-free GCM_SMALL variant. R[a] is the contribution, to the + * top two bytes, of reducing a nibble 'a' shifted out past x^127 (the GHASH + * polynomial x^128+x^7+x^2+x+1). Same table wolfSSL uses for table GHASH. */ +static const byte AES_GCM_SIV_R[16][2] = { + {0x00, 0x00}, {0x1c, 0x20}, {0x38, 0x40}, {0x24, 0x60}, + {0x70, 0x80}, {0x6c, 0xa0}, {0x48, 0xc0}, {0x54, 0xe0}, + {0xe1, 0x00}, {0xfd, 0x20}, {0xd9, 0x40}, {0xc5, 0x60}, + {0x91, 0x80}, {0x8d, 0xa0}, {0xa9, 0xc0}, {0xb5, 0xe0}, +}; +#endif + +/* Reverse the order of the 16 bytes of a block. in and out must not alias. 
*/ +static WC_INLINE void AesGcmSivByteReverse(byte* out, const byte* in) +{ +#if !defined(WOLFSSL_USE_ALIGN) && defined(WORD64_AVAILABLE) + /* Unaligned word access is permitted: reverse eight bytes at a time with a + * hardware byte-swap rather than one byte at a time. Endian independent - + * load native, reverse the value's bytes, store native: that reverses the + * bytes in memory on both little- and big-endian. */ + word64 lo, hi; + XMEMCPY(&lo, in, sizeof(lo)); + XMEMCPY(&hi, in + 8, sizeof(hi)); + lo = ByteReverseWord64(lo); + hi = ByteReverseWord64(hi); + XMEMCPY(out, &hi, sizeof(hi)); + XMEMCPY(out + 8, &lo, sizeof(lo)); +#else + int i; + for (i = 0; i < WC_AES_BLOCK_SIZE; i++) { + out[i] = in[WC_AES_BLOCK_SIZE - 1 - i]; + } +#endif +} + +/* Multiply a GF(2^128) element (GHASH bit order: the most-significant bit of + * byte 0 is the x^0 coefficient) by x: shift the 128-bit value right by one + * and reduce with the GHASH polynomial. Branch free, so constant time. Used by + * the GCM_SMALL and byte-table multiplies. */ +static WC_INLINE void AesGcmSivMulX(byte* x) +{ + int i; + byte carryIn = 0; + byte borrow = (byte)((0x00U - (x[WC_AES_BLOCK_SIZE - 1] & 0x01U)) & 0xE1U); + + for (i = 0; i < WC_AES_BLOCK_SIZE; i++) { + byte carryOut = (byte)((x[i] & 0x01) << 7); + x[i] = (byte)((x[i] >> 1) | carryIn); + carryIn = carryOut; + } + x[0] ^= borrow; +} + +/* POLYVAL state (RFC 8452 Section 3). POLYVAL is GHASH on byte-reversed inputs, + * so the field is the GHASH field with the most-significant bit of byte 0 the + * x^0 coefficient (see RFC 8452 Appendix A). The key is one of: + * - GCM_SMALL: the 16-byte key (table-free, smallest footprint). + * - word64: a Shoup 4-bit table (256 bytes), word64 multiply - used when a + * 64-bit type is available and GCM_WORD32 is not requested. + * - word32: the same 4-bit table, word32 multiply - used for GCM_WORD32 or + * when no 64-bit type is available. 
+ * + * Every variant reads the message, key and running sum a byte at a time and + * (the word64/word32 variants) load/store their words with explicit shifts or + * a byte-swap rather than casting buffers, so all are independent of platform + * endianness; the word loads also respect WOLFSSL_USE_ALIGN, so input, key and + * output buffers may be little- or big-endian and aligned or unaligned. + */ +/* When the generated x86_64 AES-NI/PCLMUL POLYVAL multiply is available + * (aes_gcm_asm.S), the per-block multiply can be offloaded to it at runtime. + * This is the generated external assembly - no assembly lives in this file. */ +#if defined(WOLFSSL_AESNI) && defined(WOLFSSL_X86_64_BUILD) + #define WC_POLYVAL_ASM +#ifdef __cplusplus + extern "C" { +#endif + /* s += POLYVAL of 'blocks' 16-byte blocks of data, hash key h prepared as + * the byte-reversed mulX_GHASH(ByteReverse(authKey)); s is POLYVAL byte + * order. */ + void AES_GCMSIV_polyval_aesni(unsigned char* s, const unsigned char* h, + const unsigned char* data, word32 blocks) + XASM_LINK("AES_GCMSIV_polyval_aesni"); +#ifdef HAVE_INTEL_AVX1 + void AES_GCMSIV_polyval_avx1(unsigned char* s, const unsigned char* h, + const unsigned char* data, word32 blocks) + XASM_LINK("AES_GCMSIV_polyval_avx1"); +#endif +#ifdef HAVE_INTEL_VAES + /* Aggregated 2-blocks-per-ymm POLYVAL (VPCLMULQDQ). */ + void AES_GCMSIV_polyval_vaes(unsigned char* s, const unsigned char* h, + const unsigned char* data, word32 blocks) + XASM_LINK("AES_GCMSIV_polyval_vaes"); +#endif +#ifdef HAVE_INTEL_AVX512 + /* Aggregated 4-blocks-per-zmm POLYVAL (VPCLMULQDQ). */ + void AES_GCMSIV_polyval_avx512(unsigned char* s, const unsigned char* h, + const unsigned char* data, word32 blocks) + XASM_LINK("AES_GCMSIV_polyval_avx512"); +#endif + /* AES-GCM-SIV CTR keystream (RFC 8452): a 32-bit little-endian counter in + * the first 4 bytes of the block (mod 2^32, no carry), block used directly + * as the AES input. 
Encrypts the full-16-byte-block portion of 'length' + * bytes (pipelined), advancing and writing 'ctr' back. */ + #define WC_GCMSIV_CTR_ASM + void AES_GCMSIV_ctr_aesni(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) XASM_LINK("AES_GCMSIV_ctr_aesni"); +#ifdef HAVE_INTEL_AVX1 + void AES_GCMSIV_ctr_avx1(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) XASM_LINK("AES_GCMSIV_ctr_avx1"); +#endif +#ifdef HAVE_INTEL_VAES + void AES_GCMSIV_ctr_vaes(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) XASM_LINK("AES_GCMSIV_ctr_vaes"); +#endif +#ifdef HAVE_INTEL_AVX512 + void AES_GCMSIV_ctr_avx512(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) XASM_LINK("AES_GCMSIV_ctr_avx512"); +#endif +#ifdef __cplusplus + } +#endif +#elif defined(WOLFSSL_ARMASM) && defined(__aarch64__) + /* The generated AArch64 POLYVAL multiplies (armv8-aes-asm.S) offload the + * per-block multiply: PMULL when the CPU has the crypto extension, else the + * 8-bit-pmul NEON variant, else the scalar (base) variant. This is the + * generated external assembly - no assembly lives here. */ + #define WC_POLYVAL_ASM + #define WC_POLYVAL_ASM_AARCH64 + /* The base (scalar) variant multiplies through the word64 software table + * poly->m, so it is only available when that table is built. */ + #if defined(WORD64_AVAILABLE) && !defined(GCM_WORD32) && \ + !defined(GCM_SMALL) && !defined(WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP) + #define WC_POLYVAL_ASM_AARCH64_BASE + #endif +#ifdef __cplusplus + extern "C" { +#endif + /* s += POLYVAL of 'blocks' 16-byte blocks of data. For the PMULL and NEON + * variants h is the prepared key (byte-reversed mulX_GHASH(ByteReverse( + * authKey))); for the base variant h is the word64 table poly->m. 
s is in + * POLYVAL byte order in every case. */ +#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + void AES_GCMSIV_polyval_pmull(unsigned char* s, const unsigned char* h, + const unsigned char* data, word32 blocks) + XASM_LINK("AES_GCMSIV_polyval_pmull"); +#endif +#ifndef WOLFSSL_ARMASM_NO_NEON + void AES_GCMSIV_polyval_neon(unsigned char* s, const unsigned char* h, + const unsigned char* data, word32 blocks) + XASM_LINK("AES_GCMSIV_polyval_neon"); +#endif +#ifdef WC_POLYVAL_ASM_AARCH64_BASE + void AES_GCMSIV_polyval_base(unsigned char* s, const unsigned char* h, + const unsigned char* data, word32 blocks) + XASM_LINK("AES_GCMSIV_polyval_base"); +#endif + /* AES-GCM-SIV CTR keystream (RFC 8452): 32-bit little-endian counter in the + * first 4 bytes of the block, mod 2^32, block used directly. Full-block + * portion only (the C tail finishes any partial block). The crypto variant + * pipelines aese; the NEON/base variants pipeline software table AES. KS is + * the AES key schedule in every case. */ + #define WC_GCMSIV_CTR_ASM + #define WC_GCMSIV_CTR_ASM_AARCH64 +#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + void AES_GCMSIV_ctr_aarch64(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) XASM_LINK("AES_GCMSIV_ctr_aarch64"); +#endif +#ifndef WOLFSSL_ARMASM_NO_NEON + void AES_GCMSIV_ctr_neon(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) XASM_LINK("AES_GCMSIV_ctr_neon"); +#endif +#ifndef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + void AES_GCMSIV_ctr_base(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) XASM_LINK("AES_GCMSIV_ctr_base"); +#endif +#ifdef __cplusplus + } +#endif +#elif defined(WOLFSSL_ARMASM) && !defined(__aarch64__) && \ + !defined(WOLFSSL_ARMASM_THUMB2) + /* AArch32 (32-bit ARM). 
The generated armv8-32-aes-asm.S provides POLYVAL + * and CTR for the crypto (vmull.p64 / aese) and base (table) variants. + * crypto-vs-base is compile-time (WOLFSSL_ARMASM_NO_HW_CRYPTO) with no + * runtime fallback - the same as the rest of the AArch32 AES. */ + #define WC_POLYVAL_ASM + #define WC_POLYVAL_ASM_AARCH32 + #define WC_GCMSIV_CTR_ASM + #define WC_GCMSIV_CTR_ASM_AARCH32 + /* The base POLYVAL multiplies through the word64 software table poly->m. It + * is only compiled in the base (no-crypto) build and only when that table + * is built. */ + #if defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) && defined(WORD64_AVAILABLE) && \ + !defined(GCM_WORD32) && !defined(GCM_SMALL) + #define WC_POLYVAL_ASM_AARCH32_BASE + #endif +#ifdef __cplusplus + extern "C" { +#endif +#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + void AES_GCMSIV_polyval_crypto(unsigned char* s, const unsigned char* h, + const unsigned char* data, word32 blocks) + XASM_LINK("AES_GCMSIV_polyval_crypto"); + void AES_GCMSIV_ctr_crypto(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) XASM_LINK("AES_GCMSIV_ctr_crypto"); +#else +#ifdef WC_POLYVAL_ASM_AARCH32_BASE + void AES_GCMSIV_polyval_base(unsigned char* s, const unsigned char* h, + const unsigned char* data, word32 blocks) + XASM_LINK("AES_GCMSIV_polyval_base"); +#endif + void AES_GCMSIV_ctr_base(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) XASM_LINK("AES_GCMSIV_ctr_base"); +#endif +#ifdef __cplusplus + } +#endif +#elif defined(WOLFSSL_ARMASM) && defined(WOLFSSL_ARMASM_THUMB2) + /* Thumb-2 (32-bit ARM, Thumb-2 encoding). A single table-based variant + * (ported from the AArch32 base): POLYVAL multiplies through the word64 + * software table poly->m; CTR is the table AES with the SIV counter. 
*/
+    #define WC_GCMSIV_CTR_ASM
+    #define WC_GCMSIV_CTR_ASM_THUMB2
+    /* The POLYVAL assembly is only enabled in the configuration where the
+     * context below carries the word64 table m[16][2]. */
+    #if defined(WORD64_AVAILABLE) && !defined(GCM_WORD32) && !defined(GCM_SMALL)
+        #define WC_POLYVAL_ASM
+        #define WC_POLYVAL_ASM_THUMB2
+    #endif
+#ifdef __cplusplus
+    extern "C" {
+#endif
+#ifdef WC_POLYVAL_ASM_THUMB2
+    void AES_GCMSIV_polyval_thumb2(unsigned char* s, const unsigned char* h,
+        const unsigned char* data, word32 blocks)
+        XASM_LINK("AES_GCMSIV_polyval_thumb2");
+#endif
+    void AES_GCMSIV_ctr_thumb2(const unsigned char* in, unsigned char* out,
+        unsigned long length, const unsigned char* KS, int nr,
+        unsigned char* ctr) XASM_LINK("AES_GCMSIV_ctr_thumb2");
+#ifdef __cplusplus
+    }
+#endif
+#endif
+
+/* Function-pointer type matching the prototype shared by the POLYVAL
+ * assembly routines: (s, h, data, blocks). */
+#ifdef WC_POLYVAL_ASM
+    typedef void (*AesGcmSivPolyvalFn)(unsigned char* s, const unsigned char* h,
+        const unsigned char* data, word32 blocks);
+#endif
+/* Function-pointer type matching the prototype shared by the CTR keystream
+ * assembly routines: (in, out, length, KS, nr, ctr). */
+#ifdef WC_GCMSIV_CTR_ASM
+    typedef void (*AesGcmSivCtrFn)(const unsigned char* in, unsigned char* out,
+        unsigned long length, const unsigned char* KS, int nr,
+        unsigned char* ctr);
+#endif
+
+/* State of one POLYVAL computation: the hash key in the form the selected
+ * multiply consumes, plus the running sum. Exactly one of h / m[16][2] /
+ * m[16][4] exists, chosen by GCM_SMALL, WORD64_AVAILABLE and GCM_WORD32. */
+typedef struct AesGcmSivPolyval {
+#ifdef WC_POLYVAL_ASM
+    byte hHw[WC_AES_BLOCK_SIZE]; /* prepared key for the asm multiply */
+    const byte* asmKey; /* key passed to fn: hHw, or the table below */
+    AesGcmSivPolyvalFn fn; /* asm multiply, or NULL for software */
+#endif
+#if defined(GCM_SMALL)
+    byte h[WC_AES_BLOCK_SIZE]; /* hash key = mulX_GHASH(ByteReverse(H)) */
+#elif defined(WORD64_AVAILABLE) && !defined(GCM_WORD32)
+    word64 m[16][2]; /* m[i] = i * mulX_GHASH(ByteReverse(H)) */
+#else
+    word32 m[16][4]; /* m[i] = i * mulX_GHASH(ByteReverse(H)) */
+#endif
+    byte s[WC_AES_BLOCK_SIZE]; /* running sum, GHASH representation */
+} AesGcmSivPolyval;
+
+#if defined(GCM_SMALL)
+
+/* s = s * h with no precomputed table: decompose h bit-by-bit and accumulate
+ * shifted copies of s. Mirrors wolfSSL's GCM_SMALL GMULT.
*/
+static void AesGcmSivGMult(AesGcmSivPolyval* poly)
+{
+    byte acc[WC_AES_BLOCK_SIZE];    /* product accumulated so far */
+    byte term[WC_AES_BLOCK_SIZE];   /* s times x^k for the current key bit */
+    int idx;
+
+    XMEMSET(acc, 0, sizeof(acc));
+    XMEMCPY(term, poly->s, WC_AES_BLOCK_SIZE);
+
+    for (idx = 0; idx < WC_AES_BLOCK_SIZE; idx++) {
+        byte mask;
+
+        /* Walk this key byte from its top bit down to its low bit. */
+        for (mask = 0x80; mask != 0; mask = (byte)(mask >> 1)) {
+            if ((poly->h[idx] & mask) != 0) {
+                xorbuf(acc, term, WC_AES_BLOCK_SIZE);
+            }
+            AesGcmSivMulX(term);
+        }
+    }
+
+    XMEMCPY(poly->s, acc, WC_AES_BLOCK_SIZE);
+}
+
+/* GCM_SMALL keeps no table: record the hash key
+ * mulX_GHASH(ByteReverse(h)) in poly->h and start from a zero sum. */
+static void AesGcmSivPolyvalInitSw(AesGcmSivPolyval* poly, const byte* h)
+{
+    byte* hashKey = poly->h;
+
+    AesGcmSivByteReverse(hashKey, h);
+    AesGcmSivMulX(hashKey);
+    XMEMSET(poly->s, 0, sizeof(poly->s));
+}
+
+#elif defined(WORD64_AVAILABLE) && !defined(GCM_WORD32)
+
+/* Load/store a big-endian word64 - the high word is bytes 0..7 of the block,
+ * so byte 0 (the x^0..x^7 coefficients) is the most-significant byte.
+ *
+ * Where unaligned word access is permitted (!WOLFSSL_USE_ALIGN) this is a
+ * single word64 load/store plus a hardware byte-swap on little-endian; where
+ * alignment is required it is assembled a byte at a time. Both forms are
+ * endian independent.
*/ +#ifndef WOLFSSL_USE_ALIGN +static WC_INLINE word64 AesGcmSivLoad64(const byte* b) +{ + word64 v; + XMEMCPY(&v, b, sizeof(v)); +#ifdef LITTLE_ENDIAN_ORDER + v = ByteReverseWord64(v); +#endif + return v; +} +static WC_INLINE void AesGcmSivStore64(byte* b, word64 v) +{ +#ifdef LITTLE_ENDIAN_ORDER + v = ByteReverseWord64(v); +#endif + XMEMCPY(b, &v, sizeof(v)); +} +#else +static WC_INLINE word64 AesGcmSivLoad64(const byte* b) +{ + return ((word64)b[0] << 56) | ((word64)b[1] << 48) | + ((word64)b[2] << 40) | ((word64)b[3] << 32) | + ((word64)b[4] << 24) | ((word64)b[5] << 16) | + ((word64)b[6] << 8) | ((word64)b[7]); +} +static WC_INLINE void AesGcmSivStore64(byte* b, word64 v) +{ + b[0] = (byte)(v >> 56); b[1] = (byte)(v >> 48); + b[2] = (byte)(v >> 40); b[3] = (byte)(v >> 32); + b[4] = (byte)(v >> 24); b[5] = (byte)(v >> 16); + b[6] = (byte)(v >> 8); b[7] = (byte)(v); +} +#endif + +/* Multiply the 128-bit value (hi,lo) by x and reduce: a right shift by one of + * the whole value, XOR-ing the reduction polynomial (0xe1 into byte 0) when a + * one is shifted out past x^127 (the low bit of lo). */ +static WC_INLINE void AesGcmSivMulX64(word64* hi, word64* lo) +{ + word64 carry = *lo & 1; + *lo = (*lo >> 1) | (*hi << 63); + *hi = (*hi >> 1) ^ (W64LIT(0xe100000000000000) & (word64)(0 - carry)); +} + +/* s = s * H. The accumulator is shifted right a nibble at a time; the nibble + * that falls off is reduced through AES_GCM_SIV_R into the top two bytes. 
*/ +static void AesGcmSivGMult(AesGcmSivPolyval* poly) +{ + byte* x = poly->s; + word64 (*m)[2] = poly->m; + word64 zHi = 0, zLo = 0; + int i; + + for (i = WC_AES_BLOCK_SIZE - 1; i >= 0; i--) { + byte xi = x[i]; + byte a; + + /* low nibble */ + zHi ^= m[xi & 0xf][0]; + zLo ^= m[xi & 0xf][1]; + a = (byte)(zLo & 0xf); + zLo = (zLo >> 4) | (zHi << 60); + zHi = zHi >> 4; + zHi ^= ((word64)AES_GCM_SIV_R[a][0] << 56) | + ((word64)AES_GCM_SIV_R[a][1] << 48); + + /* high nibble */ + zHi ^= m[xi >> 4][0]; + zLo ^= m[xi >> 4][1]; + if (i == 0) { + break; + } + a = (byte)(zLo & 0xf); + zLo = (zLo >> 4) | (zHi << 60); + zHi = zHi >> 4; + zHi ^= ((word64)AES_GCM_SIV_R[a][0] << 56) | + ((word64)AES_GCM_SIV_R[a][1] << 48); + } + + AesGcmSivStore64(x, zHi); + AesGcmSivStore64(x + 8, zLo); +} + +/* Build the 4-bit table for mulX_GHASH(ByteReverse(h)). */ +static void AesGcmSivPolyvalInitSw(AesGcmSivPolyval* poly, const byte* h) +{ + byte hrev[WC_AES_BLOCK_SIZE]; + word64 (*m)[2] = poly->m; + int i; + + /* m[8] = 1 * H = mulX_GHASH(ByteReverse(h)); successive halvings give the + * power-of-two nibble entries. */ + AesGcmSivByteReverse(hrev, h); + m[0x8][0] = AesGcmSivLoad64(hrev); + m[0x8][1] = AesGcmSivLoad64(hrev + 8); + AesGcmSivMulX64(&m[0x8][0], &m[0x8][1]); + m[0x4][0] = m[0x8][0]; m[0x4][1] = m[0x8][1]; AesGcmSivMulX64(&m[0x4][0], &m[0x4][1]); + m[0x2][0] = m[0x4][0]; m[0x2][1] = m[0x4][1]; AesGcmSivMulX64(&m[0x2][0], &m[0x2][1]); + m[0x1][0] = m[0x2][0]; m[0x1][1] = m[0x2][1]; AesGcmSivMulX64(&m[0x1][0], &m[0x1][1]); + + /* The rest are sums of those basis entries (i = high bit + remainder). 
*/ + m[0x0][0] = 0; m[0x0][1] = 0; + for (i = 0; i < 16; i++) { + static const byte hibit[16] = + { 0, 0, 0, 2, 0, 4, 4, 4, 0, 8, 8, 8, 8, 8, 8, 8 }; + int top = hibit[i]; + if (top != 0) { + m[i][0] = m[top][0] ^ m[i - top][0]; + m[i][1] = m[top][1] ^ m[i - top][1]; + } + } + + XMEMSET(poly->s, 0, sizeof(poly->s)); +} + +#else /* word32: GCM_WORD32 or no 64-bit type */ + +/* Load/store a big-endian word32 - byte 0 is the most-significant byte. Same + * aligned/unaligned split as AesGcmSivLoad64/Store64; both forms are endian + * independent. */ +#ifndef WOLFSSL_USE_ALIGN +static WC_INLINE word32 AesGcmSivLoad32(const byte* b) +{ + word32 v; + XMEMCPY(&v, b, sizeof(v)); +#ifdef LITTLE_ENDIAN_ORDER + v = ByteReverseWord32(v); +#endif + return v; +} +static WC_INLINE void AesGcmSivStore32(byte* b, word32 v) +{ +#ifdef LITTLE_ENDIAN_ORDER + v = ByteReverseWord32(v); +#endif + XMEMCPY(b, &v, sizeof(v)); +} +#else +static WC_INLINE word32 AesGcmSivLoad32(const byte* b) +{ + return ((word32)b[0] << 24) | ((word32)b[1] << 16) | + ((word32)b[2] << 8) | ((word32)b[3]); +} +static WC_INLINE void AesGcmSivStore32(byte* b, word32 v) +{ + b[0] = (byte)(v >> 24); b[1] = (byte)(v >> 16); + b[2] = (byte)(v >> 8); b[3] = (byte)(v); +} +#endif + +/* Multiply the 128-bit value (z[0] most significant) by x and reduce: shift + * the whole value right by one, XOR-ing 0xe1 into byte 0 when a one is shifted + * out past x^127 (the low bit of z[3]). */ +static WC_INLINE void AesGcmSivMulX32(word32* z) +{ + word32 carry = z[3] & 1; + z[3] = (z[3] >> 1) | (z[2] << 31); + z[2] = (z[2] >> 1) | (z[1] << 31); + z[1] = (z[1] >> 1) | (z[0] << 31); + z[0] = (z[0] >> 1) ^ (0xe1000000U & (word32)(0 - carry)); +} + +/* s = s * H. The accumulator is shifted right a nibble at a time; the nibble + * that falls off is reduced through AES_GCM_SIV_R into the top two bytes. 
*/ +static void AesGcmSivGMult(AesGcmSivPolyval* poly) +{ + byte* x = poly->s; + word32 (*m)[4] = poly->m; + word32 z0 = 0, z1 = 0, z2 = 0, z3 = 0; + int i; + + for (i = WC_AES_BLOCK_SIZE - 1; i >= 0; i--) { + word32* mr; + byte xi = x[i]; + byte a; + + /* low nibble */ + mr = m[xi & 0xf]; + z0 ^= mr[0]; z1 ^= mr[1]; z2 ^= mr[2]; z3 ^= mr[3]; + a = (byte)(z3 & 0xf); + z3 = (z3 >> 4) | (z2 << 28); + z2 = (z2 >> 4) | (z1 << 28); + z1 = (z1 >> 4) | (z0 << 28); + z0 = z0 >> 4; + z0 ^= ((word32)AES_GCM_SIV_R[a][0] << 24) | + ((word32)AES_GCM_SIV_R[a][1] << 16); + + /* high nibble */ + mr = m[xi >> 4]; + z0 ^= mr[0]; z1 ^= mr[1]; z2 ^= mr[2]; z3 ^= mr[3]; + if (i == 0) { + break; + } + a = (byte)(z3 & 0xf); + z3 = (z3 >> 4) | (z2 << 28); + z2 = (z2 >> 4) | (z1 << 28); + z1 = (z1 >> 4) | (z0 << 28); + z0 = z0 >> 4; + z0 ^= ((word32)AES_GCM_SIV_R[a][0] << 24) | + ((word32)AES_GCM_SIV_R[a][1] << 16); + } + + AesGcmSivStore32(x, z0); + AesGcmSivStore32(x + 4, z1); + AesGcmSivStore32(x + 8, z2); + AesGcmSivStore32(x + 12, z3); +} + +/* Build the 4-bit table for mulX_GHASH(ByteReverse(h)). */ +static void AesGcmSivPolyvalInitSw(AesGcmSivPolyval* poly, const byte* h) +{ + byte hrev[WC_AES_BLOCK_SIZE]; + word32 (*m)[4] = poly->m; + int i; + + /* m[8] = 1 * H = mulX_GHASH(ByteReverse(h)); successive halvings give the + * power-of-two nibble entries. */ + AesGcmSivByteReverse(hrev, h); + m[0x8][0] = AesGcmSivLoad32(hrev); + m[0x8][1] = AesGcmSivLoad32(hrev + 4); + m[0x8][2] = AesGcmSivLoad32(hrev + 8); + m[0x8][3] = AesGcmSivLoad32(hrev + 12); + AesGcmSivMulX32(m[0x8]); + XMEMCPY(m[0x4], m[0x8], sizeof(m[0x4])); AesGcmSivMulX32(m[0x4]); + XMEMCPY(m[0x2], m[0x4], sizeof(m[0x2])); AesGcmSivMulX32(m[0x2]); + XMEMCPY(m[0x1], m[0x2], sizeof(m[0x1])); AesGcmSivMulX32(m[0x1]); + + /* The rest are sums of those basis entries (i = high bit + remainder). 
*/ + m[0x0][0] = 0; m[0x0][1] = 0; m[0x0][2] = 0; m[0x0][3] = 0; + for (i = 0; i < 16; i++) { + static const byte hibit[16] = + { 0, 0, 0, 2, 0, 4, 4, 4, 0, 8, 8, 8, 8, 8, 8, 8 }; + int top = hibit[i]; + if (top != 0) { + m[i][0] = m[top][0] ^ m[i - top][0]; + m[i][1] = m[top][1] ^ m[i - top][1]; + m[i][2] = m[top][2] ^ m[i - top][2]; + m[i][3] = m[top][3] ^ m[i - top][3]; + } + } + + XMEMSET(poly->s, 0, sizeof(poly->s)); +} + +#endif /* POLYVAL multiply variant */ + +#ifdef WC_POLYVAL_ASM_THUMB2 +/* Thumb-2: the single table POLYVAL variant. */ +static AesGcmSivPolyvalFn AesGcmSivPolyvalAsm(void) +{ + return &AES_GCMSIV_polyval_thumb2; +} +#elif defined(WC_POLYVAL_ASM_AARCH32) +/* AArch32: crypto (vmull.p64) or base (table) POLYVAL, chosen at compile time. */ +static AesGcmSivPolyvalFn AesGcmSivPolyvalAsm(void) +{ +#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + return &AES_GCMSIV_polyval_crypto; +#elif defined(WC_POLYVAL_ASM_AARCH32_BASE) + return &AES_GCMSIV_polyval_base; +#else + return NULL; +#endif +} +#elif defined(WC_POLYVAL_ASM_AARCH64) +/* Select the best available generated POLYVAL multiply: PMULL when the CPU has + * the crypto extension, else the 8-bit-pmul NEON variant, else the scalar base + * variant, else NULL to fall back to software. */ +static AesGcmSivPolyvalFn AesGcmSivPolyvalAsm(void) +{ +#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + cpuid_get_flags_ex(&cpuid_flags); + if (IS_AARCH64_PMULL(cpuid_flags)) { + return &AES_GCMSIV_polyval_pmull; + } +#endif +#ifndef WOLFSSL_ARMASM_NO_NEON + return &AES_GCMSIV_polyval_neon; +#elif defined(WC_POLYVAL_ASM_AARCH64_BASE) + return &AES_GCMSIV_polyval_base; +#else + return NULL; +#endif +} +#elif defined(WC_POLYVAL_ASM) +/* Select the best available generated POLYVAL multiply for this CPU, or NULL + * to fall back to software. PCLMUL is present on every AES-NI capable CPU, so + * AES-NI gates the base path (matching wolfSSL's AES-GCM). 
*/
+static AesGcmSivPolyvalFn AesGcmSivPolyvalAsm(void)
+{
+    cpuid_get_flags_ex(&intel_flags);
+    /* Without AES-NI none of the assembly variants is used. */
+    if (!IS_INTEL_AESNI(intel_flags)) {
+        return NULL;
+    }
+    /* Candidates are tested AVX512, then VAES, then AVX1; the first whose
+     * CPU flags are all present is returned. */
+#ifdef HAVE_INTEL_AVX512
+    if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+        return &AES_GCMSIV_polyval_avx512;
+    }
+#endif
+#ifdef HAVE_INTEL_VAES
+    if (IS_INTEL_VAES(intel_flags) && IS_INTEL_AVX2(intel_flags)) {
+        return &AES_GCMSIV_polyval_vaes;
+    }
+#endif
+#ifdef HAVE_INTEL_AVX1
+    if (IS_INTEL_AVX1(intel_flags)) {
+        return &AES_GCMSIV_polyval_avx1;
+    }
+#endif
+    return &AES_GCMSIV_polyval_aesni;
+}
+#endif
+
+#ifdef WC_GCMSIV_CTR_ASM_THUMB2
+/* Thumb-2: the single table CTR variant. */
+static AesGcmSivCtrFn AesGcmSivCtrAsm(void)
+{
+    return &AES_GCMSIV_ctr_thumb2;
+}
+#elif defined(WC_GCMSIV_CTR_ASM_AARCH32)
+/* AArch32: crypto (aese) or base (table) CTR, chosen at compile time. */
+static AesGcmSivCtrFn AesGcmSivCtrAsm(void)
+{
+#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO
+    return &AES_GCMSIV_ctr_crypto;
+#else
+    return &AES_GCMSIV_ctr_base;
+#endif
+}
+#elif defined(WC_GCMSIV_CTR_ASM_AARCH64)
+/* Select the best generated CTR keystream: pipelined aese when the CPU has the
+ * AES extension, else the NEON or base software-table variant, else NULL. */
+static AesGcmSivCtrFn AesGcmSivCtrAsm(void)
+{
+#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO
+    /* Only the aese variant is gated on a runtime CPU flag. */
+    cpuid_get_flags_ex(&cpuid_flags);
+    if (IS_AARCH64_AES(cpuid_flags)) {
+        return &AES_GCMSIV_ctr_aarch64;
+    }
+#endif
+    /* The remaining choice is fixed at compile time. */
+#ifndef WOLFSSL_ARMASM_NO_NEON
+    return &AES_GCMSIV_ctr_neon;
+#elif !defined(WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP)
+    return &AES_GCMSIV_ctr_base;
+#else
+    return NULL;
+#endif
+}
+#elif defined(WC_GCMSIV_CTR_ASM)
+/* Select the best generated AES-GCM-SIV CTR keystream for this CPU. AES-NI is
+ * the base; AVX1/VAES/AVX512 are progressively wider pipelines.
*/ +static AesGcmSivCtrFn AesGcmSivCtrAsm(void) +{ + cpuid_get_flags_ex(&intel_flags); + if (!IS_INTEL_AESNI(intel_flags)) { + return NULL; + } +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + return &AES_GCMSIV_ctr_avx512; + } +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_VAES(intel_flags) && IS_INTEL_AVX2(intel_flags)) { + return &AES_GCMSIV_ctr_vaes; + } +#endif +#ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + return &AES_GCMSIV_ctr_avx1; + } +#endif + return &AES_GCMSIV_ctr_aesni; +} +#endif + +/* Initialize POLYVAL with the 16-byte hash key h, using the generated assembly + * multiply when the CPU supports it and a software variant otherwise. */ +static void AesGcmSivPolyvalInit(AesGcmSivPolyval* poly, const byte* h) +{ +#ifdef WC_POLYVAL_ASM + AesGcmSivPolyvalFn fn = AesGcmSivPolyvalAsm(); + if (fn != NULL) { +#if defined(WC_POLYVAL_ASM_AARCH64_BASE) || defined(WC_POLYVAL_ASM_AARCH32_BASE) + if (fn == &AES_GCMSIV_polyval_base) { + /* The scalar variant multiplies through the word64 software table, + * so build it and point the asm at it. */ + AesGcmSivPolyvalInitSw(poly, h); + poly->asmKey = (const byte*)poly->m; + poly->fn = fn; + return; + } +#endif +#ifdef WC_POLYVAL_ASM_THUMB2 + if (fn == &AES_GCMSIV_polyval_thumb2) { + /* Table variant: build the word64 software table and point at it. */ + AesGcmSivPolyvalInitSw(poly, h); + poly->asmKey = (const byte*)poly->m; + poly->fn = fn; + return; + } +#endif + { + byte t[WC_AES_BLOCK_SIZE]; + /* Prepare the hash key for the asm: byte-reversed + * mulX_GHASH(ByteReverse(h)). */ + AesGcmSivByteReverse(t, h); + AesGcmSivMulX(t); + AesGcmSivByteReverse(poly->hHw, t); + XMEMSET(poly->s, 0, sizeof(poly->s)); + poly->asmKey = poly->hHw; + poly->fn = fn; + } + return; + } + poly->fn = NULL; +#endif + AesGcmSivPolyvalInitSw(poly, h); +} + +/* Add data to the POLYVAL sum. 
A trailing partial block is zero-padded to a + * full block, which is exactly the padding RFC 8452 applies to the AAD and + * the plaintext independently. */ +static void AesGcmSivPolyvalUpdate(AesGcmSivPolyval* poly, const byte* data, + word32 sz) +{ + byte block[WC_AES_BLOCK_SIZE]; + byte rev[WC_AES_BLOCK_SIZE]; + int k; + +#ifdef WC_POLYVAL_ASM + if (poly->fn != NULL) { + word32 blocks = sz / WC_AES_BLOCK_SIZE; + word32 partial = sz % WC_AES_BLOCK_SIZE; + if (blocks > 0) { + poly->fn(poly->s, poly->asmKey, data, blocks); + data += blocks * WC_AES_BLOCK_SIZE; + } + if (partial > 0) { + XMEMSET(block, 0, sizeof(block)); + XMEMCPY(block, data, partial); + poly->fn(poly->s, poly->asmKey, block, 1); + } + return; + } +#endif + while (sz >= WC_AES_BLOCK_SIZE) { + AesGcmSivByteReverse(rev, data); + for (k = 0; k < WC_AES_BLOCK_SIZE; k++) { + poly->s[k] ^= rev[k]; + } + AesGcmSivGMult(poly); + data += WC_AES_BLOCK_SIZE; + sz -= WC_AES_BLOCK_SIZE; + } + if (sz > 0) { + XMEMSET(block, 0, sizeof(block)); + XMEMCPY(block, data, sz); + AesGcmSivByteReverse(rev, block); + for (k = 0; k < WC_AES_BLOCK_SIZE; k++) { + poly->s[k] ^= rev[k]; + } + AesGcmSivGMult(poly); + } +} + +/* Output the 16-byte POLYVAL result and wipe the key material and state. */ +static void AesGcmSivPolyvalFinal(AesGcmSivPolyval* poly, byte* out) +{ + AesGcmSivByteReverse(out, poly->s); + ForceZero(poly, sizeof(*poly)); +} + +/* Derive the message-authentication-key and message-encryption-key from the + * key-generating-key (loaded into kgk) and the nonce. See RFC 8452 Section 4. + * + * authKey is 16 bytes; encKey is keySz bytes (16 or 32). */ +static WARN_UNUSED_RESULT int AesGcmSivDeriveKeys(Aes* kgk, const byte* nonce, + word32 keySz, byte* authKey, byte* encKey) +{ + byte block[WC_AES_BLOCK_SIZE]; + byte out[WC_AES_BLOCK_SIZE]; + word32 ctr; + word32 encBlocks = keySz / 8; /* 2 for AES-128, 4 for AES-256 */ + int ret = 0; + + /* Each derivation block is: LE32(counter) || nonce(12 bytes). 
The low 8 + * bytes of each AES output are concatenated to form the derived keys. */ + XMEMCPY(block + 4, nonce, AES_GCM_SIV_NONCE_SZ); + + for (ctr = 0; ctr < 2; ctr++) { + block[0] = (byte)ctr; + block[1] = 0; block[2] = 0; block[3] = 0; + ret = wc_AesEncrypt(kgk, block, out); + if (ret != 0) + break; + XMEMCPY(authKey + ctr * 8, out, 8); + } + + for (ctr = 0; (ret == 0) && (ctr < encBlocks); ctr++) { + block[0] = (byte)(ctr + 2); + block[1] = 0; block[2] = 0; block[3] = 0; + ret = wc_AesEncrypt(kgk, block, out); + if (ret != 0) + break; + XMEMCPY(encKey + ctr * 8, out, 8); + } + + ForceZero(block, sizeof(block)); + ForceZero(out, sizeof(out)); + + return ret; +} + +/* Compute the AES-GCM-SIV tag over the AAD and plaintext. enc holds the + * message-encryption-key. See RFC 8452 Section 4. */ +static WARN_UNUSED_RESULT int AesGcmSivCalcTag(Aes* enc, const byte* authKey, + const byte* nonce, const byte* aad, word32 aadSz, const byte* plain, + word32 plainSz, byte* tag) +{ + AesGcmSivPolyval poly; + byte lenBlock[WC_AES_BLOCK_SIZE]; + byte s[WC_AES_BLOCK_SIZE]; + /* Bit lengths (sz * 8) as 64-bit values, computed without needing a + * 64-bit type: low 32 bits and the 3 bits that carry into the next word. */ + word32 aadLo = aadSz << 3, aadHi = aadSz >> 29; + word32 ptLo = plainSz << 3, ptHi = plainSz >> 29; + int i; + int ret; + + AesGcmSivPolyvalInit(&poly, authKey); + AesGcmSivPolyvalUpdate(&poly, aad, aadSz); + AesGcmSivPolyvalUpdate(&poly, plain, plainSz); + + /* Length block: LE64(aad_bits) || LE64(plaintext_bits). 
*/ + lenBlock[0] = (byte)aadLo; lenBlock[1] = (byte)(aadLo >> 8); + lenBlock[2] = (byte)(aadLo >> 16); lenBlock[3] = (byte)(aadLo >> 24); + lenBlock[4] = (byte)aadHi; lenBlock[5] = (byte)(aadHi >> 8); + lenBlock[6] = (byte)(aadHi >> 16); lenBlock[7] = (byte)(aadHi >> 24); + lenBlock[8] = (byte)ptLo; lenBlock[9] = (byte)(ptLo >> 8); + lenBlock[10] = (byte)(ptLo >> 16); lenBlock[11] = (byte)(ptLo >> 24); + lenBlock[12] = (byte)ptHi; lenBlock[13] = (byte)(ptHi >> 8); + lenBlock[14] = (byte)(ptHi >> 16); lenBlock[15] = (byte)(ptHi >> 24); + AesGcmSivPolyvalUpdate(&poly, lenBlock, WC_AES_BLOCK_SIZE); + + AesGcmSivPolyvalFinal(&poly, s); + + /* XOR the nonce into the first 12 bytes and clear the top bit of the + * last byte, then encrypt to produce the tag. */ + for (i = 0; i < AES_GCM_SIV_NONCE_SZ; i++) { + s[i] ^= nonce[i]; + } + s[WC_AES_BLOCK_SIZE - 1] &= 0x7f; + + ret = wc_AesEncrypt(enc, s, tag); + + ForceZero(s, sizeof(s)); + return ret; +} + +/* Apply AES-GCM-SIV's counter mode to in, producing out. enc holds the + * message-encryption-key, tag is the 16-byte authentication tag. The counter + * is the tag with the top bit of the last byte set; only the first 4 bytes + * are incremented, as a little-endian 32-bit value, wrapping modulo 2^32. + * See RFC 8452 Section 4. */ +static WARN_UNUSED_RESULT int AesGcmSivCtr(Aes* enc, const byte* tag, + const byte* in, word32 sz, byte* out) +{ + byte ctrBlock[WC_AES_BLOCK_SIZE]; + byte ks[WC_AES_BLOCK_SIZE]; + word32 c; + int ret = 0; + + XMEMCPY(ctrBlock, tag, WC_AES_BLOCK_SIZE); + ctrBlock[WC_AES_BLOCK_SIZE - 1] |= 0x80; + +#ifdef WC_GCMSIV_CTR_ASM + /* Offload the full-block keystream to the pipelined assembly; it advances + * and writes ctrBlock back. The final partial block (if any) is finished by + * the scalar loop below. 
*/ + { + AesGcmSivCtrFn fn = AesGcmSivCtrAsm(); + if (fn != NULL) { + word32 full = sz & ~(word32)(WC_AES_BLOCK_SIZE - 1); + if (full > 0) { + fn(in, out, (unsigned long)full, (const byte*)enc->key, + (int)enc->rounds, ctrBlock); + in += full; + out += full; + sz -= full; + } + } + } +#endif + + c = (word32)ctrBlock[0] | ((word32)ctrBlock[1] << 8) | + ((word32)ctrBlock[2] << 16) | ((word32)ctrBlock[3] << 24); + + while (sz > 0) { + word32 n = (sz < WC_AES_BLOCK_SIZE) ? sz : (word32)WC_AES_BLOCK_SIZE; + word32 i; + + ret = wc_AesEncrypt(enc, ctrBlock, ks); + if (ret != 0) + break; + for (i = 0; i < n; i++) { + out[i] = (byte)(in[i] ^ ks[i]); + } + + in += n; + out += n; + sz -= n; + + c++; + ctrBlock[0] = (byte)c; ctrBlock[1] = (byte)(c >> 8); + ctrBlock[2] = (byte)(c >> 16); ctrBlock[3] = (byte)(c >> 24); + } + + ForceZero(ks, sizeof(ks)); + ForceZero(ctrBlock, sizeof(ctrBlock)); + return ret; +} + +/* Common validation for the encrypt/decrypt entry points. */ +static WARN_UNUSED_RESULT int AesGcmSivCheckArgs(const byte* key, word32 keySz, + const byte* nonce, word32 nonceSz, const byte* aad, word32 aadSz, + const byte* in, word32 inSz, const byte* out, const byte* tag, + word32 tagSz) +{ + if (key == NULL || nonce == NULL || tag == NULL) { + return BAD_FUNC_ARG; + } + if ((inSz != 0) && ((in == NULL) || (out == NULL))) { + return BAD_FUNC_ARG; + } + if ((aadSz != 0) && (aad == NULL)) { + return BAD_FUNC_ARG; + } + if ((keySz != 16) && (keySz != 32)) { + return BAD_FUNC_ARG; + } + if (nonceSz != AES_GCM_SIV_NONCE_SZ) { + return BAD_FUNC_ARG; + } + if (tagSz != AES_GCM_SIV_TAG_SZ) { + return BAD_FUNC_ARG; + } + return 0; +} + +/* + * Encrypt with AES-GCM-SIV. See RFC 8452 Section 4. + * + * out receives inSz bytes of ciphertext; tag receives the 16-byte tag. 
+ */
+int wc_AesGcmSivEncrypt(const byte* key, word32 keySz, const byte* nonce,
+    word32 nonceSz, const byte* aad, word32 aadSz, const byte* in,
+    word32 inSz, byte* out, byte* tag, word32 tagSz)
+{
+    WC_DECLARE_VAR(aes, Aes, 1, 0);
+    byte authKey[WC_AES_BLOCK_SIZE];
+    byte encKey[32];
+    byte tagTmp[AES_GCM_SIV_TAG_SZ];
+    int ret;
+
+    ret = AesGcmSivCheckArgs(key, keySz, nonce, nonceSz, aad, aadSz, in, inSz,
+        out, tag, tagSz);
+
+    /* One Aes object serves first as the key-generating-key and then as the
+     * message-encryption-key. */
+    if (ret == 0) {
+    #ifdef WOLFSSL_SMALL_STACK
+        aes = wc_AesNew(NULL, INVALID_DEVID, &ret);
+    #else
+        ret = wc_AesInit(aes, NULL, INVALID_DEVID);
+    #endif
+    }
+
+    if (ret == 0) {
+        /* Load the key-generating-key and derive the per-message keys. */
+        ret = wc_AesSetKey(aes, key, keySz, NULL, AES_ENCRYPTION);
+        if (ret == 0) {
+            ret = AesGcmSivDeriveKeys(aes, nonce, keySz, authKey, encKey);
+        }
+        /* Switch the AES object to the message-encryption-key. */
+        if (ret == 0) {
+            ret = wc_AesSetKey(aes, encKey, keySz, NULL, AES_ENCRYPTION);
+        }
+        /* Tag is computed over the plaintext, then the plaintext is
+         * encrypted with the tag-derived counter. */
+        if (ret == 0) {
+            ret = AesGcmSivCalcTag(aes, authKey, nonce, aad, aadSz, in, inSz,
+                tagTmp);
+        }
+        if (ret == 0) {
+            ret = AesGcmSivCtr(aes, tagTmp, in, inSz, out);
+        }
+        /* The caller's tag buffer is written only after the ciphertext has
+         * been produced successfully. */
+        if (ret == 0) {
+            XMEMCPY(tag, tagTmp, AES_GCM_SIV_TAG_SZ);
+        }
+
+        /* Release the Aes object on success and on failure alike. */
+    #ifdef WOLFSSL_SMALL_STACK
+        wc_AesDelete(aes, NULL);
+    #else
+        wc_AesFree(aes);
+    #endif
+    }
+
+    /* Wipe the derived keys and the staged tag on every path. */
+    ForceZero(authKey, sizeof(authKey));
+    ForceZero(encKey, sizeof(encKey));
+    ForceZero(tagTmp, sizeof(tagTmp));
+
+    return ret;
+}
+
+/*
+ * Decrypt with AES-GCM-SIV. See RFC 8452 Section 4.
+ *
+ * in is inSz bytes of ciphertext, tag is the received 16-byte tag. On a
+ * successful authentication out receives inSz bytes of plaintext; on failure
+ * out is zeroed and AES_GCM_AUTH_E is returned.
+ */
+int wc_AesGcmSivDecrypt(const byte* key, word32 keySz, const byte* nonce,
+    word32 nonceSz, const byte* aad, word32 aadSz, const byte* in,
+    word32 inSz, byte* out, const byte* tag, word32 tagSz)
+{
+    WC_DECLARE_VAR(aes, Aes, 1, 0);
+    byte authKey[WC_AES_BLOCK_SIZE];
+    byte encKey[32];
+    byte expTag[AES_GCM_SIV_TAG_SZ];
+    int ret;
+
+    /* A rejected argument returns BAD_FUNC_ARG without touching out. */
+    ret = AesGcmSivCheckArgs(key, keySz, nonce, nonceSz, aad, aadSz, in, inSz,
+        out, tag, tagSz);
+
+    if (ret == 0) {
+    #ifdef WOLFSSL_SMALL_STACK
+        aes = wc_AesNew(NULL, INVALID_DEVID, &ret);
+    #else
+        ret = wc_AesInit(aes, NULL, INVALID_DEVID);
+    #endif
+    }
+
+    if (ret == 0) {
+        /* Same key schedule as encryption: key-generating-key first, then
+         * the derived message-encryption-key. */
+        ret = wc_AesSetKey(aes, key, keySz, NULL, AES_ENCRYPTION);
+        if (ret == 0) {
+            ret = AesGcmSivDeriveKeys(aes, nonce, keySz, authKey, encKey);
+        }
+        if (ret == 0) {
+            ret = wc_AesSetKey(aes, encKey, keySz, NULL, AES_ENCRYPTION);
+        }
+        /* Recover the plaintext, then recompute and verify the tag over it. */
+        if (ret == 0) {
+            /* The counter block is seeded from the received tag. */
+            ret = AesGcmSivCtr(aes, tag, in, inSz, out);
+        }
+        if (ret == 0) {
+            ret = AesGcmSivCalcTag(aes, authKey, nonce, aad, aadSz, out, inSz,
+                expTag);
+        }
+        if (ret == 0) {
+            if (ConstantCompare(expTag, tag, AES_GCM_SIV_TAG_SZ) != 0) {
+                ret = AES_GCM_AUTH_E;
+            }
+        }
+        /* Any failure from here on, not only a tag mismatch, clears out. */
+        if (ret != 0) {
+            ForceZero(out, inSz);
+        }
+
+    #ifdef WOLFSSL_SMALL_STACK
+        wc_AesDelete(aes, NULL);
+    #else
+        wc_AesFree(aes);
+    #endif
+    }
+
+    /* Wipe the derived keys and the recomputed tag on every path. */
+    ForceZero(authKey, sizeof(authKey));
+    ForceZero(encKey, sizeof(encKey));
+    ForceZero(expTag, sizeof(expTag));
+
+    return ret;
+}
+
+#endif /* WOLFSSL_AESGCM_SIV */ + #if defined(WOLFSSL_AES_EAX) /* diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S index c1c357e19b..6ce4bf762d 100644 --- a/wolfcrypt/src/aes_gcm_asm.S +++ b/wolfcrypt/src/aes_gcm_asm.S @@ -6486,6 +6486,569 @@ L_AES_GCM_decrypt_final_aesni_cmp_tag_done: .size AES_GCM_decrypt_final_aesni,.-AES_GCM_decrypt_final_aesni #endif /* __APPLE__ */ #endif /* WOLFSSL_AESGCM_STREAM */ +#ifdef WOLFSSL_AESGCM_SIV +#ifndef __APPLE__ +.data +#else +.section 
__DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_gcm_siv_bswap_mask: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_polyval_aesni +.type AES_GCMSIV_polyval_aesni,@function +.align 16 +AES_GCMSIV_polyval_aesni: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_polyval_aesni +.p2align 4 +_AES_GCMSIV_polyval_aesni: +#endif /* __APPLE__ */ + movdqu (%rsi), %xmm1 + movdqu (%rdi), %xmm0 + pshufb L_aes_gcm_siv_bswap_mask(%rip), %xmm0 + movdqa %xmm1, %xmm9 + pclmulqdq $0x00, %xmm1, %xmm9 + movdqa %xmm1, %xmm10 + pclmulqdq $0x11, %xmm1, %xmm10 + movdqa %xmm1, %xmm13 + pclmulqdq $16, %xmm1, %xmm13 + movdqa %xmm1, %xmm14 + pclmulqdq $0x01, %xmm1, %xmm14 + pxor %xmm14, %xmm13 + movdqa %xmm13, %xmm14 + pslldq $8, %xmm14 + psrldq $8, %xmm13 + pxor %xmm14, %xmm9 + pxor %xmm13, %xmm10 + movdqa %xmm9, %xmm5 + psrld $31, %xmm5 + movdqa %xmm10, %xmm6 + psrld $31, %xmm6 + pslld $0x01, %xmm9 + pslld $0x01, %xmm10 + movdqa %xmm5, %xmm7 + psrldq $12, %xmm7 + pslldq $4, %xmm6 + pslldq $4, %xmm5 + por %xmm5, %xmm9 + por %xmm6, %xmm10 + por %xmm7, %xmm10 + movdqa %xmm9, %xmm5 + pslld $31, %xmm5 + movdqa %xmm9, %xmm6 + pslld $30, %xmm6 + movdqa %xmm9, %xmm7 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + psrldq $4, %xmm6 + pslldq $12, %xmm5 + pxor %xmm5, %xmm9 + movdqa %xmm9, %xmm8 + psrld $0x01, %xmm8 + movdqa %xmm9, %xmm11 + psrld $2, %xmm11 + movdqa %xmm9, %xmm12 + psrld $7, %xmm12 + pxor %xmm11, %xmm8 + pxor %xmm12, %xmm8 + pxor %xmm6, %xmm8 + pxor %xmm8, %xmm9 + pxor %xmm9, %xmm10 + movdqa %xmm10, %xmm2 + movdqa %xmm2, %xmm9 + pclmulqdq $0x00, %xmm1, %xmm9 + movdqa %xmm2, %xmm10 + pclmulqdq $0x11, %xmm1, %xmm10 + movdqa %xmm2, %xmm13 + pclmulqdq $16, %xmm1, %xmm13 + movdqa %xmm2, %xmm14 + pclmulqdq $0x01, %xmm1, %xmm14 + pxor %xmm14, %xmm13 + movdqa %xmm13, %xmm14 + pslldq $8, %xmm14 + psrldq $8, %xmm13 + pxor %xmm14, %xmm9 + pxor %xmm13, 
%xmm10 + movdqa %xmm9, %xmm5 + psrld $31, %xmm5 + movdqa %xmm10, %xmm6 + psrld $31, %xmm6 + pslld $0x01, %xmm9 + pslld $0x01, %xmm10 + movdqa %xmm5, %xmm7 + psrldq $12, %xmm7 + pslldq $4, %xmm6 + pslldq $4, %xmm5 + por %xmm5, %xmm9 + por %xmm6, %xmm10 + por %xmm7, %xmm10 + movdqa %xmm9, %xmm5 + pslld $31, %xmm5 + movdqa %xmm9, %xmm6 + pslld $30, %xmm6 + movdqa %xmm9, %xmm7 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + psrldq $4, %xmm6 + pslldq $12, %xmm5 + pxor %xmm5, %xmm9 + movdqa %xmm9, %xmm8 + psrld $0x01, %xmm8 + movdqa %xmm9, %xmm11 + psrld $2, %xmm11 + movdqa %xmm9, %xmm12 + psrld $7, %xmm12 + pxor %xmm11, %xmm8 + pxor %xmm12, %xmm8 + pxor %xmm6, %xmm8 + pxor %xmm8, %xmm9 + pxor %xmm9, %xmm10 + movdqa %xmm10, %xmm3 + movdqa %xmm2, %xmm9 + pclmulqdq $0x00, %xmm2, %xmm9 + movdqa %xmm2, %xmm10 + pclmulqdq $0x11, %xmm2, %xmm10 + movdqa %xmm2, %xmm13 + pclmulqdq $16, %xmm2, %xmm13 + movdqa %xmm2, %xmm14 + pclmulqdq $0x01, %xmm2, %xmm14 + pxor %xmm14, %xmm13 + movdqa %xmm13, %xmm14 + pslldq $8, %xmm14 + psrldq $8, %xmm13 + pxor %xmm14, %xmm9 + pxor %xmm13, %xmm10 + movdqa %xmm9, %xmm5 + psrld $31, %xmm5 + movdqa %xmm10, %xmm6 + psrld $31, %xmm6 + pslld $0x01, %xmm9 + pslld $0x01, %xmm10 + movdqa %xmm5, %xmm7 + psrldq $12, %xmm7 + pslldq $4, %xmm6 + pslldq $4, %xmm5 + por %xmm5, %xmm9 + por %xmm6, %xmm10 + por %xmm7, %xmm10 + movdqa %xmm9, %xmm5 + pslld $31, %xmm5 + movdqa %xmm9, %xmm6 + pslld $30, %xmm6 + movdqa %xmm9, %xmm7 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + psrldq $4, %xmm6 + pslldq $12, %xmm5 + pxor %xmm5, %xmm9 + movdqa %xmm9, %xmm8 + psrld $0x01, %xmm8 + movdqa %xmm9, %xmm11 + psrld $2, %xmm11 + movdqa %xmm9, %xmm12 + psrld $7, %xmm12 + pxor %xmm11, %xmm8 + pxor %xmm12, %xmm8 + pxor %xmm6, %xmm8 + pxor %xmm8, %xmm9 + pxor %xmm9, %xmm10 + movdqa %xmm10, %xmm4 + movl %ecx, %r8d + shll $4, %r8d + movl %r8d, %r9d + andl $0xffffffc0, %r9d + xorl %eax, %eax + cmpl %r9d, %eax + je 
L_AES_GCMSIV_polyval_aesni_four_done +L_AES_GCMSIV_polyval_aesni_four: + leaq (%rdx,%rax,1), %r10 + movdqu (%r10), %xmm5 + movdqu 16(%r10), %xmm6 + movdqu 32(%r10), %xmm7 + movdqu 48(%r10), %xmm8 + pxor %xmm0, %xmm5 + movdqa %xmm5, %xmm9 + pclmulqdq $0x00, %xmm4, %xmm9 + movdqa %xmm5, %xmm10 + pclmulqdq $0x11, %xmm4, %xmm10 + movdqa %xmm5, %xmm13 + pclmulqdq $16, %xmm4, %xmm13 + movdqa %xmm5, %xmm14 + pclmulqdq $0x01, %xmm4, %xmm14 + pxor %xmm14, %xmm13 + movdqa %xmm13, %xmm14 + pslldq $8, %xmm14 + psrldq $8, %xmm13 + pxor %xmm14, %xmm9 + pxor %xmm13, %xmm10 + movdqa %xmm6, %xmm11 + pclmulqdq $0x00, %xmm3, %xmm11 + movdqa %xmm6, %xmm12 + pclmulqdq $0x11, %xmm3, %xmm12 + movdqa %xmm6, %xmm13 + pclmulqdq $16, %xmm3, %xmm13 + movdqa %xmm6, %xmm14 + pclmulqdq $0x01, %xmm3, %xmm14 + pxor %xmm14, %xmm13 + movdqa %xmm13, %xmm14 + pslldq $8, %xmm14 + psrldq $8, %xmm13 + pxor %xmm14, %xmm11 + pxor %xmm13, %xmm12 + pxor %xmm11, %xmm9 + pxor %xmm12, %xmm10 + movdqa %xmm7, %xmm11 + pclmulqdq $0x00, %xmm2, %xmm11 + movdqa %xmm7, %xmm12 + pclmulqdq $0x11, %xmm2, %xmm12 + movdqa %xmm7, %xmm13 + pclmulqdq $16, %xmm2, %xmm13 + movdqa %xmm7, %xmm14 + pclmulqdq $0x01, %xmm2, %xmm14 + pxor %xmm14, %xmm13 + movdqa %xmm13, %xmm14 + pslldq $8, %xmm14 + psrldq $8, %xmm13 + pxor %xmm14, %xmm11 + pxor %xmm13, %xmm12 + pxor %xmm11, %xmm9 + pxor %xmm12, %xmm10 + movdqa %xmm8, %xmm11 + pclmulqdq $0x00, %xmm1, %xmm11 + movdqa %xmm8, %xmm12 + pclmulqdq $0x11, %xmm1, %xmm12 + movdqa %xmm8, %xmm13 + pclmulqdq $16, %xmm1, %xmm13 + movdqa %xmm8, %xmm14 + pclmulqdq $0x01, %xmm1, %xmm14 + pxor %xmm14, %xmm13 + movdqa %xmm13, %xmm14 + pslldq $8, %xmm14 + psrldq $8, %xmm13 + pxor %xmm14, %xmm11 + pxor %xmm13, %xmm12 + pxor %xmm11, %xmm9 + pxor %xmm12, %xmm10 + movdqa %xmm9, %xmm5 + psrld $31, %xmm5 + movdqa %xmm10, %xmm6 + psrld $31, %xmm6 + pslld $0x01, %xmm9 + pslld $0x01, %xmm10 + movdqa %xmm5, %xmm7 + psrldq $12, %xmm7 + pslldq $4, %xmm6 + pslldq $4, %xmm5 + por %xmm5, %xmm9 + por %xmm6, %xmm10 + 
por %xmm7, %xmm10 + movdqa %xmm9, %xmm5 + pslld $31, %xmm5 + movdqa %xmm9, %xmm6 + pslld $30, %xmm6 + movdqa %xmm9, %xmm7 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + psrldq $4, %xmm6 + pslldq $12, %xmm5 + pxor %xmm5, %xmm9 + movdqa %xmm9, %xmm8 + psrld $0x01, %xmm8 + movdqa %xmm9, %xmm11 + psrld $2, %xmm11 + movdqa %xmm9, %xmm12 + psrld $7, %xmm12 + pxor %xmm11, %xmm8 + pxor %xmm12, %xmm8 + pxor %xmm6, %xmm8 + pxor %xmm8, %xmm9 + pxor %xmm9, %xmm10 + movdqa %xmm10, %xmm0 /* xmm0 = running value after this 64-byte group */ + addl $0x40, %eax + cmpl %r9d, %eax + jl L_AES_GCMSIV_polyval_aesni_four /* repeat while eax is below r9d (byte length rounded down to a multiple of 64) */ +L_AES_GCMSIV_polyval_aesni_four_done: +L_AES_GCMSIV_polyval_aesni_rem: + cmpl %r8d, %eax /* remainder loop: one 16-byte block per pass until eax reaches r8d (ecx * 16) */ + jge L_AES_GCMSIV_polyval_aesni_done + movdqu (%rdx,%rax,1), %xmm9 /* unaligned load: a legacy-SSE pxor with a memory operand faults unless the address is 16-byte aligned, and the four-block loop already reads this buffer with movdqu */ + pxor %xmm0, %xmm9 /* xmm9 = block xor running value */ + movdqa %xmm9, %xmm10 /* copy the operand before the multiplies overwrite xmm9 */ + movdqa %xmm9, %xmm13 + movdqa %xmm9, %xmm14 + pclmulqdq $0x00, %xmm1, %xmm9 /* four 64x64 carry-less partial products against xmm1 */ + pclmulqdq $0x11, %xmm1, %xmm10 + pclmulqdq $16, %xmm1, %xmm13 + pclmulqdq $0x01, %xmm1, %xmm14 + pxor %xmm14, %xmm13 + movdqa %xmm13, %xmm14 + pslldq $8, %xmm14 + psrldq $8, %xmm13 + pxor %xmm14, %xmm9 + pxor %xmm13, %xmm10 + movdqa %xmm9, %xmm5 + psrld $31, %xmm5 + movdqa %xmm10, %xmm6 + psrld $31, %xmm6 + pslld $0x01, %xmm9 + pslld $0x01, %xmm10 + movdqa %xmm5, %xmm7 + psrldq $12, %xmm7 + pslldq $4, %xmm6 + pslldq $4, %xmm5 + por %xmm5, %xmm9 + por %xmm6, %xmm10 + por %xmm7, %xmm10 + movdqa %xmm9, %xmm5 + pslld $31, %xmm5 + movdqa %xmm9, %xmm6 + pslld $30, %xmm6 + movdqa %xmm9, %xmm7 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + psrldq $4, %xmm6 + pslldq $12, %xmm5 + pxor %xmm5, %xmm9 + movdqa %xmm9, %xmm8 + psrld $0x01, %xmm8 + movdqa %xmm9, %xmm11 + psrld $2, %xmm11 + movdqa %xmm9, %xmm12 + psrld $7, %xmm12 + pxor %xmm11, %xmm8 + pxor %xmm12, %xmm8 + pxor %xmm6, %xmm8 + pxor %xmm8, %xmm9 + pxor %xmm9, %xmm10 + movdqa %xmm10, %xmm0 /* xmm0 = new running value; the stale xmm0 is not read between the load above and here */ + addl $16, %eax + jmp L_AES_GCMSIV_polyval_aesni_rem +L_AES_GCMSIV_polyval_aesni_done: + pshufb L_aes_gcm_siv_bswap_mask(%rip), %xmm0 /* same byte shuffle that was applied when xmm0 was loaded at entry */ + movdqu
%xmm0, (%rdi) + repz retq +#ifndef __APPLE__ +.size AES_GCMSIV_polyval_aesni,.-AES_GCMSIV_polyval_aesni +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_gcmsiv_ctr_aesni_one: +.quad 0x0000000000000001,0x0000000000000000 +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_ctr_aesni +.type AES_GCMSIV_ctr_aesni,@function +.align 16 +AES_GCMSIV_ctr_aesni: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_ctr_aesni +.p2align 4 +_AES_GCMSIV_ctr_aesni: +#endif /* __APPLE__ */ + pushq %rbx + movdqu L_aes_gcmsiv_ctr_aesni_one(%rip), %xmm8 + movdqu (%r9), %xmm7 + xorl %eax, %eax + cmpl $0x40, %edx + movl %edx, %r10d + jl L_AES_GCMSIV_ctr_aesni_done_64 + andl $0xffffffc0, %r10d +L_AES_GCMSIV_ctr_aesni_enc_64: + # 64 bytes of input + # siv_ctr_enc_64 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %rbx + movdqa %xmm7, %xmm0 + paddd %xmm8, %xmm7 + movdqa %xmm7, %xmm1 + paddd %xmm8, %xmm7 + movdqa %xmm7, %xmm2 + paddd %xmm8, %xmm7 + movdqa %xmm7, %xmm3 + paddd %xmm8, %xmm7 + # aes_enc_block + movdqu (%rcx), %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm4, %xmm2 + pxor %xmm4, %xmm3 + movdqu 16(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 32(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 48(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 64(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 80(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 96(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 112(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + 
aesenc %xmm4, %xmm3 + movdqu 128(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 144(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + cmpl $11, %r8d /* r8d below 11: the round key at offset 160 is the final one */ + movdqu 160(%rcx), %xmm4 + jl L_AES_GCMSIV_ctr_aesni_64_aes_enc_block_last + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 176(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + cmpl $13, %r8d /* r8d below 13: the round key at offset 192 is the final one */ + movdqu 192(%rcx), %xmm4 + jl L_AES_GCMSIV_ctr_aesni_64_aes_enc_block_last + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 208(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 224(%rcx), %xmm4 +L_AES_GCMSIV_ctr_aesni_64_aes_enc_block_last: + aesenclast %xmm4, %xmm0 + aesenclast %xmm4, %xmm1 + aesenclast %xmm4, %xmm2 + aesenclast %xmm4, %xmm3 + movdqu (%r11), %xmm4 /* r11 = rdi plus eax (input), rbx = rsi plus eax (output), both set at the loop top */ + pxor %xmm4, %xmm0 + movdqu 16(%r11), %xmm4 + pxor %xmm4, %xmm1 + movdqu 32(%r11), %xmm4 + pxor %xmm4, %xmm2 + movdqu 48(%r11), %xmm4 + pxor %xmm4, %xmm3 + movdqu %xmm0, (%rbx) + movdqu %xmm1, 16(%rbx) + movdqu %xmm2, 32(%rbx) + movdqu %xmm3, 48(%rbx) + addl $0x40, %eax + cmpl %r10d, %eax + jl L_AES_GCMSIV_ctr_aesni_enc_64 +L_AES_GCMSIV_ctr_aesni_done_64: + movl %edx, %r10d + andl $0xfffffff0, %r10d /* r10d = byte length rounded down to a multiple of 16 */ + cmpl %r10d, %eax /* enter the 16-byte loop only when a whole block remains, so a 1..15 byte tail is never read or written as a full block */ + jge L_AES_GCMSIV_ctr_aesni_done_enc +L_AES_GCMSIV_ctr_aesni_enc_16: + # 16 bytes of input + movdqu (%rcx), %xmm0 /* unaligned load of round key 0, matching the 64-byte path; a pxor with a memory operand would require 16-byte alignment */ + pxor %xmm7, %xmm0 /* xmm0 = counter block xor round key 0 */ + paddd %xmm8, %xmm7 /* advance the counter for the next block (xmm8 is the constant one) */ + # aes_enc_block + movdqu 16(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 32(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 48(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 64(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 80(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 96(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 112(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 128(%rcx), %xmm5 + aesenc
%xmm5, %xmm0 + movdqu 144(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + cmpl $11, %r8d + movdqu 160(%rcx), %xmm5 + jl L_AES_GCMSIV_ctr_aesni_16_aes_enc_block_last + aesenc %xmm5, %xmm0 + movdqu 176(%rcx), %xmm6 + aesenc %xmm6, %xmm0 + cmpl $13, %r8d + movdqu 192(%rcx), %xmm5 + jl L_AES_GCMSIV_ctr_aesni_16_aes_enc_block_last + aesenc %xmm5, %xmm0 + movdqu 208(%rcx), %xmm6 + aesenc %xmm6, %xmm0 + movdqu 224(%rcx), %xmm5 +L_AES_GCMSIV_ctr_aesni_16_aes_enc_block_last: + aesenclast %xmm5, %xmm0 + leaq (%rdi,%rax,1), %r11 + movdqu (%r11), %xmm4 + pxor %xmm4, %xmm0 + leaq (%rsi,%rax,1), %r11 + movdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_GCMSIV_ctr_aesni_enc_16 +L_AES_GCMSIV_ctr_aesni_done_enc: + movdqu %xmm7, (%r9) + popq %rbx + repz retq +#ifndef __APPLE__ +.size AES_GCMSIV_ctr_aesni,.-AES_GCMSIV_ctr_aesni +#endif /* __APPLE__ */ +#endif /* WOLFSSL_AESGCM_SIV */ #ifdef HAVE_INTEL_AVX1 #ifndef __APPLE__ .data @@ -11948,6 +12511,469 @@ L_AES_GCM_decrypt_final_avx1_cmp_tag_done: .size AES_GCM_decrypt_final_avx1,.-AES_GCM_decrypt_final_avx1 #endif /* __APPLE__ */ #endif /* WOLFSSL_AESGCM_STREAM */ +#ifdef WOLFSSL_AESGCM_SIV +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_gcm_siv_bswap_mask_avx1: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_polyval_avx1 +.type AES_GCMSIV_polyval_avx1,@function +.align 16 +AES_GCMSIV_polyval_avx1: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_polyval_avx1 +.p2align 4 +_AES_GCMSIV_polyval_avx1: +#endif /* __APPLE__ */ + vmovdqu (%rsi), %xmm1 + vmovdqu (%rdi), %xmm0 + vpshufb L_aes_gcm_siv_bswap_mask_avx1(%rip), %xmm0, %xmm0 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm5 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm6 + vpclmulqdq $16, %xmm1, %xmm1, %xmm10 + vpclmulqdq $0x01, %xmm1, %xmm1, %xmm11 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm11 + vpsrldq $8, %xmm10, %xmm10 + 
vpxor %xmm11, %xmm5, %xmm5 + vpxor %xmm10, %xmm6, %xmm6 + vpsrld $31, %xmm5, %xmm10 + vpsrld $31, %xmm6, %xmm11 + vpslld $0x01, %xmm5, %xmm5 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm10, %xmm12 + vpslldq $4, %xmm11, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpor %xmm10, %xmm5, %xmm5 + vpor %xmm11, %xmm6, %xmm6 + vpor %xmm12, %xmm6, %xmm6 + vpslld $31, %xmm5, %xmm10 + vpslld $30, %xmm5, %xmm11 + vpslld $25, %xmm5, %xmm12 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm12, %xmm10, %xmm10 + vpsrldq $4, %xmm10, %xmm11 + vpslldq $12, %xmm10, %xmm10 + vpxor %xmm10, %xmm5, %xmm5 + vpsrld $0x01, %xmm5, %xmm7 + vpsrld $2, %xmm5, %xmm8 + vpsrld $7, %xmm5, %xmm9 + vpxor %xmm8, %xmm7, %xmm7 + vpxor %xmm9, %xmm7, %xmm7 + vpxor %xmm11, %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm2 /* xmm2 = folded 128-bit result of the xmm1 by xmm1 carry-less multiply begun above (xmm1 was loaded from the address in rsi) */ + vpclmulqdq $0x00, %xmm1, %xmm2, %xmm5 /* next product: xmm2 times xmm1, as four 64x64 partial products */ + vpclmulqdq $0x11, %xmm1, %xmm2, %xmm6 + vpclmulqdq $16, %xmm1, %xmm2, %xmm10 + vpclmulqdq $0x01, %xmm1, %xmm2, %xmm11 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm11 + vpsrldq $8, %xmm10, %xmm10 + vpxor %xmm11, %xmm5, %xmm5 + vpxor %xmm10, %xmm6, %xmm6 + vpsrld $31, %xmm5, %xmm10 + vpsrld $31, %xmm6, %xmm11 + vpslld $0x01, %xmm5, %xmm5 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm10, %xmm12 + vpslldq $4, %xmm11, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpor %xmm10, %xmm5, %xmm5 + vpor %xmm11, %xmm6, %xmm6 + vpor %xmm12, %xmm6, %xmm6 + vpslld $31, %xmm5, %xmm10 + vpslld $30, %xmm5, %xmm11 + vpslld $25, %xmm5, %xmm12 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm12, %xmm10, %xmm10 + vpsrldq $4, %xmm10, %xmm11 + vpslldq $12, %xmm10, %xmm10 + vpxor %xmm10, %xmm5, %xmm5 + vpsrld $0x01, %xmm5, %xmm7 + vpsrld $2, %xmm5, %xmm8 + vpsrld $7, %xmm5, %xmm9 + vpxor %xmm8, %xmm7, %xmm7 + vpxor %xmm9, %xmm7, %xmm7 + vpxor %xmm11, %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm3 /* xmm3 = folded result of xmm2 times xmm1 */ + vpclmulqdq $0x00, %xmm2, %xmm2, %xmm5 /* next product: xmm2 times itself */ + vpclmulqdq $0x11, %xmm2, %xmm2, %xmm6 + vpclmulqdq $16, %xmm2, %xmm2, %xmm10 + vpclmulqdq $0x01, %xmm2, %xmm2,
%xmm11 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm11 + vpsrldq $8, %xmm10, %xmm10 + vpxor %xmm11, %xmm5, %xmm5 + vpxor %xmm10, %xmm6, %xmm6 + vpsrld $31, %xmm5, %xmm10 + vpsrld $31, %xmm6, %xmm11 + vpslld $0x01, %xmm5, %xmm5 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm10, %xmm12 + vpslldq $4, %xmm11, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpor %xmm10, %xmm5, %xmm5 + vpor %xmm11, %xmm6, %xmm6 + vpor %xmm12, %xmm6, %xmm6 + vpslld $31, %xmm5, %xmm10 + vpslld $30, %xmm5, %xmm11 + vpslld $25, %xmm5, %xmm12 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm12, %xmm10, %xmm10 + vpsrldq $4, %xmm10, %xmm11 + vpslldq $12, %xmm10, %xmm10 + vpxor %xmm10, %xmm5, %xmm5 + vpsrld $0x01, %xmm5, %xmm7 + vpsrld $2, %xmm5, %xmm8 + vpsrld $7, %xmm5, %xmm9 + vpxor %xmm8, %xmm7, %xmm7 + vpxor %xmm9, %xmm7, %xmm7 + vpxor %xmm11, %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm4 /* xmm4 = folded result of xmm2 times xmm2 */ + movl %ecx, %r8d + shll $4, %r8d /* r8d = ecx * 16; presumably ecx is a count of 16-byte blocks - confirm against the C caller */ + movl %r8d, %r9d + andl $0xffffffc0, %r9d /* r9d = r8d rounded down to a multiple of 64 */ + xorl %eax, %eax /* eax = byte offset into the buffer at rdx */ + cmpl %r9d, %eax + je L_AES_GCMSIV_polyval_avx1_four_done /* no full 64-byte group: skip the four-block loop */ +L_AES_GCMSIV_polyval_avx1_four: + leaq (%rdx,%rax,1), %r10 + vmovdqu (%r10), %xmm7 + vpxor %xmm0, %xmm7, %xmm7 /* the running value xmm0 is mixed into the first block of the group only */ + vpclmulqdq $0x00, %xmm4, %xmm7, %xmm5 /* block at offset 0 is multiplied by xmm4; xmm5 and xmm6 collect the products */ + vpclmulqdq $0x11, %xmm4, %xmm7, %xmm6 + vpclmulqdq $16, %xmm4, %xmm7, %xmm10 + vpclmulqdq $0x01, %xmm4, %xmm7, %xmm11 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm11 + vpsrldq $8, %xmm10, %xmm10 + vpxor %xmm11, %xmm5, %xmm5 + vpxor %xmm10, %xmm6, %xmm6 + vmovdqu 16(%r10), %xmm7 /* block at offset 16 is multiplied by xmm3 */ + vpclmulqdq $0x00, %xmm3, %xmm7, %xmm8 + vpclmulqdq $0x11, %xmm3, %xmm7, %xmm9 + vpclmulqdq $16, %xmm3, %xmm7, %xmm10 + vpclmulqdq $0x01, %xmm3, %xmm7, %xmm11 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm11 + vpsrldq $8, %xmm10, %xmm10 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm8, %xmm5, %xmm5 + vpxor %xmm9, %xmm6, %xmm6 + vmovdqu 32(%r10), %xmm7 /* block at offset 32 is multiplied by xmm2 */ + vpclmulqdq $0x00, %xmm2, %xmm7, %xmm8 + vpclmulqdq $0x11, %xmm2, %xmm7, %xmm9 + vpclmulqdq $16, %xmm2, %xmm7,
%xmm10 + vpclmulqdq $0x01, %xmm2, %xmm7, %xmm11 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm11 + vpsrldq $8, %xmm10, %xmm10 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm8, %xmm5, %xmm5 + vpxor %xmm9, %xmm6, %xmm6 + vmovdqu 48(%r10), %xmm7 /* block at offset 48 is multiplied by xmm1 */ + vpclmulqdq $0x00, %xmm1, %xmm7, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm7, %xmm9 + vpclmulqdq $16, %xmm1, %xmm7, %xmm10 + vpclmulqdq $0x01, %xmm1, %xmm7, %xmm11 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm11 + vpsrldq $8, %xmm10, %xmm10 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm8, %xmm5, %xmm5 + vpxor %xmm9, %xmm6, %xmm6 + vpsrld $31, %xmm5, %xmm10 + vpsrld $31, %xmm6, %xmm11 + vpslld $0x01, %xmm5, %xmm5 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm10, %xmm12 + vpslldq $4, %xmm11, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpor %xmm10, %xmm5, %xmm5 + vpor %xmm11, %xmm6, %xmm6 + vpor %xmm12, %xmm6, %xmm6 + vpslld $31, %xmm5, %xmm10 + vpslld $30, %xmm5, %xmm11 + vpslld $25, %xmm5, %xmm12 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm12, %xmm10, %xmm10 + vpsrldq $4, %xmm10, %xmm11 + vpslldq $12, %xmm10, %xmm10 + vpxor %xmm10, %xmm5, %xmm5 + vpsrld $0x01, %xmm5, %xmm7 + vpsrld $2, %xmm5, %xmm8 + vpsrld $7, %xmm5, %xmm9 + vpxor %xmm8, %xmm7, %xmm7 + vpxor %xmm9, %xmm7, %xmm7 + vpxor %xmm11, %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm0 /* a single shift-and-fold pass over the summed products yields the new running value */ + addl $0x40, %eax + cmpl %r9d, %eax + jl L_AES_GCMSIV_polyval_avx1_four /* repeat while eax is below r9d */ +L_AES_GCMSIV_polyval_avx1_four_done: +L_AES_GCMSIV_polyval_avx1_rem: + cmpl %r8d, %eax /* remainder loop: one 16-byte block per pass until eax reaches r8d */ + jge L_AES_GCMSIV_polyval_avx1_done + vpxor (%rdx,%rax,1), %xmm0, %xmm0 /* mix the next block into the running value; the VEX form has no alignment requirement on the memory operand */ + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm5 /* then multiply by xmm1 */ + vpclmulqdq $0x11, %xmm1, %xmm0, %xmm6 + vpclmulqdq $16, %xmm1, %xmm0, %xmm10 + vpclmulqdq $0x01, %xmm1, %xmm0, %xmm11 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm11 + vpsrldq $8, %xmm10, %xmm10 + vpxor %xmm11, %xmm5, %xmm5 + vpxor %xmm10, %xmm6, %xmm6 + vpsrld $31, %xmm5, %xmm10 + vpsrld $31, %xmm6, %xmm11 + vpslld $0x01,
%xmm5, %xmm5 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm10, %xmm12 + vpslldq $4, %xmm11, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpor %xmm10, %xmm5, %xmm5 + vpor %xmm11, %xmm6, %xmm6 + vpor %xmm12, %xmm6, %xmm6 + vpslld $31, %xmm5, %xmm10 + vpslld $30, %xmm5, %xmm11 + vpslld $25, %xmm5, %xmm12 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm12, %xmm10, %xmm10 + vpsrldq $4, %xmm10, %xmm11 + vpslldq $12, %xmm10, %xmm10 + vpxor %xmm10, %xmm5, %xmm5 + vpsrld $0x01, %xmm5, %xmm7 + vpsrld $2, %xmm5, %xmm8 + vpsrld $7, %xmm5, %xmm9 + vpxor %xmm8, %xmm7, %xmm7 + vpxor %xmm9, %xmm7, %xmm7 + vpxor %xmm11, %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm0 + addl $16, %eax + jmp L_AES_GCMSIV_polyval_avx1_rem +L_AES_GCMSIV_polyval_avx1_done: + vpshufb L_aes_gcm_siv_bswap_mask_avx1(%rip), %xmm0, %xmm0 + vmovdqu %xmm0, (%rdi) + repz retq +#ifndef __APPLE__ +.size AES_GCMSIV_polyval_avx1,.-AES_GCMSIV_polyval_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_gcmsiv_ctr_avx1_one: +.quad 0x0000000000000001,0x0000000000000000 +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_ctr_avx1 +.type AES_GCMSIV_ctr_avx1,@function +.align 16 +AES_GCMSIV_ctr_avx1: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_ctr_avx1 +.p2align 4 +_AES_GCMSIV_ctr_avx1: +#endif /* __APPLE__ */ + pushq %rbx + vmovdqu L_aes_gcmsiv_ctr_avx1_one(%rip), %xmm8 + vmovdqu (%r9), %xmm7 + xorl %eax, %eax + cmpl $0x40, %edx + movl %edx, %r10d + jl L_AES_GCMSIV_ctr_avx1_done_64 + andl $0xffffffc0, %r10d +L_AES_GCMSIV_ctr_avx1_enc_64: + # 64 bytes of input + # siv_ctr_enc_64 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %rbx + vmovdqa %xmm7, %xmm0 + vpaddd %xmm8, %xmm7, %xmm7 + vmovdqa %xmm7, %xmm1 + vpaddd %xmm8, %xmm7, %xmm7 + vmovdqa %xmm7, %xmm2 + vpaddd %xmm8, %xmm7, %xmm7 + vmovdqa %xmm7, %xmm3 + vpaddd %xmm8, %xmm7, %xmm7 + # aes_enc_block + vmovdqu (%rcx), %xmm4 
+ vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu 16(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 32(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 48(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 64(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 80(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 96(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 112(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 128(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 144(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm4 + jl L_AES_GCMSIV_ctr_avx1_64_aes_enc_block_last + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 176(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm4 + jl L_AES_GCMSIV_ctr_avx1_64_aes_enc_block_last + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, 
%xmm3 + vmovdqu 208(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 224(%rcx), %xmm4 +L_AES_GCMSIV_ctr_avx1_64_aes_enc_block_last: + vaesenclast %xmm4, %xmm0, %xmm0 + vaesenclast %xmm4, %xmm1, %xmm1 + vaesenclast %xmm4, %xmm2, %xmm2 + vaesenclast %xmm4, %xmm3, %xmm3 + vpxor (%r11), %xmm0, %xmm0 /* r11 = rdi plus eax (input), rbx = rsi plus eax (output), both set at the loop top */ + vpxor 16(%r11), %xmm1, %xmm1 + vpxor 32(%r11), %xmm2, %xmm2 + vpxor 48(%r11), %xmm3, %xmm3 + vmovdqu %xmm0, (%rbx) + vmovdqu %xmm1, 16(%rbx) + vmovdqu %xmm2, 32(%rbx) + vmovdqu %xmm3, 48(%rbx) + addl $0x40, %eax + cmpl %r10d, %eax + jl L_AES_GCMSIV_ctr_avx1_enc_64 +L_AES_GCMSIV_ctr_avx1_done_64: + movl %edx, %r10d + andl $0xfffffff0, %r10d /* r10d = byte length rounded down to a multiple of 16 */ + cmpl %r10d, %eax /* enter the 16-byte loop only when a whole block remains, so a 1..15 byte tail is never read or written as a full block */ + jge L_AES_GCMSIV_ctr_avx1_done_enc +L_AES_GCMSIV_ctr_avx1_enc_16: + # 16 bytes of input + vmovdqa %xmm7, %xmm0 /* xmm0 = current counter block */ + vpaddd %xmm8, %xmm7, %xmm7 /* advance the counter for the next block (xmm8 is the constant one) */ + # aes_enc_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d /* r8d below 11: the round key at offset 160 is the final one */ + vmovdqu 160(%rcx), %xmm5 + jl L_AES_GCMSIV_ctr_avx1_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d /* r8d below 13: the round key at offset 192 is the final one */ + vmovdqu 192(%rcx), %xmm5 + jl L_AES_GCMSIV_ctr_avx1_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_GCMSIV_ctr_avx1_16_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + leaq (%rdi,%rax,1), %r11 + vpxor (%r11), %xmm0, %xmm0 /* keystream block xor input block */ + leaq (%rsi,%rax,1), %r11 +
vmovdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_GCMSIV_ctr_avx1_enc_16 +L_AES_GCMSIV_ctr_avx1_done_enc: + vmovdqu %xmm7, (%r9) + popq %rbx + repz retq +#ifndef __APPLE__ +.size AES_GCMSIV_ctr_avx1,.-AES_GCMSIV_ctr_avx1 +#endif /* __APPLE__ */ +#endif /* WOLFSSL_AESGCM_SIV */ #endif /* HAVE_INTEL_AVX1 */ #ifdef HAVE_INTEL_AVX2 #ifndef __APPLE__ @@ -22461,6 +23487,808 @@ L_AES_GCM_decrypt_final_vaes_cmp_tag_done: .size AES_GCM_decrypt_final_vaes,.-AES_GCM_decrypt_final_vaes #endif /* __APPLE__ */ #endif /* WOLFSSL_AESGCM_STREAM */ +#ifdef WOLFSSL_AESGCM_SIV +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_gcm_siv_bswap_mask_vaes: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_polyval_vaes +.type AES_GCMSIV_polyval_vaes,@function +.align 16 +AES_GCMSIV_polyval_vaes: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_polyval_vaes +.p2align 4 +_AES_GCMSIV_polyval_vaes: +#endif /* __APPLE__ */ + vmovdqu (%rsi), %xmm1 + vmovdqu (%rdi), %xmm0 + vpshufb L_aes_gcm_siv_bswap_mask_vaes(%rip), %xmm0, %xmm0 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm14 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm15 + vpclmulqdq $16, %xmm1, %xmm1, %xmm13 + vpclmulqdq $0x01, %xmm1, %xmm1, %xmm6 + vpxor %xmm6, %xmm13, %xmm13 + vpslldq $8, %xmm13, %xmm6 + vpsrldq $8, %xmm13, %xmm13 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm13, %xmm15, %xmm15 + vpsrld $31, %xmm14, %xmm9 + vpsrld $31, %xmm15, %xmm10 + vpslld $0x01, %xmm14, %xmm14 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpor %xmm9, %xmm14, %xmm14 + vpor %xmm10, %xmm15, %xmm15 + vpor %xmm11, %xmm15, %xmm15 + vpslld $31, %xmm14, %xmm9 + vpslld $30, %xmm14, %xmm10 + vpslld $25, %xmm14, %xmm11 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm11, %xmm9, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxor 
%xmm9, %xmm14, %xmm14 + vpsrld $0x01, %xmm14, %xmm6 + vpsrld $2, %xmm14, %xmm7 + vpsrld $7, %xmm14, %xmm8 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm8, %xmm6, %xmm6 + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm14, %xmm15, %xmm2 + vinserti128 $0x01, %xmm1, %ymm2, %ymm2 + vpclmulqdq $0x00, %xmm2, %xmm2, %xmm14 + vpclmulqdq $0x11, %xmm2, %xmm2, %xmm15 + vpclmulqdq $16, %xmm2, %xmm2, %xmm13 + vpclmulqdq $0x01, %xmm2, %xmm2, %xmm6 + vpxor %xmm6, %xmm13, %xmm13 + vpslldq $8, %xmm13, %xmm6 + vpsrldq $8, %xmm13, %xmm13 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm13, %xmm15, %xmm15 + vpsrld $31, %xmm14, %xmm9 + vpsrld $31, %xmm15, %xmm10 + vpslld $0x01, %xmm14, %xmm14 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpor %xmm9, %xmm14, %xmm14 + vpor %xmm10, %xmm15, %xmm15 + vpor %xmm11, %xmm15, %xmm15 + vpslld $31, %xmm14, %xmm9 + vpslld $30, %xmm14, %xmm10 + vpslld $25, %xmm14, %xmm11 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm11, %xmm9, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + vpsrld $0x01, %xmm14, %xmm6 + vpsrld $2, %xmm14, %xmm7 + vpsrld $7, %xmm14, %xmm8 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm8, %xmm6, %xmm6 + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm14, %xmm15, %xmm3 + vpclmulqdq $0x00, %xmm1, %xmm2, %xmm14 + vpclmulqdq $0x11, %xmm1, %xmm2, %xmm15 + vpclmulqdq $16, %xmm1, %xmm2, %xmm13 + vpclmulqdq $0x01, %xmm1, %xmm2, %xmm6 + vpxor %xmm6, %xmm13, %xmm13 + vpslldq $8, %xmm13, %xmm6 + vpsrldq $8, %xmm13, %xmm13 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm13, %xmm15, %xmm15 + vpsrld $31, %xmm14, %xmm9 + vpsrld $31, %xmm15, %xmm10 + vpslld $0x01, %xmm14, %xmm14 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpor %xmm9, %xmm14, %xmm14 + vpor %xmm10, %xmm15, %xmm15 + vpor %xmm11, %xmm15, %xmm15 + vpslld $31, %xmm14, %xmm9 + vpslld $30, 
%xmm14, %xmm10 + vpslld $25, %xmm14, %xmm11 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm11, %xmm9, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + vpsrld $0x01, %xmm14, %xmm6 + vpsrld $2, %xmm14, %xmm7 + vpsrld $7, %xmm14, %xmm8 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm8, %xmm6, %xmm6 + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm14, %xmm15, %xmm12 + vinserti128 $0x01, %xmm12, %ymm3, %ymm3 + vpclmulqdq $0x00, %xmm2, %xmm3, %xmm14 + vpclmulqdq $0x11, %xmm2, %xmm3, %xmm15 + vpclmulqdq $16, %xmm2, %xmm3, %xmm13 + vpclmulqdq $0x01, %xmm2, %xmm3, %xmm6 + vpxor %xmm6, %xmm13, %xmm13 + vpslldq $8, %xmm13, %xmm6 + vpsrldq $8, %xmm13, %xmm13 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm13, %xmm15, %xmm15 + vpsrld $31, %xmm14, %xmm9 + vpsrld $31, %xmm15, %xmm10 + vpslld $0x01, %xmm14, %xmm14 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpor %xmm9, %xmm14, %xmm14 + vpor %xmm10, %xmm15, %xmm15 + vpor %xmm11, %xmm15, %xmm15 + vpslld $31, %xmm14, %xmm9 + vpslld $30, %xmm14, %xmm10 + vpslld $25, %xmm14, %xmm11 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm11, %xmm9, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + vpsrld $0x01, %xmm14, %xmm6 + vpsrld $2, %xmm14, %xmm7 + vpsrld $7, %xmm14, %xmm8 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm8, %xmm6, %xmm6 + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm14, %xmm15, %xmm4 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm14 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm15 + vpclmulqdq $16, %xmm1, %xmm3, %xmm13 + vpclmulqdq $0x01, %xmm1, %xmm3, %xmm6 + vpxor %xmm6, %xmm13, %xmm13 + vpslldq $8, %xmm13, %xmm6 + vpsrldq $8, %xmm13, %xmm13 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm13, %xmm15, %xmm15 + vpsrld $31, %xmm14, %xmm9 + vpsrld $31, %xmm15, %xmm10 + vpslld $0x01, %xmm14, %xmm14 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, 
%xmm10 + vpslldq $4, %xmm9, %xmm9 + vpor %xmm9, %xmm14, %xmm14 + vpor %xmm10, %xmm15, %xmm15 + vpor %xmm11, %xmm15, %xmm15 + vpslld $31, %xmm14, %xmm9 + vpslld $30, %xmm14, %xmm10 + vpslld $25, %xmm14, %xmm11 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm11, %xmm9, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + vpsrld $0x01, %xmm14, %xmm6 + vpsrld $2, %xmm14, %xmm7 + vpsrld $7, %xmm14, %xmm8 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm8, %xmm6, %xmm6 + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm14, %xmm15, %xmm13 + vinserti128 $0x01, %xmm13, %ymm4, %ymm4 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm14 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm15 + vpclmulqdq $16, %xmm3, %xmm3, %xmm13 + vpclmulqdq $0x01, %xmm3, %xmm3, %xmm6 + vpxor %xmm6, %xmm13, %xmm13 + vpslldq $8, %xmm13, %xmm6 + vpsrldq $8, %xmm13, %xmm13 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm13, %xmm15, %xmm15 + vpsrld $31, %xmm14, %xmm9 + vpsrld $31, %xmm15, %xmm10 + vpslld $0x01, %xmm14, %xmm14 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpor %xmm9, %xmm14, %xmm14 + vpor %xmm10, %xmm15, %xmm15 + vpor %xmm11, %xmm15, %xmm15 + vpslld $31, %xmm14, %xmm9 + vpslld $30, %xmm14, %xmm10 + vpslld $25, %xmm14, %xmm11 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm11, %xmm9, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + vpsrld $0x01, %xmm14, %xmm6 + vpsrld $2, %xmm14, %xmm7 + vpsrld $7, %xmm14, %xmm8 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm8, %xmm6, %xmm6 + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm14, %xmm15, %xmm5 + vpclmulqdq $0x00, %xmm12, %xmm3, %xmm14 + vpclmulqdq $0x11, %xmm12, %xmm3, %xmm15 + vpclmulqdq $16, %xmm12, %xmm3, %xmm13 + vpclmulqdq $0x01, %xmm12, %xmm3, %xmm6 + vpxor %xmm6, %xmm13, %xmm13 + vpslldq $8, %xmm13, %xmm6 + vpsrldq $8, %xmm13, %xmm13 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm13, %xmm15, %xmm15 
+ vpsrld $31, %xmm14, %xmm9 + vpsrld $31, %xmm15, %xmm10 + vpslld $0x01, %xmm14, %xmm14 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpor %xmm9, %xmm14, %xmm14 + vpor %xmm10, %xmm15, %xmm15 + vpor %xmm11, %xmm15, %xmm15 + vpslld $31, %xmm14, %xmm9 + vpslld $30, %xmm14, %xmm10 + vpslld $25, %xmm14, %xmm11 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm11, %xmm9, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + vpsrld $0x01, %xmm14, %xmm6 + vpsrld $2, %xmm14, %xmm7 + vpsrld $7, %xmm14, %xmm8 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm8, %xmm6, %xmm6 + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm14, %xmm15, %xmm13 + vinserti128 $0x01, %xmm13, %ymm5, %ymm5 + movl %ecx, %r8d + shll $4, %r8d + xorl %eax, %eax + movl %r8d, %r9d + andl $0xffffff80, %r9d + cmpl %r9d, %eax + je L_AES_GCMSIV_polyval_vaes_eight_done +L_AES_GCMSIV_polyval_vaes_eight: + leaq (%rdx,%rax,1), %r10 + vmovdqu (%r10), %ymm6 + vmovdqu 32(%r10), %ymm7 + vmovdqu 64(%r10), %ymm8 + vmovdqu 96(%r10), %ymm9 + vpxor %ymm12, %ymm12, %ymm12 + vinserti128 $0x00, %xmm0, %ymm12, %ymm12 + vpxor %ymm12, %ymm6, %ymm6 + vpclmulqdq $0x00, %ymm5, %ymm6, %ymm10 + vpclmulqdq $0x11, %ymm5, %ymm6, %ymm11 + vpclmulqdq $16, %ymm5, %ymm6, %ymm14 + vpclmulqdq $0x01, %ymm5, %ymm6, %ymm15 + vpxor %ymm15, %ymm14, %ymm14 + vpslldq $8, %ymm14, %ymm15 + vpsrldq $8, %ymm14, %ymm14 + vpxor %ymm15, %ymm10, %ymm10 + vpxor %ymm14, %ymm11, %ymm11 + vpclmulqdq $0x00, %ymm4, %ymm7, %ymm12 + vpclmulqdq $0x11, %ymm4, %ymm7, %ymm13 + vpclmulqdq $16, %ymm4, %ymm7, %ymm14 + vpclmulqdq $0x01, %ymm4, %ymm7, %ymm15 + vpxor %ymm15, %ymm14, %ymm14 + vpslldq $8, %ymm14, %ymm15 + vpsrldq $8, %ymm14, %ymm14 + vpxor %ymm15, %ymm12, %ymm12 + vpxor %ymm14, %ymm13, %ymm13 + vpxor %ymm12, %ymm10, %ymm10 + vpxor %ymm13, %ymm11, %ymm11 + vpclmulqdq $0x00, %ymm3, %ymm8, %ymm12 + vpclmulqdq $0x11, %ymm3, %ymm8, %ymm13 + vpclmulqdq 
$16, %ymm3, %ymm8, %ymm14 + vpclmulqdq $0x01, %ymm3, %ymm8, %ymm15 + vpxor %ymm15, %ymm14, %ymm14 + vpslldq $8, %ymm14, %ymm15 + vpsrldq $8, %ymm14, %ymm14 + vpxor %ymm15, %ymm12, %ymm12 + vpxor %ymm14, %ymm13, %ymm13 + vpxor %ymm12, %ymm10, %ymm10 + vpxor %ymm13, %ymm11, %ymm11 + vpclmulqdq $0x00, %ymm2, %ymm9, %ymm12 + vpclmulqdq $0x11, %ymm2, %ymm9, %ymm13 + vpclmulqdq $16, %ymm2, %ymm9, %ymm14 + vpclmulqdq $0x01, %ymm2, %ymm9, %ymm15 + vpxor %ymm15, %ymm14, %ymm14 + vpslldq $8, %ymm14, %ymm15 + vpsrldq $8, %ymm14, %ymm14 + vpxor %ymm15, %ymm12, %ymm12 + vpxor %ymm14, %ymm13, %ymm13 + vpxor %ymm12, %ymm10, %ymm10 + vpxor %ymm13, %ymm11, %ymm11 + vextracti128 $0x01, %ymm10, %xmm12 + vpxor %xmm12, %xmm10, %xmm14 + vextracti128 $0x01, %ymm11, %xmm12 + vpxor %xmm12, %xmm11, %xmm15 + vpsrld $31, %xmm14, %xmm9 + vpsrld $31, %xmm15, %xmm10 + vpslld $0x01, %xmm14, %xmm14 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpor %xmm9, %xmm14, %xmm14 + vpor %xmm10, %xmm15, %xmm15 + vpor %xmm11, %xmm15, %xmm15 + vpslld $31, %xmm14, %xmm9 + vpslld $30, %xmm14, %xmm10 + vpslld $25, %xmm14, %xmm11 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm11, %xmm9, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + vpsrld $0x01, %xmm14, %xmm6 + vpsrld $2, %xmm14, %xmm7 + vpsrld $7, %xmm14, %xmm8 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm8, %xmm6, %xmm6 + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm14, %xmm15, %xmm0 + addl $0x80, %eax + cmpl %r9d, %eax + jl L_AES_GCMSIV_polyval_vaes_eight +L_AES_GCMSIV_polyval_vaes_eight_done: + movl %r8d, %r9d + andl $0xffffffc0, %r9d + cmpl %r9d, %eax + je L_AES_GCMSIV_polyval_vaes_four_done +L_AES_GCMSIV_polyval_vaes_four: + leaq (%rdx,%rax,1), %r10 + vmovdqu (%r10), %ymm6 + vmovdqu 32(%r10), %ymm7 + vpxor %ymm12, %ymm12, %ymm12 + vinserti128 $0x00, %xmm0, %ymm12, %ymm12 + vpxor %ymm12, %ymm6, %ymm6 + vpclmulqdq $0x00, %ymm3, 
%ymm6, %ymm10 + vpclmulqdq $0x11, %ymm3, %ymm6, %ymm11 + vpclmulqdq $16, %ymm3, %ymm6, %ymm14 + vpclmulqdq $0x01, %ymm3, %ymm6, %ymm15 + vpxor %ymm15, %ymm14, %ymm14 + vpslldq $8, %ymm14, %ymm15 + vpsrldq $8, %ymm14, %ymm14 + vpxor %ymm15, %ymm10, %ymm10 + vpxor %ymm14, %ymm11, %ymm11 + vpclmulqdq $0x00, %ymm2, %ymm7, %ymm12 + vpclmulqdq $0x11, %ymm2, %ymm7, %ymm13 + vpclmulqdq $16, %ymm2, %ymm7, %ymm14 + vpclmulqdq $0x01, %ymm2, %ymm7, %ymm15 + vpxor %ymm15, %ymm14, %ymm14 + vpslldq $8, %ymm14, %ymm15 + vpsrldq $8, %ymm14, %ymm14 + vpxor %ymm15, %ymm12, %ymm12 + vpxor %ymm14, %ymm13, %ymm13 + vpxor %ymm12, %ymm10, %ymm10 + vpxor %ymm13, %ymm11, %ymm11 + vextracti128 $0x01, %ymm10, %xmm12 + vpxor %xmm12, %xmm10, %xmm14 + vextracti128 $0x01, %ymm11, %xmm12 + vpxor %xmm12, %xmm11, %xmm15 + vpsrld $31, %xmm14, %xmm9 + vpsrld $31, %xmm15, %xmm10 + vpslld $0x01, %xmm14, %xmm14 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpor %xmm9, %xmm14, %xmm14 + vpor %xmm10, %xmm15, %xmm15 + vpor %xmm11, %xmm15, %xmm15 + vpslld $31, %xmm14, %xmm9 + vpslld $30, %xmm14, %xmm10 + vpslld $25, %xmm14, %xmm11 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm11, %xmm9, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + vpsrld $0x01, %xmm14, %xmm6 + vpsrld $2, %xmm14, %xmm7 + vpsrld $7, %xmm14, %xmm8 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm8, %xmm6, %xmm6 + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm14, %xmm15, %xmm0 + addl $0x40, %eax + cmpl %r9d, %eax + jl L_AES_GCMSIV_polyval_vaes_four +L_AES_GCMSIV_polyval_vaes_four_done: + movl %r8d, %r9d + andl $0xffffffe0, %r9d + cmpl %r9d, %eax + je L_AES_GCMSIV_polyval_vaes_pair_done +L_AES_GCMSIV_polyval_vaes_pair: + vmovdqu (%rdx,%rax,1), %ymm6 + vpxor %ymm12, %ymm12, %ymm12 + vinserti128 $0x00, %xmm0, %ymm12, %ymm12 + vpxor %ymm12, %ymm6, %ymm6 + vpclmulqdq $0x00, %ymm2, %ymm6, %ymm10 + vpclmulqdq $0x11, %ymm2, 
%ymm6, %ymm11 + vpclmulqdq $16, %ymm2, %ymm6, %ymm14 + vpclmulqdq $0x01, %ymm2, %ymm6, %ymm15 + vpxor %ymm15, %ymm14, %ymm14 + vpslldq $8, %ymm14, %ymm15 + vpsrldq $8, %ymm14, %ymm14 + vpxor %ymm15, %ymm10, %ymm10 + vpxor %ymm14, %ymm11, %ymm11 + vextracti128 $0x01, %ymm10, %xmm12 + vpxor %xmm12, %xmm10, %xmm14 + vextracti128 $0x01, %ymm11, %xmm12 + vpxor %xmm12, %xmm11, %xmm15 + vpsrld $31, %xmm14, %xmm9 + vpsrld $31, %xmm15, %xmm10 + vpslld $0x01, %xmm14, %xmm14 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpor %xmm9, %xmm14, %xmm14 + vpor %xmm10, %xmm15, %xmm15 + vpor %xmm11, %xmm15, %xmm15 + vpslld $31, %xmm14, %xmm9 + vpslld $30, %xmm14, %xmm10 + vpslld $25, %xmm14, %xmm11 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm11, %xmm9, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + vpsrld $0x01, %xmm14, %xmm6 + vpsrld $2, %xmm14, %xmm7 + vpsrld $7, %xmm14, %xmm8 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm8, %xmm6, %xmm6 + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm14, %xmm15, %xmm0 + addl $32, %eax + cmpl %r9d, %eax + jl L_AES_GCMSIV_polyval_vaes_pair +L_AES_GCMSIV_polyval_vaes_pair_done: + cmpl %r8d, %eax + je L_AES_GCMSIV_polyval_vaes_done + vpxor (%rdx,%rax,1), %xmm0, %xmm0 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm14 + vpclmulqdq $0x11, %xmm1, %xmm0, %xmm15 + vpclmulqdq $16, %xmm1, %xmm0, %xmm12 + vpclmulqdq $0x01, %xmm1, %xmm0, %xmm6 + vpxor %xmm6, %xmm12, %xmm12 + vpslldq $8, %xmm12, %xmm6 + vpsrldq $8, %xmm12, %xmm12 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm12, %xmm15, %xmm15 + vpsrld $31, %xmm14, %xmm9 + vpsrld $31, %xmm15, %xmm10 + vpslld $0x01, %xmm14, %xmm14 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpor %xmm9, %xmm14, %xmm14 + vpor %xmm10, %xmm15, %xmm15 + vpor %xmm11, %xmm15, %xmm15 + vpslld $31, %xmm14, %xmm9 + vpslld $30, %xmm14, %xmm10 + vpslld 
$25, %xmm14, %xmm11 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm11, %xmm9, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + vpsrld $0x01, %xmm14, %xmm6 + vpsrld $2, %xmm14, %xmm7 + vpsrld $7, %xmm14, %xmm8 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm8, %xmm6, %xmm6 + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm6, %xmm14, %xmm14 + vpxor %xmm14, %xmm15, %xmm0 +L_AES_GCMSIV_polyval_vaes_done: + vpshufb L_aes_gcm_siv_bswap_mask_vaes(%rip), %xmm0, %xmm0 + vmovdqu %xmm0, (%rdi) + repz retq +#ifndef __APPLE__ +.size AES_GCMSIV_polyval_vaes,.-AES_GCMSIV_polyval_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_aes_gcmsiv_ctr_inc_vaes: /* 17 128-bit constants 0..16 (value in the low quadword); entry i sits at byte offset 16*i */ +.quad 0x0000000000000000,0x0000000000000000 +.quad 0x0000000000000001,0x0000000000000000 +.quad 0x0000000000000002,0x0000000000000000 +.quad 0x0000000000000003,0x0000000000000000 +.quad 0x0000000000000004,0x0000000000000000 +.quad 0x0000000000000005,0x0000000000000000 +.quad 0x0000000000000006,0x0000000000000000 +.quad 0x0000000000000007,0x0000000000000000 +.quad 0x0000000000000008,0x0000000000000000 +.quad 0x0000000000000009,0x0000000000000000 +.quad 0x000000000000000a,0x0000000000000000 +.quad 0x000000000000000b,0x0000000000000000 +.quad 0x000000000000000c,0x0000000000000000 +.quad 0x000000000000000d,0x0000000000000000 +.quad 0x000000000000000e,0x0000000000000000 +.quad 0x000000000000000f,0x0000000000000000 +.quad 0x0000000000000010,0x0000000000000000 +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_ctr_vaes +.type AES_GCMSIV_ctr_vaes,@function +.align 16 +AES_GCMSIV_ctr_vaes: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_ctr_vaes +.p2align 4 +_AES_GCMSIV_ctr_vaes: +#endif /* __APPLE__ */ + pushq %rbx /* AES_GCMSIV_ctr_vaes: counter-mode keystream XOR, two AES blocks per ymm register. As used below: %rdi = input, %rsi = output, %edx = length in bytes, %rcx = round keys (16 bytes each), %r8d = number of AES rounds, %r9 = 16-byte counter block (read here, written back at exit). %rbx is saved because it serves as the output cursor. Counters are advanced with vpaddd and the table above, so only the low 32-bit word of the block changes and it wraps without carry. */ + vbroadcasti128 (%r9), %ymm7 /* ymm7 = counter block in both 128-bit lanes */ + vbroadcasti128 128+L_aes_gcmsiv_ctr_inc_vaes(%rip), %ymm8 /* ymm8 = 8 in both lanes */ + vbroadcasti128 32+L_aes_gcmsiv_ctr_inc_vaes(%rip), %ymm9 /* ymm9 = 2 in both lanes */ + vbroadcasti128
16+L_aes_gcmsiv_ctr_inc_vaes(%rip), %ymm10 /* ymm10 = 1 in both lanes */ + xorl %eax, %eax /* eax = byte offset processed so far */ + cmpl $0x80, %edx + movl %edx, %r10d + jl L_AES_GCMSIV_ctr_vaes_done_128 /* fewer than 128 bytes (signed compare): skip the 8-block loop */ + andl $0xffffff80, %r10d /* r10d = length rounded down to a multiple of 128 */ + vpaddd 0+L_aes_gcmsiv_ctr_inc_vaes(%rip), %ymm7, %ymm4 + vpaddd 32+L_aes_gcmsiv_ctr_inc_vaes(%rip), %ymm7, %ymm5 + vpaddd 64+L_aes_gcmsiv_ctr_inc_vaes(%rip), %ymm7, %ymm6 + vpaddd 96+L_aes_gcmsiv_ctr_inc_vaes(%rip), %ymm7, %ymm7 /* ymm4..ymm7 = counter plus 0/1, 2/3, 4/5, 6/7 */ +L_AES_GCMSIV_ctr_vaes_enc_128: + # 128 bytes of input + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %rbx + vmovdqa %ymm4, %ymm0 + vmovdqa %ymm5, %ymm1 + vmovdqa %ymm6, %ymm2 + vmovdqa %ymm7, %ymm3 + vpaddd %ymm8, %ymm4, %ymm4 + vpaddd %ymm8, %ymm5, %ymm5 + vpaddd %ymm8, %ymm6, %ymm6 + vpaddd %ymm8, %ymm7, %ymm7 /* advance all eight counters by 8 for the next iteration */ + # aes_enc_block + vbroadcasti128 (%rcx), %ymm11 /* each round key is broadcast to both lanes, then applied to the four registers */ + vpxor %ymm11, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm11, %ymm2, %ymm2 + vpxor %ymm11, %ymm3, %ymm3 + vbroadcasti128 16(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + vbroadcasti128 32(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + vbroadcasti128 48(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + vbroadcasti128 64(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + vbroadcasti128 80(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + vbroadcasti128 96(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + vbroadcasti128 112(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + vbroadcasti128
128(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + vbroadcasti128 144(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm11 + jl L_AES_GCMSIV_ctr_vaes_128_aes_enc_block_last /* fewer than 11 rounds: the key at offset 160 is the last round key */ + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + vbroadcasti128 176(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm11 + jl L_AES_GCMSIV_ctr_vaes_128_aes_enc_block_last /* fewer than 13 rounds: the key at offset 192 is the last round key */ + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + vbroadcasti128 208(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vaesenc %ymm11, %ymm1, %ymm1 + vaesenc %ymm11, %ymm2, %ymm2 + vaesenc %ymm11, %ymm3, %ymm3 + vbroadcasti128 224(%rcx), %ymm11 /* otherwise the key at offset 224 is the last round key */ +L_AES_GCMSIV_ctr_vaes_128_aes_enc_block_last: + vaesenclast %ymm11, %ymm0, %ymm0 + vaesenclast %ymm11, %ymm1, %ymm1 + vaesenclast %ymm11, %ymm2, %ymm2 + vaesenclast %ymm11, %ymm3, %ymm3 + vpxor (%r11), %ymm0, %ymm0 + vpxor 32(%r11), %ymm1, %ymm1 + vpxor 64(%r11), %ymm2, %ymm2 + vpxor 96(%r11), %ymm3, %ymm3 /* keystream XOR 128 input bytes */ + vmovdqu %ymm0, (%rbx) + vmovdqu %ymm1, 32(%rbx) + vmovdqu %ymm2, 64(%rbx) + vmovdqu %ymm3, 96(%rbx) + addl $0x80, %eax + cmpl %r10d, %eax + jl L_AES_GCMSIV_ctr_vaes_enc_128 + vperm2i128 $0x00, %ymm4, %ymm4, %ymm7 /* ymm7 = low lane of ymm4 (the next unused counter) in both lanes */ +L_AES_GCMSIV_ctr_vaes_done_128: + movl %edx, %r10d + andl $0xffffffe0, %r10d /* r10d = length rounded down to a multiple of 32 */ + cmpl %r10d, %eax + je L_AES_GCMSIV_ctr_vaes_done_32 +L_AES_GCMSIV_ctr_vaes_enc_32: + # 32 bytes of input + # siv_aes_ctr_enc_32 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %rbx + vpaddd 0+L_aes_gcmsiv_ctr_inc_vaes(%rip), %ymm7, %ymm0 /* ymm0 = counter plus 0 / plus 1 */ + vpaddd %ymm9, %ymm7, %ymm7 /* advance the counter by 2 */ + # aes_enc_block +
vbroadcasti128 (%rcx), %ymm11 + vpxor %ymm11, %ymm0, %ymm0 + vbroadcasti128 16(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vbroadcasti128 32(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vbroadcasti128 48(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vbroadcasti128 64(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vbroadcasti128 80(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vbroadcasti128 96(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vbroadcasti128 112(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vbroadcasti128 128(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vbroadcasti128 144(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm11 + jl L_AES_GCMSIV_ctr_vaes_32_aes_enc_block_last + vaesenc %ymm11, %ymm0, %ymm0 + vbroadcasti128 176(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm11 + jl L_AES_GCMSIV_ctr_vaes_32_aes_enc_block_last + vaesenc %ymm11, %ymm0, %ymm0 + vbroadcasti128 208(%rcx), %ymm11 + vaesenc %ymm11, %ymm0, %ymm0 + vbroadcasti128 224(%rcx), %ymm11 +L_AES_GCMSIV_ctr_vaes_32_aes_enc_block_last: + vaesenclast %ymm11, %ymm0, %ymm0 + vpxor (%r11), %ymm0, %ymm0 + vmovdqu %ymm0, (%rbx) + addl $32, %eax + cmpl %r10d, %eax + jl L_AES_GCMSIV_ctr_vaes_enc_32 +L_AES_GCMSIV_ctr_vaes_done_32: + cmpl %edx, %eax + movl %edx, %r10d + je L_AES_GCMSIV_ctr_vaes_done_enc /* offset equals length: nothing left */ + andl $0xfffffff0, %r10d /* r10d = length rounded down to a multiple of 16. NOTE(review): the loop below is bottom-tested, so if fewer than 16 bytes remain here (length not a multiple of 16) one full 16-byte block is still read and written - presumably callers only pass whole blocks; confirm */ +L_AES_GCMSIV_ctr_vaes_enc_16: + # 16 bytes of input + vmovdqa %xmm7, %xmm0 /* xmm0 = current counter block */ + vpaddd %ymm10, %ymm7, %ymm7 /* advance the counter by 1 */ + # aes_enc_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesenc
%xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_GCMSIV_ctr_vaes_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_GCMSIV_ctr_vaes_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_GCMSIV_ctr_vaes_16_aes_enc_block_last: /* xmm5 holds the last round key on every path into this label */ + vaesenclast %xmm5, %xmm0, %xmm0 + leaq (%rdi,%rax,1), %r11 + vpxor (%r11), %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_GCMSIV_ctr_vaes_enc_16 +L_AES_GCMSIV_ctr_vaes_done_enc: + vmovdqu %xmm7, (%r9) /* write the advanced counter block back to (%r9) */ + popq %rbx + repz retq /* NOTE(review): no vzeroupper is executed before returning although ymm registers were written - confirm this is intended */ +#ifndef __APPLE__ +.size AES_GCMSIV_ctr_vaes,.-AES_GCMSIV_ctr_vaes +#endif /* __APPLE__ */ +#endif /* WOLFSSL_AESGCM_SIV */ #endif /* HAVE_INTEL_VAES */ #ifdef HAVE_INTEL_AVX512 #ifndef __APPLE__ @@ -30791,6 +32619,1005 @@ L_AES_GCM_decrypt_final_avx512_cmp_tag_done: .size AES_GCM_decrypt_final_avx512,.-AES_GCM_decrypt_final_avx512 #endif /* __APPLE__ */ #endif /* WOLFSSL_AESGCM_STREAM */ +#ifdef WOLFSSL_AESGCM_SIV +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_gcm_siv_bswap_mask_avx512: /* vpshufb mask that reverses the order of the 16 bytes */ +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_polyval_avx512 +.type AES_GCMSIV_polyval_avx512,@function +.align 16 +AES_GCMSIV_polyval_avx512: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_polyval_avx512 +.p2align 4 +_AES_GCMSIV_polyval_avx512: +#endif /* __APPLE__ */ + vmovdqu (%rsi), %xmm1 /* AES_GCMSIV_polyval_avx512 entry: xmm1 = 16-byte multiplier read from (%rsi) */ + vmovdqu (%rdi), %xmm0 /* xmm0 = 16-byte accumulator read from (%rdi) */ + vpshufb L_aes_gcm_siv_bswap_mask_avx512(%rip), %xmm0, %xmm0 /* byte-reverse the accumulator */ + vmovdqa64 %xmm1, %xmm20 + vinserti32x4 $3, %xmm20, %zmm2, %zmm2 /* zmm2 lane 3 = the multiplier itself (power 1) */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq
$16, %xmm1, %xmm20, %xmm19 /* AES_GCMSIV_polyval_avx512, power-table build: xmm1 is the multiplier loaded from (%rsi); xmm20 is multiplied by xmm1 again and again (four carry-less multiplies, a one-bit left shift of the 256-bit product, then reduction to 128 bits) and each successive power is stored in one 128-bit lane of zmm2..zmm5 */ + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 /* xmm17 = low 128 bits, xmm18 = high 128 bits of the product */ + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 /* imm 0x96 = three-way XOR; xmm18:xmm17 is now the product shifted left by one bit */ + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 /* xmm20 = product reduced to 128 bits */ + vinserti32x4 $2, %xmm20, %zmm2, %zmm2 /* power 2 to zmm2 lane 2 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 +
vinserti32x4 $0x01, %xmm20, %zmm2, %zmm2 /* power 3 to zmm2 lane 1 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $0x00, %xmm20, %zmm2, %zmm2 /* power 4 to zmm2 lane 0 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17,
%xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $3, %xmm20, %zmm3, %zmm3 /* power 5 to zmm3 lane 3 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $2, %xmm20, %zmm3, %zmm3 /* power 6 to zmm3 lane 2 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 +
vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $0x01, %xmm20, %zmm3, %zmm3 /* power 7 to zmm3 lane 1 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $0x00, %xmm20, %zmm3, %zmm3 /* power 8 to zmm3 lane 0 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31,
%xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $3, %xmm20, %zmm4, %zmm4 /* power 9 to zmm4 lane 3 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $2, %xmm20, %zmm4, %zmm4 /* power 10 to zmm4 lane 2 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq
$4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $0x01, %xmm20, %zmm4, %zmm4 /* power 11 to zmm4 lane 1 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $0x00, %xmm20, %zmm4, %zmm4 /* power 12 to zmm4 lane 0 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31,
%xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $3, %xmm20, %zmm5, %zmm5 /* power 13 to zmm5 lane 3 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $2, %xmm20, %zmm5, %zmm5 /* power 14 to zmm5 lane 2 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 +
vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $0x01, %xmm20, %zmm5, %zmm5 /* power 15 to zmm5 lane 1 */ + vpclmulqdq $0x00, %xmm1, %xmm20, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm20, %xmm18 + vpclmulqdq $16, %xmm1, %xmm20, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm20, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm20 + vinserti32x4 $0x00, %xmm20, %zmm5, %zmm5 /* power 16 to zmm5 lane 0 */ + movl %ecx, %r8d + shll $4, %r8d /* r8d = block count in %ecx times 16 = length in bytes */ + xorl %eax, %eax /* eax = byte offset into the data at (%rdx) */ + movl
%r8d, %r9d + andl $0xffffff00, %r9d /* r9d = length rounded down to a multiple of 256 */ + cmpl %r9d, %eax + je L_AES_GCMSIV_polyval_avx512_sixteen_done +L_AES_GCMSIV_polyval_avx512_sixteen: /* 16 blocks (256 bytes) per iteration */ + leaq (%rdx,%rax,1), %r10 + vmovdqu64 (%r10), %zmm6 + vmovdqu64 64(%r10), %zmm7 + vmovdqu64 128(%r10), %zmm8 + vmovdqu64 192(%r10), %zmm9 + vpxorq %zmm16, %zmm16, %zmm16 + vinserti32x4 $0x00, %xmm0, %zmm16, %zmm16 + vpxorq %zmm16, %zmm6, %zmm6 /* XOR the accumulator xmm0 into the first block only (lane 0 of zmm6) */ + vpclmulqdq $0x00, %zmm5, %zmm6, %zmm10 /* blocks 0..3 times powers 16..13; zmm10/zmm11 collect the low/high product halves */ + vpclmulqdq $0x11, %zmm5, %zmm6, %zmm11 + vpclmulqdq $16, %zmm5, %zmm6, %zmm14 + vpclmulqdq $0x01, %zmm5, %zmm6, %zmm15 + vpxorq %zmm15, %zmm14, %zmm14 + vpslldq $8, %zmm14, %zmm15 + vpsrldq $8, %zmm14, %zmm14 + vpxorq %zmm15, %zmm10, %zmm10 + vpxorq %zmm14, %zmm11, %zmm11 + vpclmulqdq $0x00, %zmm4, %zmm7, %zmm12 /* blocks 4..7 times powers 12..9 */ + vpclmulqdq $0x11, %zmm4, %zmm7, %zmm13 + vpclmulqdq $16, %zmm4, %zmm7, %zmm14 + vpclmulqdq $0x01, %zmm4, %zmm7, %zmm15 + vpxorq %zmm15, %zmm14, %zmm14 + vpslldq $8, %zmm14, %zmm15 + vpsrldq $8, %zmm14, %zmm14 + vpxorq %zmm15, %zmm12, %zmm12 + vpxorq %zmm14, %zmm13, %zmm13 + vpxorq %zmm12, %zmm10, %zmm10 + vpxorq %zmm13, %zmm11, %zmm11 + vpclmulqdq $0x00, %zmm3, %zmm8, %zmm12 /* blocks 8..11 times powers 8..5 */ + vpclmulqdq $0x11, %zmm3, %zmm8, %zmm13 + vpclmulqdq $16, %zmm3, %zmm8, %zmm14 + vpclmulqdq $0x01, %zmm3, %zmm8, %zmm15 + vpxorq %zmm15, %zmm14, %zmm14 + vpslldq $8, %zmm14, %zmm15 + vpsrldq $8, %zmm14, %zmm14 + vpxorq %zmm15, %zmm12, %zmm12 + vpxorq %zmm14, %zmm13, %zmm13 + vpxorq %zmm12, %zmm10, %zmm10 + vpxorq %zmm13, %zmm11, %zmm11 + vpclmulqdq $0x00, %zmm2, %zmm9, %zmm12 /* blocks 12..15 times powers 4..1 */ + vpclmulqdq $0x11, %zmm2, %zmm9, %zmm13 + vpclmulqdq $16, %zmm2, %zmm9, %zmm14 + vpclmulqdq $0x01, %zmm2, %zmm9, %zmm15 + vpxorq %zmm15, %zmm14, %zmm14 + vpslldq $8, %zmm14, %zmm15 + vpsrldq $8, %zmm14, %zmm14 + vpxorq %zmm15, %zmm12, %zmm12 + vpxorq %zmm14, %zmm13, %zmm13 + vpxorq %zmm12, %zmm10, %zmm10 + vpxorq %zmm13, %zmm11, %zmm11 + vextracti32x4 $0x01, %zmm10, %xmm19 + vextracti32x4 $2, %zmm10, %xmm21 + vextracti32x4 $3, %zmm10, %xmm22 + vpxorq %xmm19, %xmm10, %xmm17 + vpternlogq $0x96,
%xmm21, %xmm22, %xmm17 /* xmm17 = XOR of the four 128-bit lanes of zmm10 (low halves) */ + vextracti32x4 $0x01, %zmm11, %xmm19 + vextracti32x4 $2, %zmm11, %xmm21 + vextracti32x4 $3, %zmm11, %xmm22 + vpxorq %xmm19, %xmm11, %xmm18 + vpternlogq $0x96, %xmm21, %xmm22, %xmm18 /* xmm18 = XOR of the four lanes of zmm11 (high halves) */ + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm0 /* xmm0 = new accumulator after the one-bit shift and reduction */ + addl $0x100, %eax + cmpl %r9d, %eax + jl L_AES_GCMSIV_polyval_avx512_sixteen +L_AES_GCMSIV_polyval_avx512_sixteen_done: + movl %r8d, %r9d + andl $0xffffff80, %r9d /* r9d = length rounded down to a multiple of 128 */ + cmpl %r9d, %eax + jge L_AES_GCMSIV_polyval_avx512_eight_done /* no 8-block group left; the step below has no back-edge, so it runs at most once */ + leaq (%rdx,%rax,1), %r10 + vmovdqu64 (%r10), %zmm6 + vmovdqu64 64(%r10), %zmm7 + vpxorq %zmm16, %zmm16, %zmm16 + vinserti32x4 $0x00, %xmm0, %zmm16, %zmm16 + vpxorq %zmm16, %zmm6, %zmm6 + vpclmulqdq $0x00, %zmm3, %zmm6, %zmm10 /* blocks 0..3 times powers 8..5 */ + vpclmulqdq $0x11, %zmm3, %zmm6, %zmm11 + vpclmulqdq $16, %zmm3, %zmm6, %zmm14 + vpclmulqdq $0x01, %zmm3, %zmm6, %zmm15 + vpxorq %zmm15, %zmm14, %zmm14 + vpslldq $8, %zmm14, %zmm15 + vpsrldq $8, %zmm14, %zmm14 + vpxorq %zmm15, %zmm10, %zmm10 + vpxorq %zmm14, %zmm11, %zmm11 + vpclmulqdq $0x00, %zmm2, %zmm7, %zmm12 /* blocks 4..7 times powers 4..1 */ + vpclmulqdq $0x11, %zmm2, %zmm7, %zmm13 + vpclmulqdq $16, %zmm2, %zmm7, %zmm14 + vpclmulqdq $0x01, %zmm2, %zmm7, %zmm15 + vpxorq %zmm15, %zmm14, %zmm14 + vpslldq $8, %zmm14, %zmm15 + vpsrldq $8, %zmm14, %zmm14 + vpxorq %zmm15, %zmm12, %zmm12 + vpxorq %zmm14, %zmm13, %zmm13 + vpxorq %zmm12, %zmm10, %zmm10 +
vpxorq %zmm13, %zmm11, %zmm11 + vextracti32x4 $0x01, %zmm10, %xmm19 + vextracti32x4 $2, %zmm10, %xmm21 + vextracti32x4 $3, %zmm10, %xmm22 + vpxorq %xmm19, %xmm10, %xmm17 + vpternlogq $0x96, %xmm21, %xmm22, %xmm17 + vextracti32x4 $0x01, %zmm11, %xmm19 + vextracti32x4 $2, %zmm11, %xmm21 + vextracti32x4 $3, %zmm11, %xmm22 + vpxorq %xmm19, %xmm11, %xmm18 + vpternlogq $0x96, %xmm21, %xmm22, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10, %xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm0 + addl $0x80, %eax +L_AES_GCMSIV_polyval_avx512_eight_done: +L_AES_GCMSIV_polyval_avx512_rem: /* one 16-byte block per iteration until eax reaches r8d */ + cmpl %r8d, %eax + jge L_AES_GCMSIV_polyval_avx512_done + vpxorq (%rdx,%rax,1), %xmm0, %xmm0 /* accumulator XOR next block, then multiply by xmm1 */ + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm17 + vpclmulqdq $0x11, %xmm1, %xmm0, %xmm18 + vpclmulqdq $16, %xmm1, %xmm0, %xmm19 + vpclmulqdq $0x01, %xmm1, %xmm0, %xmm6 + vpxorq %xmm6, %xmm19, %xmm19 + vpslldq $8, %xmm19, %xmm6 + vpsrldq $8, %xmm19, %xmm19 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm19, %xmm18, %xmm18 + vpsrld $31, %xmm17, %xmm9 + vpsrld $31, %xmm18, %xmm10 + vpslld $0x01, %xmm17, %xmm17 + vpslld $0x01, %xmm18, %xmm18 + vpsrldq $12, %xmm9, %xmm11 + vpslldq $4, %xmm10, %xmm10 + vpslldq $4, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpternlogq $0x96, %xmm10, %xmm11, %xmm18 + vpslld $31, %xmm17, %xmm9 + vpslld $30, %xmm17, %xmm10 + vpslld $25, %xmm17, %xmm11 + vpternlogq $0x96, %xmm10,
%xmm11, %xmm9 + vpsrldq $4, %xmm9, %xmm10 + vpslldq $12, %xmm9, %xmm9 + vpxorq %xmm9, %xmm17, %xmm17 + vpsrld $0x01, %xmm17, %xmm6 + vpsrld $2, %xmm17, %xmm7 + vpsrld $7, %xmm17, %xmm8 + vpternlogq $0x96, %xmm7, %xmm8, %xmm6 + vpxorq %xmm10, %xmm6, %xmm6 + vpxorq %xmm6, %xmm17, %xmm17 + vpxorq %xmm17, %xmm18, %xmm0 + addl $16, %eax + jmp L_AES_GCMSIV_polyval_avx512_rem +L_AES_GCMSIV_polyval_avx512_done: + vpshufb L_aes_gcm_siv_bswap_mask_avx512(%rip), %xmm0, %xmm0 /* shuffle the accumulator bytes with the bswap mask before storing */ + vmovdqu %xmm0, (%rdi) /* write the accumulator back to (%rdi) */ + repz retq /* NOTE(review): no vzeroupper is executed before returning although zmm registers were written - confirm this is intended */ +#ifndef __APPLE__ +.size AES_GCMSIV_polyval_avx512,.-AES_GCMSIV_polyval_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_aes_gcmsiv_ctr_inc_avx512: /* 17 128-bit constants 0..16 (value in the low quadword); entry i sits at byte offset 16*i */ +.quad 0x0000000000000000,0x0000000000000000 +.quad 0x0000000000000001,0x0000000000000000 +.quad 0x0000000000000002,0x0000000000000000 +.quad 0x0000000000000003,0x0000000000000000 +.quad 0x0000000000000004,0x0000000000000000 +.quad 0x0000000000000005,0x0000000000000000 +.quad 0x0000000000000006,0x0000000000000000 +.quad 0x0000000000000007,0x0000000000000000 +.quad 0x0000000000000008,0x0000000000000000 +.quad 0x0000000000000009,0x0000000000000000 +.quad 0x000000000000000a,0x0000000000000000 +.quad 0x000000000000000b,0x0000000000000000 +.quad 0x000000000000000c,0x0000000000000000 +.quad 0x000000000000000d,0x0000000000000000 +.quad 0x000000000000000e,0x0000000000000000 +.quad 0x000000000000000f,0x0000000000000000 +.quad 0x0000000000000010,0x0000000000000000 +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_ctr_avx512 +.type AES_GCMSIV_ctr_avx512,@function +.align 16 +AES_GCMSIV_ctr_avx512: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_ctr_avx512 +.p2align 4 +_AES_GCMSIV_ctr_avx512: +#endif /* __APPLE__ */ + pushq %rbx + vbroadcasti32x4 (%r9), %zmm7 + vbroadcasti32x4 256+L_aes_gcmsiv_ctr_inc_avx512(%rip), %zmm8 + vbroadcasti32x4 64+L_aes_gcmsiv_ctr_inc_avx512(%rip), %zmm9 + vbroadcasti32x4
16+L_aes_gcmsiv_ctr_inc_avx512(%rip), %zmm10 + xorl %eax, %eax + cmpl $0x40, %edx + jl L_AES_GCMSIV_ctr_avx512_done_64 + vbroadcasti32x4 (%rcx), %zmm12 + vbroadcasti32x4 16(%rcx), %zmm13 + vbroadcasti32x4 32(%rcx), %zmm14 + vbroadcasti32x4 48(%rcx), %zmm15 + vbroadcasti32x4 64(%rcx), %zmm16 + vbroadcasti32x4 80(%rcx), %zmm17 + vbroadcasti32x4 96(%rcx), %zmm18 + vbroadcasti32x4 112(%rcx), %zmm19 + vbroadcasti32x4 128(%rcx), %zmm20 + vbroadcasti32x4 144(%rcx), %zmm21 + vbroadcasti32x4 160(%rcx), %zmm22 + cmpl $11, %r8d + jl L_AES_GCMSIV_ctr_avx512_key_cached + vbroadcasti32x4 176(%rcx), %zmm23 + vbroadcasti32x4 192(%rcx), %zmm24 + cmpl $13, %r8d + jl L_AES_GCMSIV_ctr_avx512_key_cached + vbroadcasti32x4 208(%rcx), %zmm25 + vbroadcasti32x4 224(%rcx), %zmm26 +L_AES_GCMSIV_ctr_avx512_key_cached: + cmpl $0x100, %edx + movl %edx, %r10d + jl L_AES_GCMSIV_ctr_avx512_done_256 + andl $0xffffff00, %r10d + vpaddd 0+L_aes_gcmsiv_ctr_inc_avx512(%rip), %zmm7, %zmm4 + vpaddd 64+L_aes_gcmsiv_ctr_inc_avx512(%rip), %zmm7, %zmm5 + vpaddd 128+L_aes_gcmsiv_ctr_inc_avx512(%rip), %zmm7, %zmm6 + vpaddd 192+L_aes_gcmsiv_ctr_inc_avx512(%rip), %zmm7, %zmm7 +L_AES_GCMSIV_ctr_avx512_enc_256: + # 256 bytes of input + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %rbx + vmovdqa64 %zmm4, %zmm0 + vmovdqa64 %zmm5, %zmm1 + vmovdqa64 %zmm6, %zmm2 + vmovdqa64 %zmm7, %zmm3 + vpaddd %zmm8, %zmm4, %zmm4 + vpaddd %zmm8, %zmm5, %zmm5 + vpaddd %zmm8, %zmm6, %zmm6 + vpaddd %zmm8, %zmm7, %zmm7 + # aes_enc_block + vpxorq %zmm12, %zmm0, %zmm0 + vpxorq %zmm12, %zmm1, %zmm1 + vpxorq %zmm12, %zmm2, %zmm2 + vpxorq %zmm12, %zmm3, %zmm3 + vaesenc %zmm13, %zmm0, %zmm0 + vaesenc %zmm13, %zmm1, %zmm1 + vaesenc %zmm13, %zmm2, %zmm2 + vaesenc %zmm13, %zmm3, %zmm3 + vaesenc %zmm14, %zmm0, %zmm0 + vaesenc %zmm14, %zmm1, %zmm1 + vaesenc %zmm14, %zmm2, %zmm2 + vaesenc %zmm14, %zmm3, %zmm3 + vaesenc %zmm15, %zmm0, %zmm0 + vaesenc %zmm15, %zmm1, %zmm1 + vaesenc %zmm15, %zmm2, %zmm2 + vaesenc %zmm15, %zmm3, %zmm3 + vaesenc %zmm16, 
%zmm0, %zmm0 + vaesenc %zmm16, %zmm1, %zmm1 + vaesenc %zmm16, %zmm2, %zmm2 + vaesenc %zmm16, %zmm3, %zmm3 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm17, %zmm1, %zmm1 + vaesenc %zmm17, %zmm2, %zmm2 + vaesenc %zmm17, %zmm3, %zmm3 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm18, %zmm1, %zmm1 + vaesenc %zmm18, %zmm2, %zmm2 + vaesenc %zmm18, %zmm3, %zmm3 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm19, %zmm1, %zmm1 + vaesenc %zmm19, %zmm2, %zmm2 + vaesenc %zmm19, %zmm3, %zmm3 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm20, %zmm1, %zmm1 + vaesenc %zmm20, %zmm2, %zmm2 + vaesenc %zmm20, %zmm3, %zmm3 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm21, %zmm1, %zmm1 + vaesenc %zmm21, %zmm2, %zmm2 + vaesenc %zmm21, %zmm3, %zmm3 + cmpl $11, %r8d + vmovdqa64 %zmm22, %zmm11 + jl L_AES_GCMSIV_ctr_avx512_256_aes_enc_block_last + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm22, %zmm1, %zmm1 + vaesenc %zmm22, %zmm2, %zmm2 + vaesenc %zmm22, %zmm3, %zmm3 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm23, %zmm1, %zmm1 + vaesenc %zmm23, %zmm2, %zmm2 + vaesenc %zmm23, %zmm3, %zmm3 + cmpl $13, %r8d + vmovdqa64 %zmm24, %zmm11 + jl L_AES_GCMSIV_ctr_avx512_256_aes_enc_block_last + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm24, %zmm1, %zmm1 + vaesenc %zmm24, %zmm2, %zmm2 + vaesenc %zmm24, %zmm3, %zmm3 + vaesenc %zmm25, %zmm0, %zmm0 + vaesenc %zmm25, %zmm1, %zmm1 + vaesenc %zmm25, %zmm2, %zmm2 + vaesenc %zmm25, %zmm3, %zmm3 + vmovdqa64 %zmm26, %zmm11 +L_AES_GCMSIV_ctr_avx512_256_aes_enc_block_last: + vaesenclast %zmm11, %zmm0, %zmm0 + vaesenclast %zmm11, %zmm1, %zmm1 + vaesenclast %zmm11, %zmm2, %zmm2 + vaesenclast %zmm11, %zmm3, %zmm3 + vpxorq (%r11), %zmm0, %zmm0 + vpxorq 64(%r11), %zmm1, %zmm1 + vpxorq 128(%r11), %zmm2, %zmm2 + vpxorq 192(%r11), %zmm3, %zmm3 + vmovdqu64 %zmm0, (%rbx) + vmovdqu64 %zmm1, 64(%rbx) + vmovdqu64 %zmm2, 128(%rbx) + vmovdqu64 %zmm3, 192(%rbx) + addl $0x100, %eax + cmpl %r10d, %eax + jl L_AES_GCMSIV_ctr_avx512_enc_256 + vshufi64x2 $0x00, %zmm4, %zmm4, %zmm7 
+L_AES_GCMSIV_ctr_avx512_done_256: + movl %edx, %r10d + andl $0xffffffc0, %r10d + cmpl %r10d, %eax + je L_AES_GCMSIV_ctr_avx512_done_64 +L_AES_GCMSIV_ctr_avx512_enc_64: + # 64 bytes of input + # siv_ctr_enc_64 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %rbx + vpaddd 0+L_aes_gcmsiv_ctr_inc_avx512(%rip), %zmm7, %zmm0 + vpaddd %zmm9, %zmm7, %zmm7 + # aes_enc_block + vpxorq %zmm12, %zmm0, %zmm0 + vaesenc %zmm13, %zmm0, %zmm0 + vaesenc %zmm14, %zmm0, %zmm0 + vaesenc %zmm15, %zmm0, %zmm0 + vaesenc %zmm16, %zmm0, %zmm0 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm21, %zmm0, %zmm0 + cmpl $11, %r8d + vmovdqa64 %zmm22, %zmm11 + jl L_AES_GCMSIV_ctr_avx512_64_aes_enc_block_last + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm23, %zmm0, %zmm0 + cmpl $13, %r8d + vmovdqa64 %zmm24, %zmm11 + jl L_AES_GCMSIV_ctr_avx512_64_aes_enc_block_last + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm25, %zmm0, %zmm0 + vmovdqa64 %zmm26, %zmm11 +L_AES_GCMSIV_ctr_avx512_64_aes_enc_block_last: + vaesenclast %zmm11, %zmm0, %zmm0 + vpxorq (%r11), %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rbx) + addl $0x40, %eax + cmpl %r10d, %eax + jl L_AES_GCMSIV_ctr_avx512_enc_64 +L_AES_GCMSIV_ctr_avx512_done_64: + cmpl %edx, %eax + movl %edx, %r10d + je L_AES_GCMSIV_ctr_avx512_done_enc + andl $0xfffffff0, %r10d +L_AES_GCMSIV_ctr_avx512_enc_16: + # 16 bytes of input + vmovdqa %xmm7, %xmm0 + vpaddd %zmm10, %zmm7, %zmm7 + # aes_enc_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 
+        vaesenc %xmm5, %xmm0, %xmm0
+        cmpl $11, %r8d
+        vmovdqu 160(%rcx), %xmm5
+        jl L_AES_GCMSIV_ctr_avx512_16_aes_enc_block_last
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 176(%rcx), %xmm6
+        vaesenc %xmm6, %xmm0, %xmm0
+        cmpl $13, %r8d
+        vmovdqu 192(%rcx), %xmm5
+        jl L_AES_GCMSIV_ctr_avx512_16_aes_enc_block_last
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 208(%rcx), %xmm6
+        vaesenc %xmm6, %xmm0, %xmm0
+        vmovdqu 224(%rcx), %xmm5
+L_AES_GCMSIV_ctr_avx512_16_aes_enc_block_last:
+        vaesenclast %xmm5, %xmm0, %xmm0
+        leaq (%rdi,%rax,1), %r11
+        vpxor (%r11), %xmm0, %xmm0
+        leaq (%rsi,%rax,1), %r11
+        vmovdqu %xmm0, (%r11)
+        addl $16, %eax
+        cmpl %r10d, %eax
+        jl L_AES_GCMSIV_ctr_avx512_enc_16
+L_AES_GCMSIV_ctr_avx512_done_enc:
+        vmovdqu %xmm7, (%r9)
+        popq %rbx
+        repz retq
+#ifndef __APPLE__
+.size AES_GCMSIV_ctr_avx512,.-AES_GCMSIV_ctr_avx512
+#endif /* __APPLE__ */
+#endif /* WOLFSSL_AESGCM_SIV */
 #endif /* HAVE_INTEL_AVX512 */
 #endif /* WOLFSSL_X86_64_BUILD */
diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm
index 34f6847631..50171681b4 100644
--- a/wolfcrypt/src/aes_gcm_asm.asm
+++ b/wolfcrypt/src/aes_gcm_asm.asm
@@ -6469,6 +6469,587 @@ L_AES_GCM_decrypt_final_aesni_cmp_tag_done:
         ret
 AES_GCM_decrypt_final_aesni ENDP
 _TEXT ENDS
+IFDEF WOLFSSL_AESGCM_SIV
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_siv_bswap_mask QWORD \
+        08090a0b0c0d0e0fh, 0001020304050607h ; pshufb mask: reverses all 16 bytes
+ptr_L_aes_gcm_siv_bswap_mask QWORD L_aes_gcm_siv_bswap_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCMSIV_polyval_aesni - fold 16-byte blocks into a POLYVAL accumulator
+; with PCLMULQDQ. Arguments follow the Microsoft x64 calling convention:
+;   rcx - 16-byte accumulator, loaded on entry and stored back on exit
+;   rdx - 16-byte multiplier (hash key); its 2nd..4th powers are derived here
+;   r8  - input data, consumed 64 bytes at a time and then 16 at a time
+;   r9d - number of 16-byte blocks (shifted left by 4 to form a byte count)
+; r12 and xmm6-xmm14 are saved on the stack and restored before returning.
+AES_GCMSIV_polyval_aesni PROC
+        push r12
+        sub rsp, 144
+        movdqu OWORD PTR [rsp], xmm6
+        movdqu OWORD PTR [rsp+16], xmm7
+        movdqu OWORD PTR [rsp+32], xmm8
+        movdqu OWORD PTR [rsp+48], xmm9
+        movdqu OWORD PTR [rsp+64], xmm10
+        movdqu OWORD PTR [rsp+80], xmm11
+        movdqu OWORD PTR [rsp+96], xmm12
+        movdqu OWORD PTR [rsp+112], xmm13
+        movdqu OWORD PTR [rsp+128], xmm14
+        movdqu xmm1, OWORD PTR [rdx] ; xmm1 = multiplier
+        movdqu xmm0, OWORD PTR [rcx] ; xmm0 = accumulator
+        pshufb xmm0, OWORD PTR L_aes_gcm_siv_bswap_mask ; byte-reverse the accumulator
+        movdqa xmm9, xmm1 ; 128x128 carry-less multiply xmm1 * xmm1
+        pclmulqdq xmm9, xmm1, 0
+        movdqa xmm10, xmm1
+        pclmulqdq xmm10, xmm1, 17
+        movdqa xmm13, xmm1
+        pclmulqdq xmm13, xmm1, 16
+        movdqa xmm14, xmm1
+        pclmulqdq xmm14, xmm1, 1
+        pxor xmm13, xmm14 ; sum of the two cross products
+        movdqa xmm14, xmm13
+        pslldq xmm14, 8
+        psrldq xmm13, 8
+        pxor xmm9, xmm14 ; xmm9 = low 128 bits of the product
+        pxor xmm10, xmm13 ; xmm10 = high 128 bits of the product
+        movdqa xmm5, xmm9 ; shift the 256-bit value xmm10:xmm9 left by one bit
+        psrld xmm5, 31
+        movdqa xmm6, xmm10
+        psrld xmm6, 31
+        pslld xmm9, 1
+        pslld xmm10, 1
+        movdqa xmm7, xmm5
+        psrldq xmm7, 12 ; carry out of the low half into the high half
+        pslldq xmm6, 4
+        pslldq xmm5, 4
+        por xmm9, xmm5
+        por xmm10, xmm6
+        por xmm10, xmm7
+        movdqa xmm5, xmm9 ; fold the low half into the high half (shifts 31/30/25, then 1/2/7)
+        pslld xmm5, 31
+        movdqa xmm6, xmm9
+        pslld xmm6, 30
+        movdqa xmm7, xmm9
+        pslld xmm7, 25
+        pxor xmm5, xmm6
+        pxor xmm5, xmm7
+        movdqa xmm6, xmm5
+        psrldq xmm6, 4
+        pslldq xmm5, 12
+        pxor xmm9, xmm5
+        movdqa xmm8, xmm9
+        psrld xmm8, 1
+        movdqa xmm11, xmm9
+        psrld xmm11, 2
+        movdqa xmm12, xmm9
+        psrld xmm12, 7
+        pxor xmm8, xmm11
+        pxor xmm8, xmm12
+        pxor xmm8, xmm6
+        pxor xmm9, xmm8
+        pxor xmm10, xmm9
+        movdqa xmm2, xmm10 ; xmm2 = reduced xmm1 * xmm1 (2nd power)
+        movdqa xmm9, xmm2 ; multiply xmm2 * xmm1
+        pclmulqdq xmm9, xmm1, 0
+        movdqa xmm10, xmm2
+        pclmulqdq xmm10, xmm1, 17
+        movdqa xmm13, xmm2
+        pclmulqdq xmm13, xmm1, 16
+        movdqa xmm14, xmm2
+        pclmulqdq xmm14, xmm1, 1
+        pxor xmm13, xmm14
+        movdqa xmm14, xmm13
+        pslldq xmm14, 8
+        psrldq xmm13, 8
+        pxor xmm9, xmm14
+        pxor xmm10, xmm13
+        movdqa xmm5, xmm9 ; same shift-and-fold sequence as above
+        psrld xmm5, 31
+        movdqa xmm6, xmm10
+        psrld xmm6, 31
+        pslld xmm9, 1
+        pslld xmm10, 1
+        movdqa xmm7, xmm5
+        psrldq xmm7, 12
+        pslldq xmm6, 4
+        pslldq xmm5, 4
+        por xmm9, xmm5
+        por xmm10, xmm6
+        por xmm10, xmm7
+        movdqa xmm5, xmm9
+        pslld xmm5, 31
+        movdqa xmm6, xmm9
+        pslld xmm6, 30
+        movdqa xmm7, xmm9
+        pslld xmm7, 25
+        pxor xmm5, xmm6
+        pxor xmm5, xmm7
+        movdqa xmm6, xmm5
+        psrldq xmm6, 4
+        pslldq xmm5, 12
+        pxor xmm9, xmm5
+        movdqa xmm8, xmm9
+        psrld xmm8, 1
+        movdqa xmm11, xmm9
+        psrld xmm11, 2
+        movdqa xmm12, xmm9
+        psrld xmm12, 7
+        pxor xmm8, xmm11
+        pxor xmm8, xmm12
+        pxor xmm8, xmm6
+        pxor xmm9, xmm8
+        pxor xmm10, xmm9
+        movdqa xmm3, xmm10 ; xmm3 = reduced xmm2 * xmm1 (3rd power)
+        movdqa xmm9, xmm2 ; multiply xmm2 * xmm2
+        pclmulqdq xmm9, xmm2, 0
+        movdqa xmm10, xmm2
+        pclmulqdq xmm10, xmm2, 17
+        movdqa xmm13, xmm2
+        pclmulqdq xmm13, xmm2, 16
+        movdqa xmm14, xmm2
+        pclmulqdq xmm14, xmm2, 1
+        pxor xmm13, xmm14
+        movdqa xmm14, xmm13
+        pslldq xmm14, 8
+        psrldq xmm13, 8
+        pxor xmm9, xmm14
+        pxor xmm10, xmm13
+        movdqa xmm5, xmm9 ; same shift-and-fold sequence as above
+        psrld xmm5, 31
+        movdqa xmm6, xmm10
+        psrld xmm6, 31
+        pslld xmm9, 1
+        pslld xmm10, 1
+        movdqa xmm7, xmm5
+        psrldq xmm7, 12
+        pslldq xmm6, 4
+        pslldq xmm5, 4
+        por xmm9, xmm5
+        por xmm10, xmm6
+        por xmm10, xmm7
+        movdqa xmm5, xmm9
+        pslld xmm5, 31
+        movdqa xmm6, xmm9
+        pslld xmm6, 30
+        movdqa xmm7, xmm9
+        pslld xmm7, 25
+        pxor xmm5, xmm6
+        pxor xmm5, xmm7
+        movdqa xmm6, xmm5
+        psrldq xmm6, 4
+        pslldq xmm5, 12
+        pxor xmm9, xmm5
+        movdqa xmm8, xmm9
+        psrld xmm8, 1
+        movdqa xmm11, xmm9
+        psrld xmm11, 2
+        movdqa xmm12, xmm9
+        psrld xmm12, 7
+        pxor xmm8, xmm11
+        pxor xmm8, xmm12
+        pxor xmm8, xmm6
+        pxor xmm9, xmm8
+        pxor xmm10, xmm9
+        movdqa xmm4, xmm10 ; xmm4 = reduced xmm2 * xmm2 (4th power)
+        mov r10d, r9d
+        shl r10d, 4 ; r10d = total byte count (blocks * 16)
+        mov r11d, r10d
+        and r11d, 4294967232 ; r11d = byte count rounded down to a multiple of 64
+        xor eax, eax ; eax = byte offset into the data
+        cmp eax, r11d
+        je L_AES_GCMSIV_polyval_aesni_four_done
+L_AES_GCMSIV_polyval_aesni_four:
+        lea r12, QWORD PTR [r8+rax]
+        movdqu xmm5, OWORD PTR [r12]
+        movdqu xmm6, OWORD PTR [r12+16]
+        movdqu xmm7, OWORD PTR [r12+32]
+        movdqu xmm8, OWORD PTR [r12+48]
+        pxor xmm5, xmm0 ; accumulator is folded into the first block only
+        movdqa xmm9, xmm5 ; (block 0 ^ accumulator) * xmm4
+        pclmulqdq xmm9, xmm4, 0
+        movdqa xmm10, xmm5
+        pclmulqdq xmm10, xmm4, 17
+        movdqa xmm13, xmm5
+        pclmulqdq xmm13, xmm4, 16
+        movdqa xmm14, xmm5
+        pclmulqdq xmm14, xmm4, 1
+        pxor xmm13, xmm14
+        movdqa xmm14, xmm13
+        pslldq xmm14, 8
+        psrldq xmm13, 8
+        pxor xmm9, xmm14
+        pxor xmm10, xmm13
+        movdqa xmm11, xmm6 ; block 1 * xmm3
+        pclmulqdq xmm11, xmm3, 0
+        movdqa xmm12, xmm6
+        pclmulqdq xmm12, xmm3, 17
+        movdqa xmm13, xmm6
+        pclmulqdq xmm13, xmm3, 16
+        movdqa xmm14, xmm6
+        pclmulqdq xmm14, xmm3, 1
+        pxor xmm13, xmm14
+        movdqa xmm14, xmm13
+        pslldq xmm14, 8
+        psrldq xmm13, 8
+        pxor xmm11, xmm14
+        pxor xmm12, xmm13
+        pxor xmm9, xmm11 ; accumulate the unreduced products
+        pxor xmm10, xmm12
+        movdqa xmm11, xmm7 ; block 2 * xmm2
+        pclmulqdq xmm11, xmm2, 0
+        movdqa xmm12, xmm7
+        pclmulqdq xmm12, xmm2, 17
+        movdqa xmm13, xmm7
+        pclmulqdq xmm13, xmm2, 16
+        movdqa xmm14, xmm7
+        pclmulqdq xmm14, xmm2, 1
+        pxor xmm13, xmm14
+        movdqa xmm14, xmm13
+        pslldq xmm14, 8
+        psrldq xmm13, 8
+        pxor xmm11, xmm14
+        pxor xmm12, xmm13
+        pxor xmm9, xmm11
+        pxor xmm10, xmm12
+        movdqa xmm11, xmm8 ; block 3 * xmm1
+        pclmulqdq xmm11, xmm1, 0
+        movdqa xmm12, xmm8
+        pclmulqdq xmm12, xmm1, 17
+        movdqa xmm13, xmm8
+        pclmulqdq xmm13, xmm1, 16
+        movdqa xmm14, xmm8
+        pclmulqdq xmm14, xmm1, 1
+        pxor xmm13, xmm14
+        movdqa xmm14, xmm13
+        pslldq xmm14, 8
+        psrldq xmm13, 8
+        pxor xmm11, xmm14
+        pxor xmm12, xmm13
+        pxor xmm9, xmm11
+        pxor xmm10, xmm12
+        movdqa xmm5, xmm9 ; one shift-and-fold for the sum of all four products
+        psrld xmm5, 31
+        movdqa xmm6, xmm10
+        psrld xmm6, 31
+        pslld xmm9, 1
+        pslld xmm10, 1
+        movdqa xmm7, xmm5
+        psrldq xmm7, 12
+        pslldq xmm6, 4
+        pslldq xmm5, 4
+        por xmm9, xmm5
+        por xmm10, xmm6
+        por xmm10, xmm7
+        movdqa xmm5, xmm9
+        pslld xmm5, 31
+        movdqa xmm6, xmm9
+        pslld xmm6, 30
+        movdqa xmm7, xmm9
+        pslld xmm7, 25
+        pxor xmm5, xmm6
+        pxor xmm5, xmm7
+        movdqa xmm6, xmm5
+        psrldq xmm6, 4
+        pslldq xmm5, 12
+        pxor xmm9, xmm5
+        movdqa xmm8, xmm9
+        psrld xmm8, 1
+        movdqa xmm11, xmm9
+        psrld xmm11, 2
+        movdqa xmm12, xmm9
+        psrld xmm12, 7
+        pxor xmm8, xmm11
+        pxor xmm8, xmm12
+        pxor xmm8, xmm6
+        pxor xmm9, xmm8
+        pxor xmm10, xmm9
+        movdqa xmm0, xmm10 ; new accumulator
+        add eax, 64
+        cmp eax, r11d
+        jl L_AES_GCMSIV_polyval_aesni_four
+L_AES_GCMSIV_polyval_aesni_four_done:
+L_AES_GCMSIV_polyval_aesni_rem:
+        cmp eax, r10d
+        jge L_AES_GCMSIV_polyval_aesni_done
+        movdqu xmm5, OWORD PTR [r8+rax] ; unaligned-safe load: legacy-SSE pxor with a memory operand needs 16-byte alignment
+        pxor xmm0, xmm5 ; accumulator ^= block (xmm5 is scratch, overwritten below)
+        movdqa xmm9, xmm0 ; multiply by xmm1
+        pclmulqdq xmm9, xmm1, 0
+        movdqa xmm10, xmm0
+        pclmulqdq xmm10, xmm1, 17
+        movdqa xmm13, xmm0
+        pclmulqdq xmm13, xmm1, 16
+        movdqa xmm14, xmm0
+        pclmulqdq xmm14, xmm1, 1
+        pxor xmm13, xmm14
+        movdqa xmm14, xmm13
+        pslldq xmm14, 8
+        psrldq xmm13, 8
+        pxor xmm9, xmm14
+        pxor xmm10, xmm13
+        movdqa xmm5, xmm9 ; same shift-and-fold sequence as above
+        psrld xmm5, 31
+        movdqa xmm6, xmm10
+        psrld xmm6, 31
+        pslld xmm9, 1
+        pslld xmm10, 1
+        movdqa xmm7, xmm5
+        psrldq xmm7, 12
+        pslldq xmm6, 4
+        pslldq xmm5, 4
+        por xmm9, xmm5
+        por xmm10, xmm6
+        por xmm10, xmm7
+        movdqa xmm5, xmm9
+        pslld xmm5, 31
+        movdqa xmm6, xmm9
+        pslld xmm6, 30
+        movdqa xmm7, xmm9
+        pslld xmm7, 25
+        pxor xmm5, xmm6
+        pxor xmm5, xmm7
+        movdqa xmm6, xmm5
+        psrldq xmm6, 4
+        pslldq xmm5, 12
+        pxor xmm9, xmm5
+        movdqa xmm8, xmm9
+        psrld xmm8, 1
+        movdqa xmm11, xmm9
+        psrld xmm11, 2
+        movdqa xmm12, xmm9
+        psrld xmm12, 7
+        pxor xmm8, xmm11
+        pxor xmm8, xmm12
+        pxor xmm8, xmm6
+        pxor xmm9, xmm8
+        pxor xmm10, xmm9
+        movdqa xmm0, xmm10
+        add eax, 16
+        jmp L_AES_GCMSIV_polyval_aesni_rem
+L_AES_GCMSIV_polyval_aesni_done:
+        pshufb xmm0, OWORD PTR L_aes_gcm_siv_bswap_mask ; undo the entry byte reversal
+        movdqu OWORD PTR [rcx], xmm0
+        movdqu xmm6, OWORD PTR [rsp]
+        movdqu xmm7, OWORD PTR [rsp+16]
+        movdqu xmm8, OWORD PTR [rsp+32]
+        movdqu xmm9, OWORD PTR [rsp+48]
+        movdqu xmm10, OWORD PTR [rsp+64]
+        movdqu xmm11, OWORD PTR [rsp+80]
+        movdqu xmm12, OWORD PTR [rsp+96]
+        movdqu xmm13, OWORD PTR [rsp+112]
+        movdqu xmm14, OWORD PTR [rsp+128]
+        add rsp, 144
+        pop r12
+        ret
+AES_GCMSIV_polyval_aesni ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcmsiv_ctr_aesni_one QWORD \
+        0000000000000001h, 0000000000000000h ; paddd increment: +1 in the lowest 32-bit lane
+ptr_L_aes_gcmsiv_ctr_aesni_one QWORD L_aes_gcmsiv_ctr_aesni_one
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCMSIV_ctr_aesni - XOR the input with an AES counter-mode keystream
+; using AES-NI. Arguments follow the Microsoft x64 calling convention:
+;   rcx - input buffer
+;   rdx - output buffer
+;   r8d - length in bytes (NOTE(review): the loops step in whole 16-byte
+;         blocks; presumably callers pass a multiple of 16 - confirm)
+;   r9  - expanded key schedule, one 16-byte round key per round
+;   5th (stack) - round count; 10, 12 or 14 rounds are selected from it
+;   6th (stack) - 16-byte counter block, loaded on entry, stored on exit
+; eax is the running byte offset and r10d the loop limit, so the two stack
+; arguments are re-read from the stack instead of being cached in eax/r10.
+AES_GCMSIV_ctr_aesni PROC
+        push rbx
+        mov r10, QWORD PTR [rsp+56] ; 6th argument: counter block pointer (r10 is reused as the loop limit below)
+        sub rsp, 48
+        movdqu OWORD PTR [rsp], xmm6
+        movdqu OWORD PTR [rsp+16], xmm7
+        movdqu OWORD PTR [rsp+32], xmm8
+        movdqu xmm8, OWORD PTR L_aes_gcmsiv_ctr_aesni_one
+        movdqu xmm7, OWORD PTR [r10] ; xmm7 = current counter block
+        xor eax, eax ; eax = byte offset
+        cmp r8d, 64
+        mov r10d, r8d
+        jl L_AES_GCMSIV_ctr_aesni_done_64
+        and r10d, 4294967232 ; r10d = length rounded down to a multiple of 64
+L_AES_GCMSIV_ctr_aesni_enc_64:
+        ; 64 bytes of input
+        ; siv_ctr_enc_64
+        lea r11, QWORD PTR [rcx+rax]
+        lea rbx, QWORD PTR [rdx+rax]
+        movdqa xmm0, xmm7
+        paddd xmm7, xmm8
+        movdqa xmm1, xmm7
+        paddd xmm7, xmm8
+        movdqa xmm2, xmm7
+        paddd xmm7, xmm8
+        movdqa xmm3, xmm7
+        paddd xmm7, xmm8
+        ; aes_enc_block
+        movdqu xmm4, OWORD PTR [r9]
+        pxor xmm0, xmm4
+        pxor xmm1, xmm4
+        pxor xmm2, xmm4
+        pxor xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+16]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+32]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+48]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+64]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+80]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+96]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+112]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+128]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+144]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        cmp DWORD PTR [rsp+96], 11 ; round count (5th argument) read from its stack slot; eax holds the offset
+        movdqu xmm4, OWORD PTR [r9+160]
+        jl L_AES_GCMSIV_ctr_aesni_64_aes_enc_block_last
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+176]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        cmp DWORD PTR [rsp+96], 13 ; round count again, from the stack
+        movdqu xmm4, OWORD PTR [r9+192]
+        jl L_AES_GCMSIV_ctr_aesni_64_aes_enc_block_last
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+208]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+224]
+L_AES_GCMSIV_ctr_aesni_64_aes_enc_block_last:
+        aesenclast xmm0, xmm4
+        aesenclast xmm1, xmm4
+        aesenclast xmm2, xmm4
+        aesenclast xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r11]
+        pxor xmm0, xmm4
+        movdqu xmm4, OWORD PTR [r11+16]
+        pxor xmm1, xmm4
+        movdqu xmm4, OWORD PTR [r11+32]
+        pxor xmm2, xmm4
+        movdqu xmm4, OWORD PTR [r11+48]
+        pxor xmm3, xmm4
+        movdqu OWORD PTR [rbx], xmm0
+        movdqu OWORD PTR [rbx+16], xmm1
+        movdqu OWORD PTR [rbx+32], xmm2
+        movdqu OWORD PTR [rbx+48], xmm3
+        add eax, 64
+        cmp eax, r10d
+        jl L_AES_GCMSIV_ctr_aesni_enc_64
+L_AES_GCMSIV_ctr_aesni_done_64:
+        cmp eax, r8d
+        mov r10d, r8d
+        je L_AES_GCMSIV_ctr_aesni_done_enc
+        and r10d, 4294967280 ; r10d = length rounded down to a multiple of 16
+L_AES_GCMSIV_ctr_aesni_enc_16:
+        ; 16 bytes of input
+        movdqa xmm0, xmm7
+        paddd xmm7, xmm8
+        ; aes_enc_block
+        movdqu xmm5, OWORD PTR [r9] ; unaligned-safe round-0 key load, matching the 64-byte path
+        pxor xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+16]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+32]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+48]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+64]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+80]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+96]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+112]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+128]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+144]
+        aesenc xmm0, xmm5
+        cmp DWORD PTR [rsp+96], 11 ; round count from the stack (eax is the byte offset)
+        movdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_GCMSIV_ctr_aesni_16_aes_enc_block_last
+        aesenc xmm0, xmm5
+        movdqu xmm6, OWORD PTR [r9+176]
+        aesenc xmm0, xmm6
+        cmp DWORD PTR [rsp+96], 13 ; round count from the stack
+        movdqu xmm5, OWORD PTR [r9+192]
+        jl L_AES_GCMSIV_ctr_aesni_16_aes_enc_block_last
+        aesenc xmm0, xmm5
+        movdqu xmm6, OWORD PTR [r9+208]
+        aesenc xmm0, xmm6
+        movdqu xmm5, OWORD PTR [r9+224]
+L_AES_GCMSIV_ctr_aesni_16_aes_enc_block_last:
+        aesenclast xmm0, xmm5
+        lea r11, QWORD PTR [rcx+rax]
+        movdqu xmm4, OWORD PTR [r11]
+        pxor xmm0, xmm4
+        lea r11, QWORD PTR [rdx+rax]
+        movdqu OWORD PTR [r11], xmm0
+        add eax, 16
+        cmp eax, r10d
+        jl L_AES_GCMSIV_ctr_aesni_enc_16
+L_AES_GCMSIV_ctr_aesni_done_enc:
+        mov r10, QWORD PTR [rsp+104] ; reload the counter block pointer (6th argument); r10d was the loop limit
+        movdqu OWORD PTR [r10], xmm7 ; write the advanced counter back to the caller
+        movdqu xmm6, OWORD PTR [rsp]
+        movdqu xmm7, OWORD PTR [rsp+16]
+        movdqu xmm8, OWORD PTR [rsp+32]
+        add rsp, 48
+        pop rbx
+        ret
+AES_GCMSIV_ctr_aesni ENDP
+_TEXT ENDS
+ENDIF
 IFDEF HAVE_INTEL_AVX1
 _DATA SEGMENT
 ALIGN 16
@@ -11929,6 +12510,463 @@
L_AES_GCM_decrypt_final_avx1_cmp_tag_done:
         ret
 AES_GCM_decrypt_final_avx1 ENDP
 _TEXT ENDS
+IFDEF WOLFSSL_AESGCM_SIV
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_siv_bswap_mask_avx1 QWORD \
+        08090a0b0c0d0e0fh, 0001020304050607h ; vpshufb mask: reverses all 16 bytes
+ptr_L_aes_gcm_siv_bswap_mask_avx1 QWORD L_aes_gcm_siv_bswap_mask_avx1
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCMSIV_polyval_avx1 PROC ; POLYVAL fold, AVX1: rcx = 16-byte accumulator (in/out), rdx = 16-byte multiplier, r8 = data, r9d = count of 16-byte blocks
+        push r12
+        sub rsp, 112
+        vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm12 are saved here and restored before ret
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu xmm1, OWORD PTR [rdx] ; xmm1 = multiplier
+        vmovdqu xmm0, OWORD PTR [rcx] ; xmm0 = accumulator
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_siv_bswap_mask_avx1 ; byte-reverse the accumulator
+        vpclmulqdq xmm5, xmm1, xmm1, 0 ; 128x128 carry-less multiply xmm1 * xmm1
+        vpclmulqdq xmm6, xmm1, xmm1, 17
+        vpclmulqdq xmm10, xmm1, xmm1, 16
+        vpclmulqdq xmm11, xmm1, xmm1, 1
+        vpxor xmm10, xmm10, xmm11 ; sum of the two cross products
+        vpslldq xmm11, xmm10, 8
+        vpsrldq xmm10, xmm10, 8
+        vpxor xmm5, xmm5, xmm11 ; xmm5 = low 128 bits of the product
+        vpxor xmm6, xmm6, xmm10 ; xmm6 = high 128 bits of the product
+        vpsrld xmm10, xmm5, 31 ; shift the 256-bit value xmm6:xmm5 left by one bit
+        vpsrld xmm11, xmm6, 31
+        vpslld xmm5, xmm5, 1
+        vpslld xmm6, xmm6, 1
+        vpsrldq xmm12, xmm10, 12 ; carry out of the low half into the high half
+        vpslldq xmm11, xmm11, 4
+        vpslldq xmm10, xmm10, 4
+        vpor xmm5, xmm5, xmm10
+        vpor xmm6, xmm6, xmm11
+        vpor xmm6, xmm6, xmm12
+        vpslld xmm10, xmm5, 31 ; fold the low half into the high half (shifts 31/30/25, then 1/2/7)
+        vpslld xmm11, xmm5, 30
+        vpslld xmm12, xmm5, 25
+        vpxor xmm10, xmm10, xmm11
+        vpxor xmm10, xmm10, xmm12
+        vpsrldq xmm11, xmm10, 4
+        vpslldq xmm10, xmm10, 12
+        vpxor xmm5, xmm5, xmm10
+        vpsrld xmm7, xmm5, 1
+        vpsrld xmm8, xmm5, 2
+        vpsrld xmm9, xmm5, 7
+        vpxor xmm7, xmm7, xmm8
+        vpxor xmm7, xmm7, xmm9
+        vpxor xmm7, xmm7, xmm11
+        vpxor xmm5, xmm5, xmm7
+        vpxor xmm2, xmm6, xmm5 ; xmm2 = reduced xmm1 * xmm1 (2nd power)
+        vpclmulqdq xmm5, xmm2, xmm1, 0 ; multiply xmm2 * xmm1
+        vpclmulqdq xmm6, xmm2, xmm1, 17
+        vpclmulqdq xmm10, xmm2, xmm1, 16
+        vpclmulqdq xmm11, xmm2, xmm1, 1
+        vpxor xmm10, xmm10, xmm11
+        vpslldq xmm11, xmm10, 8
+        vpsrldq xmm10, xmm10, 8
+        vpxor xmm5, xmm5, xmm11
+        vpxor xmm6, xmm6, xmm10
+        vpsrld xmm10, xmm5, 31 ; same shift-and-fold sequence as above
+        vpsrld xmm11, xmm6, 31
+        vpslld xmm5, xmm5, 1
+        vpslld xmm6, xmm6, 1
+        vpsrldq xmm12, xmm10, 12
+        vpslldq xmm11, xmm11, 4
+        vpslldq xmm10, xmm10, 4
+        vpor xmm5, xmm5, xmm10
+        vpor xmm6, xmm6, xmm11
+        vpor xmm6, xmm6, xmm12
+        vpslld xmm10, xmm5, 31
+        vpslld xmm11, xmm5, 30
+        vpslld xmm12, xmm5, 25
+        vpxor xmm10, xmm10, xmm11
+        vpxor xmm10, xmm10, xmm12
+        vpsrldq xmm11, xmm10, 4
+        vpslldq xmm10, xmm10, 12
+        vpxor xmm5, xmm5, xmm10
+        vpsrld xmm7, xmm5, 1
+        vpsrld xmm8, xmm5, 2
+        vpsrld xmm9, xmm5, 7
+        vpxor xmm7, xmm7, xmm8
+        vpxor xmm7, xmm7, xmm9
+        vpxor xmm7, xmm7, xmm11
+        vpxor xmm5, xmm5, xmm7
+        vpxor xmm3, xmm6, xmm5 ; xmm3 = reduced xmm2 * xmm1 (3rd power)
+        vpclmulqdq xmm5, xmm2, xmm2, 0 ; multiply xmm2 * xmm2
+        vpclmulqdq xmm6, xmm2, xmm2, 17
+        vpclmulqdq xmm10, xmm2, xmm2, 16
+        vpclmulqdq xmm11, xmm2, xmm2, 1
+        vpxor xmm10, xmm10, xmm11
+        vpslldq xmm11, xmm10, 8
+        vpsrldq xmm10, xmm10, 8
+        vpxor xmm5, xmm5, xmm11
+        vpxor xmm6, xmm6, xmm10
+        vpsrld xmm10, xmm5, 31 ; same shift-and-fold sequence as above
+        vpsrld xmm11, xmm6, 31
+        vpslld xmm5, xmm5, 1
+        vpslld xmm6, xmm6, 1
+        vpsrldq xmm12, xmm10, 12
+        vpslldq xmm11, xmm11, 4
+        vpslldq xmm10, xmm10, 4
+        vpor xmm5, xmm5, xmm10
+        vpor xmm6, xmm6, xmm11
+        vpor xmm6, xmm6, xmm12
+        vpslld xmm10, xmm5, 31
+        vpslld xmm11, xmm5, 30
+        vpslld xmm12, xmm5, 25
+        vpxor xmm10, xmm10, xmm11
+        vpxor xmm10, xmm10, xmm12
+        vpsrldq xmm11, xmm10, 4
+        vpslldq xmm10, xmm10, 12
+        vpxor xmm5, xmm5, xmm10
+        vpsrld xmm7, xmm5, 1
+        vpsrld xmm8, xmm5, 2
+        vpsrld xmm9, xmm5, 7
+        vpxor xmm7, xmm7, xmm8
+        vpxor xmm7, xmm7, xmm9
+        vpxor xmm7, xmm7, xmm11
+        vpxor xmm5, xmm5, xmm7
+        vpxor xmm4, xmm6, xmm5 ; xmm4 = reduced xmm2 * xmm2 (4th power)
+        mov r10d, r9d
+        shl r10d, 4 ; r10d = total byte count (blocks * 16)
+        mov r11d, r10d
+        and r11d, 4294967232 ; r11d = byte count rounded down to a multiple of 64
+        xor eax, eax ; eax = byte offset into the data
+        cmp eax, r11d
+        je L_AES_GCMSIV_polyval_avx1_four_done
+L_AES_GCMSIV_polyval_avx1_four:
+        lea r12, QWORD PTR [r8+rax]
+        vmovdqu xmm7, OWORD PTR [r12]
+        vpxor xmm7, xmm7, xmm0 ; accumulator is folded into the first block only
+        vpclmulqdq xmm5, xmm7, xmm4, 0 ; (block 0 ^ accumulator) * xmm4
+        vpclmulqdq xmm6, xmm7, xmm4, 17
+        vpclmulqdq xmm10, xmm7, xmm4, 16
+        vpclmulqdq xmm11, xmm7, xmm4, 1
+        vpxor xmm10, xmm10, xmm11
+        vpslldq xmm11, xmm10, 8
+        vpsrldq xmm10, xmm10, 8
+        vpxor xmm5, xmm5, xmm11
+        vpxor xmm6, xmm6, xmm10
+        vmovdqu xmm7, OWORD PTR [r12+16]
+        vpclmulqdq xmm8, xmm7, xmm3, 0 ; block 1 * xmm3
+        vpclmulqdq xmm9, xmm7, xmm3, 17
+        vpclmulqdq xmm10, xmm7, xmm3, 16
+        vpclmulqdq xmm11, xmm7, xmm3, 1
+        vpxor xmm10, xmm10, xmm11
+        vpslldq xmm11, xmm10, 8
+        vpsrldq xmm10, xmm10, 8
+        vpxor xmm8, xmm8, xmm11
+        vpxor xmm9, xmm9, xmm10
+        vpxor xmm5, xmm5, xmm8 ; accumulate the unreduced products
+        vpxor xmm6, xmm6, xmm9
+        vmovdqu xmm7, OWORD PTR [r12+32]
+        vpclmulqdq xmm8, xmm7, xmm2, 0 ; block 2 * xmm2
+        vpclmulqdq xmm9, xmm7, xmm2, 17
+        vpclmulqdq xmm10, xmm7, xmm2, 16
+        vpclmulqdq xmm11, xmm7, xmm2, 1
+        vpxor xmm10, xmm10, xmm11
+        vpslldq xmm11, xmm10, 8
+        vpsrldq xmm10, xmm10, 8
+        vpxor xmm8, xmm8, xmm11
+        vpxor xmm9, xmm9, xmm10
+        vpxor xmm5, xmm5, xmm8
+        vpxor xmm6, xmm6, xmm9
+        vmovdqu xmm7, OWORD PTR [r12+48]
+        vpclmulqdq xmm8, xmm7, xmm1, 0 ; block 3 * xmm1
+        vpclmulqdq xmm9, xmm7, xmm1, 17
+        vpclmulqdq xmm10, xmm7, xmm1, 16
+        vpclmulqdq xmm11, xmm7, xmm1, 1
+        vpxor xmm10, xmm10, xmm11
+        vpslldq xmm11, xmm10, 8
+        vpsrldq xmm10, xmm10, 8
+        vpxor xmm8, xmm8, xmm11
+        vpxor xmm9, xmm9, xmm10
+        vpxor xmm5, xmm5, xmm8
+        vpxor xmm6, xmm6, xmm9
+        vpsrld xmm10, xmm5, 31 ; one shift-and-fold for the sum of all four products
+        vpsrld xmm11, xmm6, 31
+        vpslld xmm5, xmm5, 1
+        vpslld xmm6, xmm6, 1
+        vpsrldq xmm12, xmm10, 12
+        vpslldq xmm11, xmm11, 4
+        vpslldq xmm10, xmm10, 4
+        vpor xmm5, xmm5, xmm10
+        vpor xmm6, xmm6, xmm11
+        vpor xmm6, xmm6, xmm12
+        vpslld xmm10, xmm5, 31
+        vpslld xmm11, xmm5, 30
+        vpslld xmm12, xmm5, 25
+        vpxor xmm10, xmm10, xmm11
+        vpxor xmm10, xmm10, xmm12
+        vpsrldq xmm11, xmm10, 4
+        vpslldq xmm10, xmm10, 12
+        vpxor xmm5, xmm5, xmm10
+        vpsrld xmm7, xmm5, 1
+        vpsrld xmm8, xmm5, 2
+        vpsrld xmm9, xmm5, 7
+        vpxor xmm7, xmm7, xmm8
+        vpxor xmm7, xmm7, xmm9
+        vpxor xmm7, xmm7, xmm11
+        vpxor xmm5, xmm5, xmm7
+        vpxor xmm0, xmm6, xmm5 ; new accumulator
+        add eax, 64
+        cmp eax, r11d
+        jl L_AES_GCMSIV_polyval_avx1_four
+L_AES_GCMSIV_polyval_avx1_four_done:
+L_AES_GCMSIV_polyval_avx1_rem:
+        cmp eax, r10d
+        jge L_AES_GCMSIV_polyval_avx1_done
+        vpxor xmm0, xmm0, [r8+rax] ; accumulator ^= next block (remaining blocks, one at a time)
+        vpclmulqdq xmm5, xmm0, xmm1, 0 ; multiply by xmm1
+        vpclmulqdq xmm6, xmm0, xmm1, 17
+        vpclmulqdq xmm10, xmm0, xmm1, 16
+        vpclmulqdq xmm11, xmm0, xmm1, 1
+        vpxor xmm10, xmm10, xmm11
+        vpslldq xmm11, xmm10, 8
+        vpsrldq xmm10, xmm10, 8
+        vpxor xmm5, xmm5, xmm11
+        vpxor xmm6, xmm6, xmm10
+        vpsrld xmm10, xmm5, 31 ; same shift-and-fold sequence as above
+        vpsrld xmm11, xmm6, 31
+        vpslld xmm5, xmm5, 1
+        vpslld xmm6, xmm6, 1
+        vpsrldq xmm12, xmm10, 12
+        vpslldq xmm11, xmm11, 4
+        vpslldq xmm10, xmm10, 4
+        vpor xmm5, xmm5, xmm10
+        vpor xmm6, xmm6, xmm11
+        vpor xmm6, xmm6, xmm12
+        vpslld xmm10, xmm5, 31
+        vpslld xmm11, xmm5, 30
+        vpslld xmm12, xmm5, 25
+        vpxor xmm10, xmm10, xmm11
+        vpxor xmm10, xmm10, xmm12
+        vpsrldq xmm11, xmm10, 4
+        vpslldq xmm10, xmm10, 12
+        vpxor xmm5, xmm5, xmm10
+        vpsrld xmm7, xmm5, 1
+        vpsrld xmm8, xmm5, 2
+        vpsrld xmm9, xmm5, 7
+        vpxor xmm7, xmm7, xmm8
+        vpxor xmm7, xmm7, xmm9
+        vpxor xmm7, xmm7, xmm11
+        vpxor xmm5, xmm5, xmm7
+        vpxor xmm0, xmm6, xmm5
+        add eax, 16
+        jmp L_AES_GCMSIV_polyval_avx1_rem
+L_AES_GCMSIV_polyval_avx1_done:
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_siv_bswap_mask_avx1 ; undo the entry byte reversal
+        vmovdqu OWORD PTR [rcx], xmm0
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        add rsp, 112
+        pop r12
+        ret
+AES_GCMSIV_polyval_avx1 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcmsiv_ctr_avx1_one QWORD \
+        0000000000000001h, 0000000000000000h ; vpaddd increment: +1 in the lowest 32-bit lane
+ptr_L_aes_gcmsiv_ctr_avx1_one QWORD L_aes_gcmsiv_ctr_avx1_one
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCMSIV_ctr_avx1 PROC
+        push rbx
+        mov eax, DWORD PTR [rsp+48] ; 5th argument (stack)
+        mov r10, QWORD PTR [rsp+56] ; 6th argument (stack): pointer used for the xmm7 load below
+        sub rsp, 48
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu xmm8, OWORD PTR L_aes_gcmsiv_ctr_avx1_one
+        vmovdqu xmm7, OWORD PTR [r10]
+        xor eax, eax ; NOTE(review): overwrites the 5th argument loaded into eax above - confirm later uses of eax
+        cmp r8d, 64
+        mov r10d, r8d ; NOTE(review): overwrites the pointer loaded into r10 above - confirm later uses of r10
+        jl L_AES_GCMSIV_ctr_avx1_done_64
+
and r10d, 4294967232 +L_AES_GCMSIV_ctr_avx1_enc_64: + ; 64 bytes of input + ; siv_ctr_enc_64 + lea r11, QWORD PTR [rcx+rax] + lea rbx, QWORD PTR [rdx+rax] + vmovdqa xmm0, xmm7 + vpaddd xmm7, xmm7, xmm8 + vmovdqa xmm1, xmm7 + vpaddd xmm7, xmm7, xmm8 + vmovdqa xmm2, xmm7 + vpaddd xmm7, xmm7, xmm8 + vmovdqa xmm3, xmm7 + vpaddd xmm7, xmm7, xmm8 + ; aes_enc_block + vmovdqu xmm4, OWORD PTR [r9] + vpxor xmm0, xmm0, xmm4 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm4 + vpxor xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + cmp eax, 11 + vmovdqu xmm4, OWORD PTR [r9+160] + jl L_AES_GCMSIV_ctr_avx1_64_aes_enc_block_last + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, 
OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + cmp eax, 13 + vmovdqu xmm4, OWORD PTR [r9+192] + jl L_AES_GCMSIV_ctr_avx1_64_aes_enc_block_last + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+208] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+224] +L_AES_GCMSIV_ctr_avx1_64_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm4 + vaesenclast xmm1, xmm1, xmm4 + vaesenclast xmm2, xmm2, xmm4 + vaesenclast xmm3, xmm3, xmm4 + vpxor xmm0, xmm0, [r11] + vpxor xmm1, xmm1, [r11+16] + vpxor xmm2, xmm2, [r11+32] + vpxor xmm3, xmm3, [r11+48] + vmovdqu OWORD PTR [rbx], xmm0 + vmovdqu OWORD PTR [rbx+16], xmm1 + vmovdqu OWORD PTR [rbx+32], xmm2 + vmovdqu OWORD PTR [rbx+48], xmm3 + add eax, 64 + cmp eax, r10d + jl L_AES_GCMSIV_ctr_avx1_enc_64 +L_AES_GCMSIV_ctr_avx1_done_64: + cmp eax, r8d + mov r10d, r8d + je L_AES_GCMSIV_ctr_avx1_done_enc + and r10d, 4294967280 +L_AES_GCMSIV_ctr_avx1_enc_16: + ; 16 bytes of input + vmovdqa xmm0, xmm7 + vpaddd xmm7, xmm7, xmm8 + ; aes_enc_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, xmm5 + cmp eax, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_GCMSIV_ctr_avx1_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm6 
+ cmp eax, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_GCMSIV_ctr_avx1_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_GCMSIV_ctr_avx1_16_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + lea r11, QWORD PTR [rcx+rax] + vpxor xmm0, xmm0, [r11] + lea r11, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r11], xmm0 + add eax, 16 + cmp eax, r10d + jl L_AES_GCMSIV_ctr_avx1_enc_16 +L_AES_GCMSIV_ctr_avx1_done_enc: + vmovdqu OWORD PTR [r10], xmm7 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + add rsp, 48 + pop rbx + ret +AES_GCMSIV_ctr_avx1 ENDP +_TEXT ENDS +ENDIF ENDIF IFDEF HAVE_INTEL_AVX2 _DATA SEGMENT @@ -22368,6 +23386,814 @@ L_AES_GCM_decrypt_final_vaes_cmp_tag_done: ret AES_GCM_decrypt_final_vaes ENDP _TEXT ENDS +IFDEF WOLFSSL_AESGCM_SIV +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_siv_bswap_mask_vaes QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_aes_gcm_siv_bswap_mask_vaes QWORD L_aes_gcm_siv_bswap_mask_vaes +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_GCMSIV_polyval_vaes PROC + push r12 + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu xmm1, OWORD PTR [rdx] + vmovdqu xmm0, OWORD PTR [rcx] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_siv_bswap_mask_vaes + vpclmulqdq xmm14, xmm1, xmm1, 0 + vpclmulqdq xmm15, xmm1, xmm1, 17 + vpclmulqdq xmm13, xmm1, xmm1, 16 + vpclmulqdq xmm6, xmm1, xmm1, 1 + vpxor xmm13, xmm13, xmm6 + vpslldq xmm6, xmm13, 8 + vpsrldq xmm13, xmm13, 8 + vpxor xmm14, xmm14, xmm6 + vpxor xmm15, xmm15, xmm13 + vpsrld xmm9, xmm14, 31 + vpsrld xmm10, xmm15, 31 + vpslld xmm14, 
xmm14, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpor xmm14, xmm14, xmm9 + vpor xmm15, xmm15, xmm10 + vpor xmm15, xmm15, xmm11 + vpslld xmm9, xmm14, 31 + vpslld xmm10, xmm14, 30 + vpslld xmm11, xmm14, 25 + vpxor xmm9, xmm9, xmm10 + vpxor xmm9, xmm9, xmm11 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxor xmm14, xmm14, xmm9 + vpsrld xmm6, xmm14, 1 + vpsrld xmm7, xmm14, 2 + vpsrld xmm8, xmm14, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm8 + vpxor xmm6, xmm6, xmm10 + vpxor xmm14, xmm14, xmm6 + vpxor xmm2, xmm15, xmm14 + vinserti128 ymm2, ymm2, xmm1, 1 + vpclmulqdq xmm14, xmm2, xmm2, 0 + vpclmulqdq xmm15, xmm2, xmm2, 17 + vpclmulqdq xmm13, xmm2, xmm2, 16 + vpclmulqdq xmm6, xmm2, xmm2, 1 + vpxor xmm13, xmm13, xmm6 + vpslldq xmm6, xmm13, 8 + vpsrldq xmm13, xmm13, 8 + vpxor xmm14, xmm14, xmm6 + vpxor xmm15, xmm15, xmm13 + vpsrld xmm9, xmm14, 31 + vpsrld xmm10, xmm15, 31 + vpslld xmm14, xmm14, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpor xmm14, xmm14, xmm9 + vpor xmm15, xmm15, xmm10 + vpor xmm15, xmm15, xmm11 + vpslld xmm9, xmm14, 31 + vpslld xmm10, xmm14, 30 + vpslld xmm11, xmm14, 25 + vpxor xmm9, xmm9, xmm10 + vpxor xmm9, xmm9, xmm11 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxor xmm14, xmm14, xmm9 + vpsrld xmm6, xmm14, 1 + vpsrld xmm7, xmm14, 2 + vpsrld xmm8, xmm14, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm8 + vpxor xmm6, xmm6, xmm10 + vpxor xmm14, xmm14, xmm6 + vpxor xmm3, xmm15, xmm14 + vpclmulqdq xmm14, xmm2, xmm1, 0 + vpclmulqdq xmm15, xmm2, xmm1, 17 + vpclmulqdq xmm13, xmm2, xmm1, 16 + vpclmulqdq xmm6, xmm2, xmm1, 1 + vpxor xmm13, xmm13, xmm6 + vpslldq xmm6, xmm13, 8 + vpsrldq xmm13, xmm13, 8 + vpxor xmm14, xmm14, xmm6 + vpxor xmm15, xmm15, xmm13 + vpsrld xmm9, xmm14, 31 + vpsrld xmm10, xmm15, 31 + vpslld xmm14, xmm14, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq 
xmm9, xmm9, 4 + vpor xmm14, xmm14, xmm9 + vpor xmm15, xmm15, xmm10 + vpor xmm15, xmm15, xmm11 + vpslld xmm9, xmm14, 31 + vpslld xmm10, xmm14, 30 + vpslld xmm11, xmm14, 25 + vpxor xmm9, xmm9, xmm10 + vpxor xmm9, xmm9, xmm11 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxor xmm14, xmm14, xmm9 + vpsrld xmm6, xmm14, 1 + vpsrld xmm7, xmm14, 2 + vpsrld xmm8, xmm14, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm8 + vpxor xmm6, xmm6, xmm10 + vpxor xmm14, xmm14, xmm6 + vpxor xmm12, xmm15, xmm14 + vinserti128 ymm3, ymm3, xmm12, 1 + vpclmulqdq xmm14, xmm3, xmm2, 0 + vpclmulqdq xmm15, xmm3, xmm2, 17 + vpclmulqdq xmm13, xmm3, xmm2, 16 + vpclmulqdq xmm6, xmm3, xmm2, 1 + vpxor xmm13, xmm13, xmm6 + vpslldq xmm6, xmm13, 8 + vpsrldq xmm13, xmm13, 8 + vpxor xmm14, xmm14, xmm6 + vpxor xmm15, xmm15, xmm13 + vpsrld xmm9, xmm14, 31 + vpsrld xmm10, xmm15, 31 + vpslld xmm14, xmm14, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpor xmm14, xmm14, xmm9 + vpor xmm15, xmm15, xmm10 + vpor xmm15, xmm15, xmm11 + vpslld xmm9, xmm14, 31 + vpslld xmm10, xmm14, 30 + vpslld xmm11, xmm14, 25 + vpxor xmm9, xmm9, xmm10 + vpxor xmm9, xmm9, xmm11 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxor xmm14, xmm14, xmm9 + vpsrld xmm6, xmm14, 1 + vpsrld xmm7, xmm14, 2 + vpsrld xmm8, xmm14, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm8 + vpxor xmm6, xmm6, xmm10 + vpxor xmm14, xmm14, xmm6 + vpxor xmm4, xmm15, xmm14 + vpclmulqdq xmm14, xmm3, xmm1, 0 + vpclmulqdq xmm15, xmm3, xmm1, 17 + vpclmulqdq xmm13, xmm3, xmm1, 16 + vpclmulqdq xmm6, xmm3, xmm1, 1 + vpxor xmm13, xmm13, xmm6 + vpslldq xmm6, xmm13, 8 + vpsrldq xmm13, xmm13, 8 + vpxor xmm14, xmm14, xmm6 + vpxor xmm15, xmm15, xmm13 + vpsrld xmm9, xmm14, 31 + vpsrld xmm10, xmm15, 31 + vpslld xmm14, xmm14, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpor xmm14, xmm14, xmm9 + vpor xmm15, xmm15, xmm10 + vpor xmm15, xmm15, xmm11 
+ vpslld xmm9, xmm14, 31 + vpslld xmm10, xmm14, 30 + vpslld xmm11, xmm14, 25 + vpxor xmm9, xmm9, xmm10 + vpxor xmm9, xmm9, xmm11 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxor xmm14, xmm14, xmm9 + vpsrld xmm6, xmm14, 1 + vpsrld xmm7, xmm14, 2 + vpsrld xmm8, xmm14, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm8 + vpxor xmm6, xmm6, xmm10 + vpxor xmm14, xmm14, xmm6 + vpxor xmm13, xmm15, xmm14 + vinserti128 ymm4, ymm4, xmm13, 1 + vpclmulqdq xmm14, xmm3, xmm3, 0 + vpclmulqdq xmm15, xmm3, xmm3, 17 + vpclmulqdq xmm13, xmm3, xmm3, 16 + vpclmulqdq xmm6, xmm3, xmm3, 1 + vpxor xmm13, xmm13, xmm6 + vpslldq xmm6, xmm13, 8 + vpsrldq xmm13, xmm13, 8 + vpxor xmm14, xmm14, xmm6 + vpxor xmm15, xmm15, xmm13 + vpsrld xmm9, xmm14, 31 + vpsrld xmm10, xmm15, 31 + vpslld xmm14, xmm14, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpor xmm14, xmm14, xmm9 + vpor xmm15, xmm15, xmm10 + vpor xmm15, xmm15, xmm11 + vpslld xmm9, xmm14, 31 + vpslld xmm10, xmm14, 30 + vpslld xmm11, xmm14, 25 + vpxor xmm9, xmm9, xmm10 + vpxor xmm9, xmm9, xmm11 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxor xmm14, xmm14, xmm9 + vpsrld xmm6, xmm14, 1 + vpsrld xmm7, xmm14, 2 + vpsrld xmm8, xmm14, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm8 + vpxor xmm6, xmm6, xmm10 + vpxor xmm14, xmm14, xmm6 + vpxor xmm5, xmm15, xmm14 + vpclmulqdq xmm14, xmm3, xmm12, 0 + vpclmulqdq xmm15, xmm3, xmm12, 17 + vpclmulqdq xmm13, xmm3, xmm12, 16 + vpclmulqdq xmm6, xmm3, xmm12, 1 + vpxor xmm13, xmm13, xmm6 + vpslldq xmm6, xmm13, 8 + vpsrldq xmm13, xmm13, 8 + vpxor xmm14, xmm14, xmm6 + vpxor xmm15, xmm15, xmm13 + vpsrld xmm9, xmm14, 31 + vpsrld xmm10, xmm15, 31 + vpslld xmm14, xmm14, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpor xmm14, xmm14, xmm9 + vpor xmm15, xmm15, xmm10 + vpor xmm15, xmm15, xmm11 + vpslld xmm9, xmm14, 31 + vpslld xmm10, xmm14, 30 + vpslld xmm11, xmm14, 25 + vpxor 
xmm9, xmm9, xmm10 + vpxor xmm9, xmm9, xmm11 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxor xmm14, xmm14, xmm9 + vpsrld xmm6, xmm14, 1 + vpsrld xmm7, xmm14, 2 + vpsrld xmm8, xmm14, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm8 + vpxor xmm6, xmm6, xmm10 + vpxor xmm14, xmm14, xmm6 + vpxor xmm13, xmm15, xmm14 + vinserti128 ymm5, ymm5, xmm13, 1 + mov r10d, r9d + shl r10d, 4 + xor eax, eax + mov r11d, r10d + and r11d, 4294967168 + cmp eax, r11d + je L_AES_GCMSIV_polyval_vaes_eight_done +L_AES_GCMSIV_polyval_vaes_eight: + lea r12, QWORD PTR [r8+rax] + vmovdqu ymm6, YMMWORD PTR [r12] + vmovdqu ymm7, YMMWORD PTR [r12+32] + vmovdqu ymm8, YMMWORD PTR [r12+64] + vmovdqu ymm9, YMMWORD PTR [r12+96] + vpxor ymm12, ymm12, ymm12 + vinserti128 ymm12, ymm12, xmm0, 0 + vpxor ymm6, ymm6, ymm12 + vpclmulqdq ymm10, ymm6, ymm5, 0 + vpclmulqdq ymm11, ymm6, ymm5, 17 + vpclmulqdq ymm14, ymm6, ymm5, 16 + vpclmulqdq ymm15, ymm6, ymm5, 1 + vpxor ymm14, ymm14, ymm15 + vpslldq ymm15, ymm14, 8 + vpsrldq ymm14, ymm14, 8 + vpxor ymm10, ymm10, ymm15 + vpxor ymm11, ymm11, ymm14 + vpclmulqdq ymm12, ymm7, ymm4, 0 + vpclmulqdq ymm13, ymm7, ymm4, 17 + vpclmulqdq ymm14, ymm7, ymm4, 16 + vpclmulqdq ymm15, ymm7, ymm4, 1 + vpxor ymm14, ymm14, ymm15 + vpslldq ymm15, ymm14, 8 + vpsrldq ymm14, ymm14, 8 + vpxor ymm12, ymm12, ymm15 + vpxor ymm13, ymm13, ymm14 + vpxor ymm10, ymm10, ymm12 + vpxor ymm11, ymm11, ymm13 + vpclmulqdq ymm12, ymm8, ymm3, 0 + vpclmulqdq ymm13, ymm8, ymm3, 17 + vpclmulqdq ymm14, ymm8, ymm3, 16 + vpclmulqdq ymm15, ymm8, ymm3, 1 + vpxor ymm14, ymm14, ymm15 + vpslldq ymm15, ymm14, 8 + vpsrldq ymm14, ymm14, 8 + vpxor ymm12, ymm12, ymm15 + vpxor ymm13, ymm13, ymm14 + vpxor ymm10, ymm10, ymm12 + vpxor ymm11, ymm11, ymm13 + vpclmulqdq ymm12, ymm9, ymm2, 0 + vpclmulqdq ymm13, ymm9, ymm2, 17 + vpclmulqdq ymm14, ymm9, ymm2, 16 + vpclmulqdq ymm15, ymm9, ymm2, 1 + vpxor ymm14, ymm14, ymm15 + vpslldq ymm15, ymm14, 8 + vpsrldq ymm14, ymm14, 8 + vpxor ymm12, ymm12, ymm15 + vpxor ymm13, 
ymm13, ymm14 + vpxor ymm10, ymm10, ymm12 + vpxor ymm11, ymm11, ymm13 + vextracti128 xmm12, ymm10, 1 + vpxor xmm14, xmm10, xmm12 + vextracti128 xmm12, ymm11, 1 + vpxor xmm15, xmm11, xmm12 + vpsrld xmm9, xmm14, 31 + vpsrld xmm10, xmm15, 31 + vpslld xmm14, xmm14, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpor xmm14, xmm14, xmm9 + vpor xmm15, xmm15, xmm10 + vpor xmm15, xmm15, xmm11 + vpslld xmm9, xmm14, 31 + vpslld xmm10, xmm14, 30 + vpslld xmm11, xmm14, 25 + vpxor xmm9, xmm9, xmm10 + vpxor xmm9, xmm9, xmm11 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxor xmm14, xmm14, xmm9 + vpsrld xmm6, xmm14, 1 + vpsrld xmm7, xmm14, 2 + vpsrld xmm8, xmm14, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm8 + vpxor xmm6, xmm6, xmm10 + vpxor xmm14, xmm14, xmm6 + vpxor xmm0, xmm15, xmm14 + add eax, 128 + cmp eax, r11d + jl L_AES_GCMSIV_polyval_vaes_eight +L_AES_GCMSIV_polyval_vaes_eight_done: + mov r11d, r10d + and r11d, 4294967232 + cmp eax, r11d + je L_AES_GCMSIV_polyval_vaes_four_done +L_AES_GCMSIV_polyval_vaes_four: + lea r12, QWORD PTR [r8+rax] + vmovdqu ymm6, YMMWORD PTR [r12] + vmovdqu ymm7, YMMWORD PTR [r12+32] + vpxor ymm12, ymm12, ymm12 + vinserti128 ymm12, ymm12, xmm0, 0 + vpxor ymm6, ymm6, ymm12 + vpclmulqdq ymm10, ymm6, ymm3, 0 + vpclmulqdq ymm11, ymm6, ymm3, 17 + vpclmulqdq ymm14, ymm6, ymm3, 16 + vpclmulqdq ymm15, ymm6, ymm3, 1 + vpxor ymm14, ymm14, ymm15 + vpslldq ymm15, ymm14, 8 + vpsrldq ymm14, ymm14, 8 + vpxor ymm10, ymm10, ymm15 + vpxor ymm11, ymm11, ymm14 + vpclmulqdq ymm12, ymm7, ymm2, 0 + vpclmulqdq ymm13, ymm7, ymm2, 17 + vpclmulqdq ymm14, ymm7, ymm2, 16 + vpclmulqdq ymm15, ymm7, ymm2, 1 + vpxor ymm14, ymm14, ymm15 + vpslldq ymm15, ymm14, 8 + vpsrldq ymm14, ymm14, 8 + vpxor ymm12, ymm12, ymm15 + vpxor ymm13, ymm13, ymm14 + vpxor ymm10, ymm10, ymm12 + vpxor ymm11, ymm11, ymm13 + vextracti128 xmm12, ymm10, 1 + vpxor xmm14, xmm10, xmm12 + vextracti128 xmm12, ymm11, 1 + vpxor xmm15, xmm11, 
xmm12 + vpsrld xmm9, xmm14, 31 + vpsrld xmm10, xmm15, 31 + vpslld xmm14, xmm14, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpor xmm14, xmm14, xmm9 + vpor xmm15, xmm15, xmm10 + vpor xmm15, xmm15, xmm11 + vpslld xmm9, xmm14, 31 + vpslld xmm10, xmm14, 30 + vpslld xmm11, xmm14, 25 + vpxor xmm9, xmm9, xmm10 + vpxor xmm9, xmm9, xmm11 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxor xmm14, xmm14, xmm9 + vpsrld xmm6, xmm14, 1 + vpsrld xmm7, xmm14, 2 + vpsrld xmm8, xmm14, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm8 + vpxor xmm6, xmm6, xmm10 + vpxor xmm14, xmm14, xmm6 + vpxor xmm0, xmm15, xmm14 + add eax, 64 + cmp eax, r11d + jl L_AES_GCMSIV_polyval_vaes_four +L_AES_GCMSIV_polyval_vaes_four_done: + mov r11d, r10d + and r11d, 4294967264 + cmp eax, r11d + je L_AES_GCMSIV_polyval_vaes_pair_done +L_AES_GCMSIV_polyval_vaes_pair: + vmovdqu ymm6, YMMWORD PTR [r8+rax] + vpxor ymm12, ymm12, ymm12 + vinserti128 ymm12, ymm12, xmm0, 0 + vpxor ymm6, ymm6, ymm12 + vpclmulqdq ymm10, ymm6, ymm2, 0 + vpclmulqdq ymm11, ymm6, ymm2, 17 + vpclmulqdq ymm14, ymm6, ymm2, 16 + vpclmulqdq ymm15, ymm6, ymm2, 1 + vpxor ymm14, ymm14, ymm15 + vpslldq ymm15, ymm14, 8 + vpsrldq ymm14, ymm14, 8 + vpxor ymm10, ymm10, ymm15 + vpxor ymm11, ymm11, ymm14 + vextracti128 xmm12, ymm10, 1 + vpxor xmm14, xmm10, xmm12 + vextracti128 xmm12, ymm11, 1 + vpxor xmm15, xmm11, xmm12 + vpsrld xmm9, xmm14, 31 + vpsrld xmm10, xmm15, 31 + vpslld xmm14, xmm14, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpor xmm14, xmm14, xmm9 + vpor xmm15, xmm15, xmm10 + vpor xmm15, xmm15, xmm11 + vpslld xmm9, xmm14, 31 + vpslld xmm10, xmm14, 30 + vpslld xmm11, xmm14, 25 + vpxor xmm9, xmm9, xmm10 + vpxor xmm9, xmm9, xmm11 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxor xmm14, xmm14, xmm9 + vpsrld xmm6, xmm14, 1 + vpsrld xmm7, xmm14, 2 + vpsrld xmm8, xmm14, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, 
xmm6, xmm8 + vpxor xmm6, xmm6, xmm10 + vpxor xmm14, xmm14, xmm6 + vpxor xmm0, xmm15, xmm14 + add eax, 32 + cmp eax, r11d + jl L_AES_GCMSIV_polyval_vaes_pair +L_AES_GCMSIV_polyval_vaes_pair_done: + cmp eax, r10d + je L_AES_GCMSIV_polyval_vaes_done + vpxor xmm0, xmm0, [r8+rax] + vpclmulqdq xmm14, xmm0, xmm1, 0 + vpclmulqdq xmm15, xmm0, xmm1, 17 + vpclmulqdq xmm12, xmm0, xmm1, 16 + vpclmulqdq xmm6, xmm0, xmm1, 1 + vpxor xmm12, xmm12, xmm6 + vpslldq xmm6, xmm12, 8 + vpsrldq xmm12, xmm12, 8 + vpxor xmm14, xmm14, xmm6 + vpxor xmm15, xmm15, xmm12 + vpsrld xmm9, xmm14, 31 + vpsrld xmm10, xmm15, 31 + vpslld xmm14, xmm14, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpor xmm14, xmm14, xmm9 + vpor xmm15, xmm15, xmm10 + vpor xmm15, xmm15, xmm11 + vpslld xmm9, xmm14, 31 + vpslld xmm10, xmm14, 30 + vpslld xmm11, xmm14, 25 + vpxor xmm9, xmm9, xmm10 + vpxor xmm9, xmm9, xmm11 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxor xmm14, xmm14, xmm9 + vpsrld xmm6, xmm14, 1 + vpsrld xmm7, xmm14, 2 + vpsrld xmm8, xmm14, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm8 + vpxor xmm6, xmm6, xmm10 + vpxor xmm14, xmm14, xmm6 + vpxor xmm0, xmm15, xmm14 +L_AES_GCMSIV_polyval_vaes_done: + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_siv_bswap_mask_vaes + vmovdqu OWORD PTR [rcx], xmm0 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop r12 + ret +AES_GCMSIV_polyval_vaes ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcmsiv_ctr_inc_vaes QWORD \ + 0000000000000000h, 0000000000000000h, + 0000000000000001h, 0000000000000000h, + 0000000000000002h, 0000000000000000h, + 0000000000000003h, 0000000000000000h, + 
0000000000000004h, 0000000000000000h,
+    0000000000000005h, 0000000000000000h,
+    0000000000000006h, 0000000000000000h,
+    0000000000000007h, 0000000000000000h,
+    0000000000000008h, 0000000000000000h,
+    0000000000000009h, 0000000000000000h,
+    000000000000000ah, 0000000000000000h,
+    000000000000000bh, 0000000000000000h,
+    000000000000000ch, 0000000000000000h,
+    000000000000000dh, 0000000000000000h,
+    000000000000000eh, 0000000000000000h,
+    000000000000000fh, 0000000000000000h,
+    0000000000000010h, 0000000000000000h
+ptr_L_aes_gcmsiv_ctr_inc_vaes QWORD L_aes_gcmsiv_ctr_inc_vaes
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCMSIV_ctr_vaes PROC ; CTR XOR: rcx = in, rdx = out, r8d = byte count, r9 = round keys, stack arg 5 = round count, stack arg 6 = 16-byte counter block (read on entry, written back on exit)
+    push rbx
+    mov r10, QWORD PTR [rsp+56] ; 6th arg: counter block pointer (r10 is reused as the loop bound below)
+    sub rsp, 96 ; after this, stack arg 5 is at [rsp+144] and stack arg 6 at [rsp+152]
+    vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm11 are nonvolatile in the Windows x64 ABI
+    vmovdqu OWORD PTR [rsp+16], xmm7
+    vmovdqu OWORD PTR [rsp+32], xmm8
+    vmovdqu OWORD PTR [rsp+48], xmm9
+    vmovdqu OWORD PTR [rsp+64], xmm10
+    vmovdqu OWORD PTR [rsp+80], xmm11
+    vbroadcasti128 ymm7, [r10] ; ymm7 = counter block in both 128-bit lanes
+    vbroadcasti128 ymm8, OWORD PTR [L_aes_gcmsiv_ctr_inc_vaes+128] ; table entry 8: +8 per lane; index the table itself, not the ptr_ cell that only stores its address
+    vbroadcasti128 ymm9, OWORD PTR [L_aes_gcmsiv_ctr_inc_vaes+32] ; table entry 2: +2 per lane
+    vbroadcasti128 ymm10, OWORD PTR [L_aes_gcmsiv_ctr_inc_vaes+16] ; table entry 1: +1 per lane
+    xor eax, eax ; eax = byte offset into in/out
+    cmp r8d, 128
+    mov r10d, r8d
+    jl L_AES_GCMSIV_ctr_vaes_done_128
+    and r10d, 4294967168 ; r10d = byte count rounded down to a multiple of 128
+    vpaddd ymm4, ymm7, YMMWORD PTR [L_aes_gcmsiv_ctr_inc_vaes] ; lanes +0, +1
+    vpaddd ymm5, ymm7, YMMWORD PTR [L_aes_gcmsiv_ctr_inc_vaes+32] ; lanes +2, +3
+    vpaddd ymm6, ymm7, YMMWORD PTR [L_aes_gcmsiv_ctr_inc_vaes+64] ; lanes +4, +5
+    vpaddd ymm7, ymm7, YMMWORD PTR [L_aes_gcmsiv_ctr_inc_vaes+96] ; lanes +6, +7
+L_AES_GCMSIV_ctr_vaes_enc_128:
+    ; 128 bytes of input
+    lea r11, QWORD PTR [rcx+rax]
+    lea rbx, QWORD PTR [rdx+rax]
+    vmovdqa ymm0, ymm4
+    vmovdqa ymm1, ymm5
+    vmovdqa ymm2, ymm6
+    vmovdqa ymm3, ymm7
+    vpaddd ymm4, ymm4, ymm8
+    vpaddd ymm5, ymm5, ymm8
+    vpaddd ymm6, ymm6, ymm8
+    vpaddd ymm7, ymm7, ymm8
+    ; aes_enc_block
+    vbroadcasti128 ymm11, [r9]
+    vpxor ymm0, ymm0, ymm11
+    vpxor ymm1, ymm1, ymm11
+    vpxor ymm2, ymm2, ymm11
+    vpxor ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+16]
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+32]
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+48]
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+64]
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+80]
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+96]
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+112]
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+128]
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+144]
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    cmp DWORD PTR [rsp+144], 11 ; round count (stack arg 5); eax cannot hold it because eax is the byte offset
+    vbroadcasti128 ymm11, [r9+160]
+    jl L_AES_GCMSIV_ctr_vaes_128_aes_enc_block_last
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+176]
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    cmp DWORD PTR [rsp+144], 13 ; round count (stack arg 5)
+    vbroadcasti128 ymm11, [r9+192]
+    jl L_AES_GCMSIV_ctr_vaes_128_aes_enc_block_last
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+208]
+    vaesenc ymm0, ymm0, ymm11
+    vaesenc ymm1, ymm1, ymm11
+    vaesenc ymm2, ymm2, ymm11
+    vaesenc ymm3, ymm3, ymm11
+    vbroadcasti128 ymm11, [r9+224]
+L_AES_GCMSIV_ctr_vaes_128_aes_enc_block_last:
+    vaesenclast ymm0, ymm0, ymm11
+    vaesenclast ymm1, ymm1, ymm11
+    vaesenclast ymm2, ymm2, ymm11
+    vaesenclast ymm3, ymm3, ymm11
+    vpxor ymm0, ymm0, [r11]
+    vpxor ymm1, ymm1, [r11+32]
+    vpxor ymm2, ymm2, [r11+64]
+    vpxor ymm3, ymm3, [r11+96]
+    vmovdqu YMMWORD PTR [rbx], ymm0
+    vmovdqu YMMWORD PTR [rbx+32], ymm1
+    vmovdqu YMMWORD PTR [rbx+64], ymm2
+    vmovdqu YMMWORD PTR [rbx+96], ymm3
+    add eax, 128
+    cmp eax, r10d
+    jl L_AES_GCMSIV_ctr_vaes_enc_128
+    vperm2i128 ymm7, ymm4, ymm4, 0 ; ymm7 = next unused counter (low lane of ymm4) in both lanes
+L_AES_GCMSIV_ctr_vaes_done_128:
+    mov r10d, r8d
+    and r10d, 4294967264 ; r10d = byte count rounded down to a multiple of 32
+    cmp eax, r10d
+    je L_AES_GCMSIV_ctr_vaes_done_32
+L_AES_GCMSIV_ctr_vaes_enc_32:
+    ; 32 bytes of input
+    ; siv_aes_ctr_enc_32
+    lea r11, QWORD PTR [rcx+rax]
+    lea rbx, QWORD PTR [rdx+rax]
+    vpaddd ymm0, ymm7, YMMWORD PTR [L_aes_gcmsiv_ctr_inc_vaes] ; lanes +0, +1 (table referenced directly)
+    vpaddd ymm7, ymm7, ymm9
+    ; aes_enc_block
+    vbroadcasti128 ymm11, [r9]
+    vpxor ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+16]
+    vaesenc ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+32]
+    vaesenc ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+48]
+    vaesenc ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+64]
+    vaesenc ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+80]
+    vaesenc ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+96]
+    vaesenc ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+112]
+    vaesenc ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+128]
+    vaesenc ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+144]
+    vaesenc ymm0, ymm0, ymm11
+    cmp DWORD PTR [rsp+144], 11 ; round count (stack arg 5)
+    vbroadcasti128 ymm11, [r9+160]
+    jl L_AES_GCMSIV_ctr_vaes_32_aes_enc_block_last
+    vaesenc ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+176]
+    vaesenc ymm0, ymm0, ymm11
+    cmp DWORD PTR [rsp+144], 13 ; round count (stack arg 5)
+    vbroadcasti128 ymm11, [r9+192]
+    jl L_AES_GCMSIV_ctr_vaes_32_aes_enc_block_last
+    vaesenc ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+208]
+    vaesenc ymm0, ymm0, ymm11
+    vbroadcasti128 ymm11, [r9+224]
+L_AES_GCMSIV_ctr_vaes_32_aes_enc_block_last:
+    vaesenclast ymm0, ymm0, ymm11
+    vpxor ymm0, ymm0, [r11]
+    vmovdqu YMMWORD PTR [rbx], ymm0
+    add eax, 32
+    cmp eax, r10d
+    jl L_AES_GCMSIV_ctr_vaes_enc_32
+L_AES_GCMSIV_ctr_vaes_done_32:
+    cmp eax, r8d
+    mov r10d, r8d
+    je L_AES_GCMSIV_ctr_vaes_done_enc
+    and r10d, 4294967280 ; r10d = byte count rounded down to a multiple of 16
+L_AES_GCMSIV_ctr_vaes_enc_16:
+    ; 16 bytes of input
+    vmovdqa xmm0, xmm7
+    vpaddd ymm7, ymm7, ymm10
+    ; aes_enc_block
+    vpxor xmm0, xmm0, [r9]
+    vmovdqu xmm5, OWORD PTR [r9+16]
+    vaesenc xmm0, xmm0, xmm5
+    vmovdqu xmm5, OWORD PTR [r9+32]
+    vaesenc xmm0, xmm0, xmm5
+    vmovdqu xmm5, OWORD PTR [r9+48]
+    vaesenc xmm0, xmm0, xmm5
+    vmovdqu xmm5, OWORD PTR [r9+64]
+    vaesenc xmm0, xmm0, xmm5
+    vmovdqu xmm5, OWORD PTR [r9+80]
+    vaesenc xmm0, xmm0, xmm5
+    vmovdqu xmm5, OWORD PTR [r9+96]
+    vaesenc xmm0, xmm0, xmm5
+    vmovdqu xmm5, OWORD PTR [r9+112]
+    vaesenc xmm0, xmm0, xmm5
+    vmovdqu xmm5, OWORD PTR [r9+128]
+    vaesenc xmm0, xmm0, xmm5
+    vmovdqu xmm5, OWORD PTR [r9+144]
+    vaesenc xmm0, xmm0, xmm5
+    cmp DWORD PTR [rsp+144], 11 ; round count (stack arg 5)
+    vmovdqu xmm5, OWORD PTR [r9+160]
+    jl L_AES_GCMSIV_ctr_vaes_16_aes_enc_block_last
+    vaesenc xmm0, xmm0, xmm5
+    vmovdqu xmm6, OWORD PTR [r9+176]
+    vaesenc xmm0, xmm0, xmm6
+    cmp DWORD PTR [rsp+144], 13 ; round count (stack arg 5)
+    vmovdqu xmm5, OWORD PTR [r9+192]
+    jl L_AES_GCMSIV_ctr_vaes_16_aes_enc_block_last
+    vaesenc xmm0, xmm0, xmm5
+    vmovdqu xmm6, OWORD PTR [r9+208]
+    vaesenc xmm0, xmm0, xmm6
+    vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_GCMSIV_ctr_vaes_16_aes_enc_block_last:
+    vaesenclast xmm0, xmm0, xmm5
+    lea r11, QWORD PTR [rcx+rax]
+    vpxor xmm0, xmm0, [r11]
+    lea r11, QWORD PTR [rdx+rax]
+    vmovdqu OWORD PTR [r11], xmm0
+    add eax, 16
+    cmp eax, r10d
+    jl L_AES_GCMSIV_ctr_vaes_enc_16
+L_AES_GCMSIV_ctr_vaes_done_enc:
+    mov r10, QWORD PTR [rsp+152] ; reload the counter block pointer (stack arg 6); r10 was overwritten by the loop bound
+    vmovdqu OWORD PTR [r10], xmm7 ; write the advanced counter back to the caller
+    vmovdqu xmm6, OWORD PTR [rsp]
+    vmovdqu xmm7, OWORD PTR [rsp+16]
+    vmovdqu xmm8, OWORD PTR [rsp+32]
+    vmovdqu xmm9, OWORD PTR [rsp+48]
+    vmovdqu xmm10, OWORD PTR [rsp+64]
+    vmovdqu xmm11, OWORD PTR [rsp+80]
+    add rsp, 96
+    pop rbx
+    ret
+AES_GCMSIV_ctr_vaes ENDP
+_TEXT ENDS
+ENDIF
 ENDIF
 IFDEF HAVE_INTEL_AVX512
 _DATA SEGMENT
@@ -30666,5
+32492,1018 @@ L_AES_GCM_decrypt_final_avx512_cmp_tag_done: ret AES_GCM_decrypt_final_avx512 ENDP _TEXT ENDS +IFDEF WOLFSSL_AESGCM_SIV +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_siv_bswap_mask_avx512 QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_aes_gcm_siv_bswap_mask_avx512 QWORD L_aes_gcm_siv_bswap_mask_avx512 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_GCMSIV_polyval_avx512 PROC + push r12 + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu xmm1, OWORD PTR [rdx] + vmovdqu xmm0, OWORD PTR [rcx] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_siv_bswap_mask_avx512 + vmovdqa64 xmm20, xmm1 + vinserti32x4 zmm2, zmm2, xmm20, 3 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm2, zmm2, xmm20, 2 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, 
xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm2, zmm2, xmm20, 1 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm2, zmm2, xmm20, 0 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 
+ vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm3, zmm3, xmm20, 3 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm3, zmm3, xmm20, 2 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld 
xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm3, zmm3, xmm20, 1 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm3, zmm3, xmm20, 0 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + 
vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm4, zmm4, xmm20, 3 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm4, zmm4, xmm20, 2 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + 
vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm4, zmm4, xmm20, 1 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm4, zmm4, xmm20, 0 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq 
xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm5, zmm5, xmm20, 3 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm5, zmm5, xmm20, 2 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 
150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm5, zmm5, xmm20, 1 + vpclmulqdq xmm17, xmm20, xmm1, 0 + vpclmulqdq xmm18, xmm20, xmm1, 17 + vpclmulqdq xmm19, xmm20, xmm1, 16 + vpclmulqdq xmm6, xmm20, xmm1, 1 + vpxorq xmm19, xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm20, xmm18, xmm17 + vinserti32x4 zmm5, zmm5, xmm20, 0 + mov r10d, r9d + shl r10d, 4 + xor eax, eax + mov r11d, r10d + and r11d, 4294967040 + cmp eax, r11d + je L_AES_GCMSIV_polyval_avx512_sixteen_done +L_AES_GCMSIV_polyval_avx512_sixteen: + lea r12, QWORD PTR [r8+rax] + vmovdqu64 zmm6, [r12] + vmovdqu64 zmm7, [r12+64] + vmovdqu64 zmm8, [r12+128] + vmovdqu64 zmm9, [r12+192] + vpxorq zmm16, zmm16, zmm16 + vinserti32x4 zmm16, zmm16, xmm0, 0 + vpxorq zmm6, zmm6, zmm16 + vpclmulqdq zmm10, zmm6, zmm5, 0 + vpclmulqdq zmm11, zmm6, zmm5, 17 + vpclmulqdq zmm14, zmm6, zmm5, 16 + vpclmulqdq zmm15, zmm6, zmm5, 1 + vpxorq zmm14, zmm14, zmm15 + vpslldq zmm15, zmm14, 8 + vpsrldq zmm14, zmm14, 8 + vpxorq zmm10, zmm10, zmm15 + vpxorq zmm11, zmm11, zmm14 + vpclmulqdq zmm12, zmm7, zmm4, 0 + vpclmulqdq zmm13, zmm7, zmm4, 17 + vpclmulqdq zmm14, zmm7, zmm4, 16 + vpclmulqdq zmm15, zmm7, zmm4, 1 + vpxorq zmm14, zmm14, zmm15 + vpslldq zmm15, zmm14, 8 + vpsrldq zmm14, zmm14, 8 + vpxorq zmm12, zmm12, 
zmm15 + vpxorq zmm13, zmm13, zmm14 + vpxorq zmm10, zmm10, zmm12 + vpxorq zmm11, zmm11, zmm13 + vpclmulqdq zmm12, zmm8, zmm3, 0 + vpclmulqdq zmm13, zmm8, zmm3, 17 + vpclmulqdq zmm14, zmm8, zmm3, 16 + vpclmulqdq zmm15, zmm8, zmm3, 1 + vpxorq zmm14, zmm14, zmm15 + vpslldq zmm15, zmm14, 8 + vpsrldq zmm14, zmm14, 8 + vpxorq zmm12, zmm12, zmm15 + vpxorq zmm13, zmm13, zmm14 + vpxorq zmm10, zmm10, zmm12 + vpxorq zmm11, zmm11, zmm13 + vpclmulqdq zmm12, zmm9, zmm2, 0 + vpclmulqdq zmm13, zmm9, zmm2, 17 + vpclmulqdq zmm14, zmm9, zmm2, 16 + vpclmulqdq zmm15, zmm9, zmm2, 1 + vpxorq zmm14, zmm14, zmm15 + vpslldq zmm15, zmm14, 8 + vpsrldq zmm14, zmm14, 8 + vpxorq zmm12, zmm12, zmm15 + vpxorq zmm13, zmm13, zmm14 + vpxorq zmm10, zmm10, zmm12 + vpxorq zmm11, zmm11, zmm13 + vextracti32x4 xmm19, zmm10, 1 + vextracti32x4 xmm21, zmm10, 2 + vextracti32x4 xmm22, zmm10, 3 + vpxorq xmm17, xmm10, xmm19 + vpternlogq xmm17, xmm22, xmm21, 150 + vextracti32x4 xmm19, zmm11, 1 + vextracti32x4 xmm21, zmm11, 2 + vextracti32x4 xmm22, zmm11, 3 + vpxorq xmm18, xmm11, xmm19 + vpternlogq xmm18, xmm22, xmm21, 150 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm0, xmm18, xmm17 + add eax, 256 + cmp eax, r11d + jl L_AES_GCMSIV_polyval_avx512_sixteen +L_AES_GCMSIV_polyval_avx512_sixteen_done: + mov r11d, r10d + and r11d, 4294967168 + cmp eax, r11d + jge L_AES_GCMSIV_polyval_avx512_eight_done + lea r12, QWORD PTR [r8+rax] + vmovdqu64 zmm6, [r12] + vmovdqu64 
zmm7, [r12+64] + vpxorq zmm16, zmm16, zmm16 + vinserti32x4 zmm16, zmm16, xmm0, 0 + vpxorq zmm6, zmm6, zmm16 + vpclmulqdq zmm10, zmm6, zmm3, 0 + vpclmulqdq zmm11, zmm6, zmm3, 17 + vpclmulqdq zmm14, zmm6, zmm3, 16 + vpclmulqdq zmm15, zmm6, zmm3, 1 + vpxorq zmm14, zmm14, zmm15 + vpslldq zmm15, zmm14, 8 + vpsrldq zmm14, zmm14, 8 + vpxorq zmm10, zmm10, zmm15 + vpxorq zmm11, zmm11, zmm14 + vpclmulqdq zmm12, zmm7, zmm2, 0 + vpclmulqdq zmm13, zmm7, zmm2, 17 + vpclmulqdq zmm14, zmm7, zmm2, 16 + vpclmulqdq zmm15, zmm7, zmm2, 1 + vpxorq zmm14, zmm14, zmm15 + vpslldq zmm15, zmm14, 8 + vpsrldq zmm14, zmm14, 8 + vpxorq zmm12, zmm12, zmm15 + vpxorq zmm13, zmm13, zmm14 + vpxorq zmm10, zmm10, zmm12 + vpxorq zmm11, zmm11, zmm13 + vextracti32x4 xmm19, zmm10, 1 + vextracti32x4 xmm21, zmm10, 2 + vextracti32x4 xmm22, zmm10, 3 + vpxorq xmm17, xmm10, xmm19 + vpternlogq xmm17, xmm22, xmm21, 150 + vextracti32x4 xmm19, zmm11, 1 + vextracti32x4 xmm21, zmm11, 2 + vextracti32x4 xmm22, zmm11, 3 + vpxorq xmm18, xmm11, xmm19 + vpternlogq xmm18, xmm22, xmm21, 150 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm0, xmm18, xmm17 + add eax, 128 +L_AES_GCMSIV_polyval_avx512_eight_done: +L_AES_GCMSIV_polyval_avx512_rem: + cmp eax, r10d + jge L_AES_GCMSIV_polyval_avx512_done + vpxorq xmm0, xmm0, [r8+rax] + vpclmulqdq xmm17, xmm0, xmm1, 0 + vpclmulqdq xmm18, xmm0, xmm1, 17 + vpclmulqdq xmm19, xmm0, xmm1, 16 + vpclmulqdq xmm6, xmm0, xmm1, 1 + vpxorq xmm19, 
xmm19, xmm6 + vpslldq xmm6, xmm19, 8 + vpsrldq xmm19, xmm19, 8 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm18, xmm18, xmm19 + vpsrld xmm9, xmm17, 31 + vpsrld xmm10, xmm18, 31 + vpslld xmm17, xmm17, 1 + vpslld xmm18, xmm18, 1 + vpsrldq xmm11, xmm9, 12 + vpslldq xmm10, xmm10, 4 + vpslldq xmm9, xmm9, 4 + vpxorq xmm17, xmm17, xmm9 + vpternlogq xmm18, xmm11, xmm10, 150 + vpslld xmm9, xmm17, 31 + vpslld xmm10, xmm17, 30 + vpslld xmm11, xmm17, 25 + vpternlogq xmm9, xmm11, xmm10, 150 + vpsrldq xmm10, xmm9, 4 + vpslldq xmm9, xmm9, 12 + vpxorq xmm17, xmm17, xmm9 + vpsrld xmm6, xmm17, 1 + vpsrld xmm7, xmm17, 2 + vpsrld xmm8, xmm17, 7 + vpternlogq xmm6, xmm8, xmm7, 150 + vpxorq xmm6, xmm6, xmm10 + vpxorq xmm17, xmm17, xmm6 + vpxorq xmm0, xmm18, xmm17 + add eax, 16 + jmp L_AES_GCMSIV_polyval_avx512_rem +L_AES_GCMSIV_polyval_avx512_done: + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_siv_bswap_mask_avx512 + vmovdqu OWORD PTR [rcx], xmm0 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop r12 + ret +AES_GCMSIV_polyval_avx512 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcmsiv_ctr_inc_avx512 QWORD \ + 0000000000000000h, 0000000000000000h, + 0000000000000001h, 0000000000000000h, + 0000000000000002h, 0000000000000000h, + 0000000000000003h, 0000000000000000h, + 0000000000000004h, 0000000000000000h, + 0000000000000005h, 0000000000000000h, + 0000000000000006h, 0000000000000000h, + 0000000000000007h, 0000000000000000h, + 0000000000000008h, 0000000000000000h, + 0000000000000009h, 0000000000000000h, + 000000000000000ah, 0000000000000000h, + 000000000000000bh, 0000000000000000h, + 000000000000000ch, 0000000000000000h, + 000000000000000dh, 0000000000000000h, + 
000000000000000eh, 0000000000000000h,
+        000000000000000fh, 0000000000000000h,
+        0000000000000010h, 0000000000000000h
+ptr_L_aes_gcmsiv_ctr_inc_avx512 QWORD L_aes_gcmsiv_ctr_inc_avx512
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCMSIV_ctr_avx512 - XOR the input with an AES counter keystream.
+;
+; Win64 arguments:
+;   rcx       input buffer
+;   rdx       output buffer
+;   r8d       length in bytes (handled in 256-, 64- and 16-byte steps)
+;   r9        expanded AES round keys, 16 bytes per round key
+;   [rsp+40]  AES round count; tested against 11 and 13 to pick the last key
+;   [rsp+48]  16-byte counter block, loaded on entry and stored back on exit
+;
+; The counter is advanced with 32-bit lane adds (vpaddd) of entries from
+; L_aes_gcmsiv_ctr_inc_avx512, so only the low 32-bit word of the block moves.
+;
+; Register use: eax = byte offset, r10d = masked length for the current loop,
+; r11/rbx = input/output cursors. Because eax, r10 and r11 are scratch, the
+; round count and the counter pointer are kept in callee-saved r12d and r13.
+;
+; NOTE(review): the 16-byte loop body runs once whenever the offset differs
+; from the length, so a length with a 1..15 byte remainder after the 64-byte
+; steps reads and writes one whole extra block - callers presumably pass
+; whole blocks only; confirm.
+AES_GCMSIV_ctr_avx512 PROC
+        push rbx
+        push r12
+        push r13
+        ; Three pushes (24 bytes) sit below the return address and the
+        ; 32-byte shadow space, so stack argument 5 is at rsp+64 and 6 at rsp+72.
+        mov r12d, DWORD PTR [rsp+64]
+        mov r13, QWORD PTR [rsp+72]
+        sub rsp, 160
+        ; xmm6-xmm15 are callee-saved in the Win64 ABI.
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD PTR [rsp+112], xmm13
+        vmovdqu OWORD PTR [rsp+128], xmm14
+        vmovdqu OWORD PTR [rsp+144], xmm15
+        ; zmm7 = counter block in all four lanes.
+        vbroadcasti32x4 zmm7, [r13]
+        ; Per-lane increments of 16, 4 and 1 blocks. The table is addressed by
+        ; its own label: ptr_L_aes_gcmsiv_ctr_inc_avx512 is only a QWORD that
+        ; holds the table's address, not the table.
+        vbroadcasti32x4 zmm8, OWORD PTR [L_aes_gcmsiv_ctr_inc_avx512+256]
+        vbroadcasti32x4 zmm9, OWORD PTR [L_aes_gcmsiv_ctr_inc_avx512+64]
+        vbroadcasti32x4 zmm10, OWORD PTR [L_aes_gcmsiv_ctr_inc_avx512+16]
+        xor eax, eax
+        cmp r8d, 64
+        jl L_AES_GCMSIV_ctr_avx512_done_64
+        ; Cache the round keys, each broadcast to all four lanes.
+        vbroadcasti32x4 zmm12, [r9]
+        vbroadcasti32x4 zmm13, [r9+16]
+        vbroadcasti32x4 zmm14, [r9+32]
+        vbroadcasti32x4 zmm15, [r9+48]
+        vbroadcasti32x4 zmm16, [r9+64]
+        vbroadcasti32x4 zmm17, [r9+80]
+        vbroadcasti32x4 zmm18, [r9+96]
+        vbroadcasti32x4 zmm19, [r9+112]
+        vbroadcasti32x4 zmm20, [r9+128]
+        vbroadcasti32x4 zmm21, [r9+144]
+        vbroadcasti32x4 zmm22, [r9+160]
+        cmp r12d, 11
+        jl L_AES_GCMSIV_ctr_avx512_key_cached
+        vbroadcasti32x4 zmm23, [r9+176]
+        vbroadcasti32x4 zmm24, [r9+192]
+        cmp r12d, 13
+        jl L_AES_GCMSIV_ctr_avx512_key_cached
+        vbroadcasti32x4 zmm25, [r9+208]
+        vbroadcasti32x4 zmm26, [r9+224]
+L_AES_GCMSIV_ctr_avx512_key_cached:
+        cmp r8d, 256
+        mov r10d, r8d
+        jl L_AES_GCMSIV_ctr_avx512_done_256
+        and r10d, 4294967040
+        ; zmm4..zmm7 = counter + 0..3, 4..7, 8..11, 12..15.
+        vpaddd zmm4, zmm7, ZMMWORD PTR [L_aes_gcmsiv_ctr_inc_avx512]
+        vpaddd zmm5, zmm7, ZMMWORD PTR [L_aes_gcmsiv_ctr_inc_avx512+64]
+        vpaddd zmm6, zmm7, ZMMWORD PTR [L_aes_gcmsiv_ctr_inc_avx512+128]
+        vpaddd zmm7, zmm7, ZMMWORD PTR [L_aes_gcmsiv_ctr_inc_avx512+192]
+L_AES_GCMSIV_ctr_avx512_enc_256:
+        ; 256 bytes of input
+        lea r11, QWORD PTR [rcx+rax]
+        lea rbx, QWORD PTR [rdx+rax]
+        vmovdqa64 zmm0, zmm4
+        vmovdqa64 zmm1, zmm5
+        vmovdqa64 zmm2, zmm6
+        vmovdqa64 zmm3, zmm7
+        vpaddd zmm4, zmm4, zmm8
+        vpaddd zmm5, zmm5, zmm8
+        vpaddd zmm6, zmm6, zmm8
+        vpaddd zmm7, zmm7, zmm8
+        ; aes_enc_block
+        vpxorq zmm0, zmm0, zmm12
+        vpxorq zmm1, zmm1, zmm12
+        vpxorq zmm2, zmm2, zmm12
+        vpxorq zmm3, zmm3, zmm12
+        vaesenc zmm0, zmm0, zmm13
+        vaesenc zmm1, zmm1, zmm13
+        vaesenc zmm2, zmm2, zmm13
+        vaesenc zmm3, zmm3, zmm13
+        vaesenc zmm0, zmm0, zmm14
+        vaesenc zmm1, zmm1, zmm14
+        vaesenc zmm2, zmm2, zmm14
+        vaesenc zmm3, zmm3, zmm14
+        vaesenc zmm0, zmm0, zmm15
+        vaesenc zmm1, zmm1, zmm15
+        vaesenc zmm2, zmm2, zmm15
+        vaesenc zmm3, zmm3, zmm15
+        vaesenc zmm0, zmm0, zmm16
+        vaesenc zmm1, zmm1, zmm16
+        vaesenc zmm2, zmm2, zmm16
+        vaesenc zmm3, zmm3, zmm16
+        vaesenc zmm0, zmm0, zmm17
+        vaesenc zmm1, zmm1, zmm17
+        vaesenc zmm2, zmm2, zmm17
+        vaesenc zmm3, zmm3, zmm17
+        vaesenc zmm0, zmm0, zmm18
+        vaesenc zmm1, zmm1, zmm18
+        vaesenc zmm2, zmm2, zmm18
+        vaesenc zmm3, zmm3, zmm18
+        vaesenc zmm0, zmm0, zmm19
+        vaesenc zmm1, zmm1, zmm19
+        vaesenc zmm2, zmm2, zmm19
+        vaesenc zmm3, zmm3, zmm19
+        vaesenc zmm0, zmm0, zmm20
+        vaesenc zmm1, zmm1, zmm20
+        vaesenc zmm2, zmm2, zmm20
+        vaesenc zmm3, zmm3, zmm20
+        vaesenc zmm0, zmm0, zmm21
+        vaesenc zmm1, zmm1, zmm21
+        vaesenc zmm2, zmm2, zmm21
+        vaesenc zmm3, zmm3, zmm21
+        ; The vmovdqa64 between cmp and jl leaves the flags untouched.
+        cmp r12d, 11
+        vmovdqa64 zmm11, zmm22
+        jl L_AES_GCMSIV_ctr_avx512_256_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm22
+        vaesenc zmm1, zmm1, zmm22
+        vaesenc zmm2, zmm2, zmm22
+        vaesenc zmm3, zmm3, zmm22
+        vaesenc zmm0, zmm0, zmm23
+        vaesenc zmm1, zmm1, zmm23
+        vaesenc zmm2, zmm2, zmm23
+        vaesenc zmm3, zmm3, zmm23
+        cmp r12d, 13
+        vmovdqa64 zmm11, zmm24
+        jl L_AES_GCMSIV_ctr_avx512_256_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm24
+        vaesenc zmm1, zmm1, zmm24
+        vaesenc zmm2, zmm2, zmm24
+        vaesenc zmm3, zmm3, zmm24
+        vaesenc zmm0, zmm0, zmm25
+        vaesenc zmm1, zmm1, zmm25
+        vaesenc zmm2, zmm2, zmm25
+        vaesenc zmm3, zmm3, zmm25
+        vmovdqa64 zmm11, zmm26
+L_AES_GCMSIV_ctr_avx512_256_aes_enc_block_last:
+        vaesenclast zmm0, zmm0, zmm11
+        vaesenclast zmm1, zmm1, zmm11
+        vaesenclast zmm2, zmm2, zmm11
+        vaesenclast zmm3, zmm3, zmm11
+        vpxorq zmm0, zmm0, [r11]
+        vpxorq zmm1, zmm1, [r11+64]
+        vpxorq zmm2, zmm2, [r11+128]
+        vpxorq zmm3, zmm3, [r11+192]
+        vmovdqu64 [rbx], zmm0
+        vmovdqu64 [rbx+64], zmm1
+        vmovdqu64 [rbx+128], zmm2
+        vmovdqu64 [rbx+192], zmm3
+        add eax, 256
+        cmp eax, r10d
+        jl L_AES_GCMSIV_ctr_avx512_enc_256
+        ; Lane 0 of zmm4 is the next unused counter; replicate it to all lanes.
+        vshufi64x2 zmm7, zmm4, zmm4, 0
+L_AES_GCMSIV_ctr_avx512_done_256:
+        mov r10d, r8d
+        and r10d, 4294967232
+        cmp eax, r10d
+        je L_AES_GCMSIV_ctr_avx512_done_64
+L_AES_GCMSIV_ctr_avx512_enc_64:
+        ; 64 bytes of input
+        ; siv_ctr_enc_64
+        lea r11, QWORD PTR [rcx+rax]
+        lea rbx, QWORD PTR [rdx+rax]
+        vpaddd zmm0, zmm7, ZMMWORD PTR [L_aes_gcmsiv_ctr_inc_avx512]
+        vpaddd zmm7, zmm7, zmm9
+        ; aes_enc_block
+        vpxorq zmm0, zmm0, zmm12
+        vaesenc zmm0, zmm0, zmm13
+        vaesenc zmm0, zmm0, zmm14
+        vaesenc zmm0, zmm0, zmm15
+        vaesenc zmm0, zmm0, zmm16
+        vaesenc zmm0, zmm0, zmm17
+        vaesenc zmm0, zmm0, zmm18
+        vaesenc zmm0, zmm0, zmm19
+        vaesenc zmm0, zmm0, zmm20
+        vaesenc zmm0, zmm0, zmm21
+        cmp r12d, 11
+        vmovdqa64 zmm11, zmm22
+        jl L_AES_GCMSIV_ctr_avx512_64_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm22
+        vaesenc zmm0, zmm0, zmm23
+        cmp r12d, 13
+        vmovdqa64 zmm11, zmm24
+        jl L_AES_GCMSIV_ctr_avx512_64_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm24
+        vaesenc zmm0, zmm0, zmm25
+        vmovdqa64 zmm11, zmm26
+L_AES_GCMSIV_ctr_avx512_64_aes_enc_block_last:
+        vaesenclast zmm0, zmm0, zmm11
+        vpxorq zmm0, zmm0, [r11]
+        vmovdqu64 [rbx], zmm0
+        add eax, 64
+        cmp eax, r10d
+        jl L_AES_GCMSIV_ctr_avx512_enc_64
+L_AES_GCMSIV_ctr_avx512_done_64:
+        cmp eax, r8d
+        mov r10d, r8d
+        je L_AES_GCMSIV_ctr_avx512_done_enc
+        and r10d, 4294967280
+L_AES_GCMSIV_ctr_avx512_enc_16:
+        ; 16 bytes of input
+        vmovdqa xmm0, xmm7
+        vpaddd zmm7, zmm7, zmm10
+        ; aes_enc_block
+        ; Round keys are read from memory here: the cached zmm copies are only
+        ; loaded when the length is at least 64.
+        vpxor xmm0, xmm0, [r9]
+        vmovdqu xmm5, OWORD PTR [r9+16]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+32]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+48]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+64]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+80]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+96]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+112]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+128]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+144]
+        vaesenc xmm0, xmm0, xmm5
+        cmp r12d, 11
+        vmovdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_GCMSIV_ctr_avx512_16_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+176]
+        vaesenc xmm0, xmm0, xmm6
+        cmp r12d, 13
+        vmovdqu xmm5, OWORD PTR [r9+192]
+        jl L_AES_GCMSIV_ctr_avx512_16_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_GCMSIV_ctr_avx512_16_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        lea r11, QWORD PTR [rcx+rax]
+        vpxor xmm0, xmm0, [r11]
+        lea r11, QWORD PTR [rdx+rax]
+        vmovdqu OWORD PTR [r11], xmm0
+        add eax, 16
+        cmp eax, r10d
+        jl L_AES_GCMSIV_ctr_avx512_enc_16
+L_AES_GCMSIV_ctr_avx512_done_enc:
+        ; Store the next counter back before xmm7 is restored for the caller.
+        vmovdqu OWORD PTR [r13], xmm7
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        vmovdqu xmm14, OWORD PTR [rsp+128]
+        vmovdqu xmm15, OWORD PTR [rsp+144]
+        add rsp, 160
+        pop r13
+        pop r12
+        pop rbx
+        ret
+AES_GCMSIV_ctr_avx512 ENDP
+_TEXT ENDS
+ENDIF
 ENDIF END diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c index 8963abb49a..f079a2bbaf 100644 --- a/wolfcrypt/src/cpuid.c +++ b/wolfcrypt/src/cpuid.c @@ -53,6 +53,8 @@ new_cpuid_flags |= CPUID_ADX; new_cpuid_flags |= CPUID_MOVBE; new_cpuid_flags |= CPUID_BMI1; +
new_cpuid_flags |= CPUID_VAES; + new_cpuid_flags |= CPUID_AVX512; (void)wolfSSL_Atomic_Uint_CompareExchange (&cpuid_flags, &old_cpuid_flags, new_cpuid_flags); diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S index 2112845ce0..0c0cc8cc5d 100644 --- a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S @@ -7818,6 +7818,325 @@ L_aes_xts_decrypt_arm32_crypto_done: .size AES_XTS_decrypt_AARCH32,.-AES_XTS_decrypt_AARCH32 #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AES_XTS */
+#ifdef WOLFSSL_AESGCM_SIV
+/* AES_GCMSIV_polyval_crypto: fold blocks into a 16-byte accumulator using
+ * 64x64 carry-less multiplies (vmull.p64).
+ *
+ *   r0  accumulator, 16 bytes; read on entry and written on exit. It is
+ *       byte-reversed after loading and again before storing.
+ *   r1  16-byte multiplier, loaded into q1 (presumably the POLYVAL key H -
+ *       confirm against the caller).
+ *   r2  input data, consumed 16 bytes at a time.
+ *   r3  number of 16-byte blocks.
+ *
+ * q3, q4 and q5 are set to q1*q1, q3*q1 and q3*q3 (each reduced), so four
+ * blocks can be multiplied independently and reduced once per group.
+ * q2 stays zero throughout and supplies the zero bytes for the vext shifts.
+ */
+        .text
+        .align 4
+        .globl AES_GCMSIV_polyval_crypto
+        .type AES_GCMSIV_polyval_crypto, %function
+AES_GCMSIV_polyval_crypto:
+        vpush {d8-d15}
+        veor.8 q2, q2, q2
+        vld1.8 {q1}, [r1]
+        vld1.8 {q0}, [r0]
+        /* vrev64 plus vext #8 reverses all 16 bytes of the accumulator. */
+        vrev64.8 q0, q0
+        vext.8 q0, q0, q0, #8
+        /* q3 = q1 * q1: four partial products, the middle terms are split
+         * across the low (q7) and high (q8) halves of the 256-bit product. */
+        vmull.p64 q7, d2, d2
+        vmull.p64 q11, d2, d3
+        vmull.p64 q12, d3, d2
+        vmull.p64 q8, d3, d3
+        veor.8 q11, q11, q12
+        vext.8 q12, q2, q11, #8
+        vext.8 q11, q11, q2, #8
+        veor.8 q7, q7, q12
+        veor.8 q8, q8, q11
+        /* Shift the 256-bit product q8:q7 left by one bit; the per-word carry
+         * bits are moved up one word with vext. */
+        vshr.u32 q6, q7, #31
+        vshr.u32 q9, q8, #31
+        vshl.i32 q7, q7, #1
+        vshl.i32 q8, q8, #1
+        vext.8 q10, q6, q2, #12
+        vext.8 q9, q2, q9, #12
+        vext.8 q6, q2, q6, #12
+        veor.8 q7, q7, q6
+        veor.8 q8, q8, q9
+        veor.8 q8, q8, q10
+        /* Fold the low half into the high half (shifts by 31/30/25, then by
+         * 1/2/7); the 128-bit result is q8 ^ q7. */
+        vshl.i32 q6, q7, #31
+        vshl.i32 q9, q7, #30
+        vshl.i32 q10, q7, #25
+        veor.8 q6, q6, q9
+        veor.8 q6, q6, q10
+        vext.8 q9, q6, q2, #4
+        vext.8 q6, q2, q6, #4
+        veor.8 q7, q7, q6
+        vshr.u32 q11, q7, #1
+        vshr.u32 q12, q7, #2
+        vshr.u32 q13, q7, #7
+        veor.8 q11, q11, q12
+        veor.8 q11, q11, q13
+        veor.8 q11, q11, q9
+        veor.8 q7, q7, q11
+        veor.8 q3, q8, q7
+        /* q4 = q3 * q1, same multiply/shift/fold sequence. */
+        vmull.p64 q7, d6, d2
+        vmull.p64 q11, d6, d3
+        vmull.p64 q12, d7, d2
+        vmull.p64 q8, d7, d3
+        veor.8 q11, q11, q12
+        vext.8 q12, q2, q11, #8
+        vext.8 q11, q11, q2, #8
+        veor.8 q7, q7, q12
+        veor.8 q8, q8, q11
+        vshr.u32 q6, q7, #31
+        vshr.u32 q9, q8, #31
+        vshl.i32 q7, q7, #1
+        vshl.i32 q8, q8, #1
+        vext.8 q10, q6, q2, #12
+        vext.8 q9, q2, q9, #12
+        vext.8 q6, q2, q6, #12
+        veor.8 q7, q7, q6
+        veor.8 q8, q8, q9
+        veor.8 q8, q8, q10
+        vshl.i32 q6, q7, #31
+        vshl.i32 q9, q7, #30
+        vshl.i32 q10, q7, #25
+        veor.8 q6, q6, q9
+        veor.8 q6, q6, q10
+        vext.8 q9, q6, q2, #4
+        vext.8 q6, q2, q6, #4
+        veor.8 q7, q7, q6
+        vshr.u32 q11, q7, #1
+        vshr.u32 q12, q7, #2
+        vshr.u32 q13, q7, #7
+        veor.8 q11, q11, q12
+        veor.8 q11, q11, q13
+        veor.8 q11, q11, q9
+        veor.8 q7, q7, q11
+        veor.8 q4, q8, q7
+        /* q5 = q3 * q3, same sequence again. */
+        vmull.p64 q7, d6, d6
+        vmull.p64 q11, d6, d7
+        vmull.p64 q12, d7, d6
+        vmull.p64 q8, d7, d7
+        veor.8 q11, q11, q12
+        vext.8 q12, q2, q11, #8
+        vext.8 q11, q11, q2, #8
+        veor.8 q7, q7, q12
+        veor.8 q8, q8, q11
+        vshr.u32 q6, q7, #31
+        vshr.u32 q9, q8, #31
+        vshl.i32 q7, q7, #1
+        vshl.i32 q8, q8, #1
+        vext.8 q10, q6, q2, #12
+        vext.8 q9, q2, q9, #12
+        vext.8 q6, q2, q6, #12
+        veor.8 q7, q7, q6
+        veor.8 q8, q8, q9
+        veor.8 q8, q8, q10
+        vshl.i32 q6, q7, #31
+        vshl.i32 q9, q7, #30
+        vshl.i32 q10, q7, #25
+        veor.8 q6, q6, q9
+        veor.8 q6, q6, q10
+        vext.8 q9, q6, q2, #4
+        vext.8 q6, q2, q6, #4
+        veor.8 q7, q7, q6
+        vshr.u32 q11, q7, #1
+        vshr.u32 q12, q7, #2
+        vshr.u32 q13, q7, #7
+        veor.8 q11, q11, q12
+        veor.8 q11, q11, q13
+        veor.8 q11, q11, q9
+        veor.8 q7, q7, q11
+        veor.8 q5, q8, q7
+        /* r12 = number of complete 4-block groups. */
+        lsr r12, r3, #2
+        cmp r12, #0
+        beq L_AES_GCMSIV_polyval_crypto_rem_start
+L_AES_GCMSIV_polyval_crypto_group:
+        /* Block 0 is XORed with the accumulator and multiplied by q5; the
+         * unreduced product is collected in q10:q9. */
+        vld1.8 {q6}, [r2]!
+        veor.8 q6, q6, q0
+        vmull.p64 q9, d12, d10
+        vmull.p64 q11, d12, d11
+        vmull.p64 q12, d13, d10
+        vmull.p64 q10, d13, d11
+        veor.8 q11, q11, q12
+        vext.8 q12, q2, q11, #8
+        vext.8 q11, q11, q2, #8
+        veor.8 q9, q9, q12
+        veor.8 q10, q10, q11
+        /* Block 1 times q4, XORed into q10:q9. */
+        vld1.8 {q6}, [r2]!
+        vmull.p64 q7, d12, d8
+        vmull.p64 q11, d12, d9
+        vmull.p64 q12, d13, d8
+        vmull.p64 q8, d13, d9
+        veor.8 q11, q11, q12
+        vext.8 q12, q2, q11, #8
+        vext.8 q11, q11, q2, #8
+        veor.8 q7, q7, q12
+        veor.8 q8, q8, q11
+        veor.8 q9, q9, q7
+        veor.8 q10, q10, q8
+        /* Block 2 times q3. */
+        vld1.8 {q6}, [r2]!
+        vmull.p64 q7, d12, d6
+        vmull.p64 q11, d12, d7
+        vmull.p64 q12, d13, d6
+        vmull.p64 q8, d13, d7
+        veor.8 q11, q11, q12
+        vext.8 q12, q2, q11, #8
+        vext.8 q11, q11, q2, #8
+        veor.8 q7, q7, q12
+        veor.8 q8, q8, q11
+        veor.8 q9, q9, q7
+        veor.8 q10, q10, q8
+        /* Block 3 times q1. */
+        vld1.8 {q6}, [r2]!
+        vmull.p64 q7, d12, d2
+        vmull.p64 q11, d12, d3
+        vmull.p64 q12, d13, d2
+        vmull.p64 q8, d13, d3
+        veor.8 q11, q11, q12
+        vext.8 q12, q2, q11, #8
+        vext.8 q11, q11, q2, #8
+        veor.8 q7, q7, q12
+        veor.8 q8, q8, q11
+        veor.8 q9, q9, q7
+        veor.8 q10, q10, q8
+        /* One shift-and-fold for the whole group; the new accumulator is q0. */
+        vshr.u32 q6, q9, #31
+        vshr.u32 q7, q10, #31
+        vshl.i32 q9, q9, #1
+        vshl.i32 q10, q10, #1
+        vext.8 q8, q6, q2, #12
+        vext.8 q7, q2, q7, #12
+        vext.8 q6, q2, q6, #12
+        veor.8 q9, q9, q6
+        veor.8 q10, q10, q7
+        veor.8 q10, q10, q8
+        vshl.i32 q6, q9, #31
+        vshl.i32 q7, q9, #30
+        vshl.i32 q8, q9, #25
+        veor.8 q6, q6, q7
+        veor.8 q6, q6, q8
+        vext.8 q7, q6, q2, #4
+        vext.8 q6, q2, q6, #4
+        veor.8 q9, q9, q6
+        vshr.u32 q11, q9, #1
+        vshr.u32 q12, q9, #2
+        vshr.u32 q13, q9, #7
+        veor.8 q11, q11, q12
+        veor.8 q11, q11, q13
+        veor.8 q11, q11, q7
+        veor.8 q9, q9, q11
+        veor.8 q0, q10, q9
+        subs r12, r12, #1
+        bne L_AES_GCMSIV_polyval_crypto_group
+L_AES_GCMSIV_polyval_crypto_rem_start:
+        /* 0..3 blocks remain; they are handled one at a time. */
+        and r3, r3, #3
+        cmp r3, #0
+        beq L_AES_GCMSIV_polyval_crypto_done
+L_AES_GCMSIV_polyval_crypto_rem:
+        /* q0 = (q0 ^ block) * q1, reduced. */
+        vld1.8 {q6}, [r2]!
+        veor.8 q0, q0, q6
+        vmull.p64 q9, d0, d2
+        vmull.p64 q11, d0, d3
+        vmull.p64 q12, d1, d2
+        vmull.p64 q10, d1, d3
+        veor.8 q11, q11, q12
+        vext.8 q12, q2, q11, #8
+        vext.8 q11, q11, q2, #8
+        veor.8 q9, q9, q12
+        veor.8 q10, q10, q11
+        vshr.u32 q6, q9, #31
+        vshr.u32 q7, q10, #31
+        vshl.i32 q9, q9, #1
+        vshl.i32 q10, q10, #1
+        vext.8 q8, q6, q2, #12
+        vext.8 q7, q2, q7, #12
+        vext.8 q6, q2, q6, #12
+        veor.8 q9, q9, q6
+        veor.8 q10, q10, q7
+        veor.8 q10, q10, q8
+        vshl.i32 q6, q9, #31
+        vshl.i32 q7, q9, #30
+        vshl.i32 q8, q9, #25
+        veor.8 q6, q6, q7
+        veor.8 q6, q6, q8
+        vext.8 q7, q6, q2, #4
+        vext.8 q6, q2, q6, #4
+        veor.8 q9, q9, q6
+        vshr.u32 q11, q9, #1
+        vshr.u32 q12, q9, #2
+        vshr.u32 q13, q9, #7
+        veor.8 q11, q11, q12
+        veor.8 q11, q11, q13
+        veor.8 q11, q11, q7
+        veor.8 q9, q9, q11
+        veor.8 q0, q10, q9
+        subs r3, r3, #1
+        bne L_AES_GCMSIV_polyval_crypto_rem
+L_AES_GCMSIV_polyval_crypto_done:
+        /* Undo the byte reversal and write the accumulator back. */
+        vrev64.8 q0, q0
+        vext.8 q0, q0, q0, #8
+        vst1.8 {q0}, [r0]
+        vpop {d8-d15}
+        bx lr
+        .size AES_GCMSIV_polyval_crypto,.-AES_GCMSIV_polyval_crypto
+/* AES_GCMSIV_ctr_crypto: XOR the input with an AES counter keystream using
+ * the aese/aesmc instructions.
+ *
+ *   r0         input buffer
+ *   r1         output buffer
+ *   r2         length in bytes; only r2 >> 4 whole blocks are processed
+ *   r3         round keys, read 16 bytes at a time
+ *   [sp, #0]   first stack argument: round count
+ *   [sp, #4]   second stack argument: 16-byte counter block, updated on exit
+ *
+ * The counter is held in r7-r10 and only r7 (its first 32-bit word) is
+ * incremented; no carry goes into r8.
+ */
+        .text
+        .align 4
+        .globl AES_GCMSIV_ctr_crypto
+        .type AES_GCMSIV_ctr_crypto, %function
+AES_GCMSIV_ctr_crypto:
+        push {r4, r5, r6, r7, r8, r9, r10, lr}
+        vpush {d8-d11}
+        /* 32 + 32 bytes were pushed, so the stack arguments start at sp+64. */
+        ldr r12, [sp, #64]
+        ldr lr, [sp, #68]
+        vld1.8 {q0}, [lr]
+        lsr r5, r2, #4
+        vmov r7, r8, d0
+        vmov r9, r10, d1
+L_AES_GCMSIV_ctr_crypto_loop2:
+        /* Two blocks per iteration: q1 = counter, q2 = counter + 1. */
+        cmp r5, #2
+        blt L_AES_GCMSIV_ctr_crypto_tail
+        vmov d2, r7, r8
+        vmov d3, r9, r10
+        /* r2 is free as a scratch register once r5 holds the block count. */
+        add r2, r7, #1
+        vmov d4, r2, r8
+        vmov d5, r9, r10
+        add r7, r7, #2
+        mov r4, r3
+        sub r6, r12, #1
+L_AES_GCMSIV_ctr_crypto_rounds2:
+        /* r12 - 1 rounds of aese + aesmc. */
+        vld1.32 {q5}, [r4]!
+        aese.8 q1, q5
+        aesmc.8 q1, q1
+        aese.8 q2, q5
+        aesmc.8 q2, q2
+        subs r6, r6, #1
+        bne L_AES_GCMSIV_ctr_crypto_rounds2
+        /* Last round has no aesmc; the final round key is XORed in. */
+        vld1.32 {q5}, [r4]!
+        aese.8 q1, q5
+        aese.8 q2, q5
+        vld1.32 {q5}, [r4]
+        veor.8 q1, q1, q5
+        veor.8 q2, q2, q5
+        vld1.8 {q3-q4}, [r0]!
+        veor.8 q3, q3, q1
+        veor.8 q4, q4, q2
+        vst1.8 {q3-q4}, [r1]!
+        sub r5, r5, #2
+        b L_AES_GCMSIV_ctr_crypto_loop2
+L_AES_GCMSIV_ctr_crypto_tail:
+        /* At most one block is left here. */
+        cmp r5, #0
+        beq L_AES_GCMSIV_ctr_crypto_done
+        vmov d2, r7, r8
+        vmov d3, r9, r10
+        add r7, r7, #1
+        mov r4, r3
+        sub r6, r12, #1
+L_AES_GCMSIV_ctr_crypto_rounds1:
+        vld1.32 {q5}, [r4]!
+        aese.8 q1, q5
+        aesmc.8 q1, q1
+        subs r6, r6, #1
+        bne L_AES_GCMSIV_ctr_crypto_rounds1
+        vld1.32 {q5}, [r4]!
+        aese.8 q1, q5
+        vld1.32 {q5}, [r4]
+        veor.8 q1, q1, q5
+        vld1.8 {q3}, [r0]!
+        veor.8 q3, q3, q1
+        vst1.8 {q3}, [r1]!
+L_AES_GCMSIV_ctr_crypto_done:
+        /* Only d0 is rebuilt from r7/r8; d1 still holds the value loaded on
+         * entry, so the stored block differs from the input only in r7. */
+        vmov d0, r7, r8
+        vst1.8 {q0}, [lr]
+        vpop {d8-d11}
+        pop {r4, r5, r6, r7, r8, r9, r10, pc}
+        .size AES_GCMSIV_ctr_crypto,.-AES_GCMSIV_ctr_crypto
+#endif /* WOLFSSL_AESGCM_SIV */
 #else #ifdef HAVE_AES_DECRYPT #ifndef __APPLE__ @@ -25989,6 +26308,3127 @@ L_AES_GCM_encrypt_end: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size AES_GCM_encrypt,.-AES_GCM_encrypt #endif /* HAVE_AESGCM */ +#ifdef WOLFSSL_AESGCM_SIV +#ifndef __APPLE__ + .text + .type L_AES_GCMSIV_polyval_base_r, %object + .size L_AES_GCMSIV_polyval_base_r, 64 +#else + .section __DATA,__data +#endif /* __APPLE__ */ + # 8-byte aligned, 64-bit aligned +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_AES_GCMSIV_polyval_base_r: + .long 0x00000000,0x1c200000,0x38400000,0x24600000 + .long 0x70800000,0x6ca00000,0x48c00000,0x54e00000 + .long 0xe1000000,0xfd200000,0xd9400000,0xc5600000 + .long 0x91800000,0x8da00000,0xa9c00000,0xb5e00000 + .text + .align 4 + .globl AES_GCMSIV_polyval_base + .type AES_GCMSIV_polyval_base, %function +AES_GCMSIV_polyval_base: + push {r4, r5, r6, r7, r8, r9, r10, lr} + adr r6, L_AES_GCMSIV_polyval_base_r + cmp r3, #0 + beq L_AES_GCMSIV_polyval_base_done +L_AES_GCMSIV_polyval_base_loop: + ldr r10, [r2, #12] + rev r10, r10 + ldr r8, [r0] + eor r8, r8, r10 + str r8, [r0] + ldr r10, [r2, #8] + rev r10, r10 + ldr r8, [r0, #4] + eor r8, r8, r10 + str r8, [r0, #4] + ldr r10, [r2, #4] + rev r10, r10 + ldr r8, [r0, #8] + eor r8, r8, r10 + str
r8, [r0, #8] + ldr r10, [r2] + rev r10, r10 + ldr r8, [r0, #12] + eor r8, r8, r10 + str r8, [r0, #12] + mov r12, #0 + mov lr, #0 + mov r4, #0 + mov r5, #0 + ldr r7, [r0, #12] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #4 + lsr r8, r8, #28 +#else + ubfx r8, r7, #24, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsr r8, r7, #28 +#else + ubfx r8, r7, #28, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #12 + lsr r8, r8, #28 +#else + ubfx r8, r7, #16, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #8 + lsr r8, r8, #28 +#else + ubfx r8, r7, #20, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, 
[r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #20 + lsr r8, r8, #28 +#else + ubfx r8, r7, #8, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #16 + lsr r8, r8, #28 +#else + ubfx r8, r7, #12, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #28 + lsr r8, r8, #28 +#else + ubfx r8, r7, #0, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #24 + lsr r8, r8, #28 +#else + ubfx r8, r7, #4, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + 
ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 + ldr r7, [r0, #8] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #4 + lsr r8, r8, #28 +#else + ubfx r8, r7, #24, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsr r8, r7, #28 +#else + ubfx r8, r7, #28, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #12 + lsr r8, r8, #28 +#else + ubfx r8, r7, #16, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #8 + lsr r8, r8, #28 +#else + ubfx 
r8, r7, #20, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #20 + lsr r8, r8, #28 +#else + ubfx r8, r7, #8, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #16 + lsr r8, r8, #28 +#else + ubfx r8, r7, #12, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #28 + lsr r8, r8, #28 +#else + ubfx r8, r7, #0, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #24 + lsr r8, r8, #28 +#else + ubfx r8, r7, #4, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 + ldr r7, [r0, #4] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #4 + lsr r8, r8, #28 +#else + ubfx r8, r7, #24, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsr r8, r7, #28 +#else + ubfx r8, r7, #28, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #12 + lsr r8, r8, #28 +#else + ubfx r8, r7, #16, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr 
r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #8 + lsr r8, r8, #28 +#else + ubfx r8, r7, #20, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #20 + lsr r8, r8, #28 +#else + ubfx r8, r7, #8, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #16 + lsr r8, r8, #28 +#else + ubfx r8, r7, #12, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #28 + lsr r8, r8, #28 +#else + ubfx r8, r7, #0, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, 
r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #24 + lsr r8, r8, #28 +#else + ubfx r8, r7, #4, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 + ldr r7, [r0] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #4 + lsr r8, r8, #28 +#else + ubfx r8, r7, #24, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsr r8, r7, #28 +#else + ubfx r8, r7, #28, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #12 + lsr r8, r8, #28 +#else + ubfx r8, r7, #16, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 
+ ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #8 + lsr r8, r8, #28 +#else + ubfx r8, r7, #20, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #20 + lsr r8, r8, #28 +#else + ubfx r8, r7, #8, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #16 + lsr r8, r8, #28 +#else + ubfx r8, r7, #12, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, r12, r10 + ldr r10, [r9, #4] + eor lr, lr, r10 + ldr r10, [r9, #8] + eor r4, r4, r10 + ldr r10, [r9, #12] + eor r5, r5, r10 + and r8, r4, #15 + lsr r4, r4, #4 + orr r4, r4, r5, lsl #28 + lsr r5, r5, #4 + orr r5, r5, r12, lsl #28 + lsr r12, r12, #4 + orr r12, r12, lr, lsl #28 + lsr lr, lr, #4 + ldr r10, [r6, r8, lsl #2] + eor lr, lr, r10 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r8, r7, #28 + lsr r8, r8, #28 +#else + ubfx r8, r7, #0, #4 +#endif + add r9, r1, r8, lsl #4 + ldr r10, [r9] + eor r12, 
r12, r10
+    ldr r10, [r9, #4]
+    eor lr, lr, r10
+    ldr r10, [r9, #8]
+    eor r4, r4, r10
+    ldr r10, [r9, #12]
+    eor r5, r5, r10
+    and r8, r4, #15
+    lsr r4, r4, #4
+    orr r4, r4, r5, lsl #28
+    lsr r5, r5, #4
+    orr r5, r5, r12, lsl #28
+    lsr r12, r12, #4
+    orr r12, r12, lr, lsl #28
+    lsr lr, lr, #4
+    ldr r10, [r6, r8, lsl #2]
+    eor lr, lr, r10
+#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
+    lsl r8, r7, #24
+    lsr r8, r8, #28
+#else
+    ubfx r8, r7, #4, #4
+#endif
+    add r9, r1, r8, lsl #4
+    ldr r10, [r9]
+    eor r12, r12, r10
+    ldr r10, [r9, #4]
+    eor lr, lr, r10
+    ldr r10, [r9, #8]
+    eor r4, r4, r10
+    ldr r10, [r9, #12]
+    eor r5, r5, r10
+    rev lr, lr
+    rev r12, r12
+    rev r5, r5
+    rev r4, r4
+    str lr, [r0]
+    str r12, [r0, #4]
+    str r5, [r0, #8]
+    str r4, [r0, #12]
+    subs r3, r3, #1
+    add r2, r2, #16
+    bne L_AES_GCMSIV_polyval_base_loop
+L_AES_GCMSIV_polyval_base_done:
+    pop {r4, r5, r6, r7, r8, r9, r10, pc}
+    .size AES_GCMSIV_polyval_base,.-AES_GCMSIV_polyval_base
+/*
+ * L_AES_GCMSIV_ctr_base_te_data
+ *
+ * Lookup table of 256 32-bit words (64 rows of 4 words = 1024 bytes, which
+ * is what the .size directive below declares). It is indexed by a byte
+ * value: AES_GCMSIV_ctr_base addresses it as [table, index, lsl #2].
+ *
+ * Every word has the byte layout 0x(3s)(2s)(s)(s), where s is the low byte
+ * and 2s / 3s are s multiplied by 2 and 3 in GF(2^8). Example: entry 0 is
+ * 0xa5c66363 -> s = 0x63, 2s = 0xc6, 3s = 0xa5.
+ * The s values look like the AES S-box (entry 0x00 -> 0x63,
+ * entry 0x52 -> 0x00), i.e. this appears to be an AES encryption "Te"
+ * table - NOTE(review): confirm against the generator.
+ *
+ * The final round of AES_GCMSIV_ctr_base uses ldrb on the same entries,
+ * which picks up a single byte of the word (the plain s byte on a
+ * little-endian target - TODO confirm big-endian builds are handled).
+ *
+ * Non-Apple builds emit the table into .text and tag it as an object;
+ * Apple builds place it in __DATA,__data instead.
+ */
+#ifndef __APPLE__
+    .text
+    .type L_AES_GCMSIV_ctr_base_te_data, %object
+    .size L_AES_GCMSIV_ctr_base_te_data, 1024
+#else
+    .section __DATA,__data
+#endif /* __APPLE__ */
+    # 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+    .align 3
+#else
+    .p2align 3
+#endif /* __APPLE__ */
+L_AES_GCMSIV_ctr_base_te_data:
+    .long 0xa5c66363,0x84f87c7c,0x99ee7777,0x8df67b7b
+    .long 0x0dfff2f2,0xbdd66b6b,0xb1de6f6f,0x5491c5c5
+    .long 0x50603030,0x03020101,0xa9ce6767,0x7d562b2b
+    .long 0x19e7fefe,0x62b5d7d7,0xe64dabab,0x9aec7676
+    .long 0x458fcaca,0x9d1f8282,0x4089c9c9,0x87fa7d7d
+    .long 0x15effafa,0xebb25959,0xc98e4747,0x0bfbf0f0
+    .long 0xec41adad,0x67b3d4d4,0xfd5fa2a2,0xea45afaf
+    .long 0xbf239c9c,0xf753a4a4,0x96e47272,0x5b9bc0c0
+    .long 0xc275b7b7,0x1ce1fdfd,0xae3d9393,0x6a4c2626
+    .long 0x5a6c3636,0x417e3f3f,0x02f5f7f7,0x4f83cccc
+    .long 0x5c683434,0xf451a5a5,0x34d1e5e5,0x08f9f1f1
+    .long 0x93e27171,0x73abd8d8,0x53623131,0x3f2a1515
+    .long 0x0c080404,0x5295c7c7,0x65462323,0x5e9dc3c3
+    .long 0x28301818,0xa1379696,0x0f0a0505,0xb52f9a9a
+    .long 0x090e0707,0x36241212,0x9b1b8080,0x3ddfe2e2
+    .long 0x26cdebeb,0x694e2727,0xcd7fb2b2,0x9fea7575
+    .long 0x1b120909,0x9e1d8383,0x74582c2c,0x2e341a1a
+    .long 0x2d361b1b,0xb2dc6e6e,0xeeb45a5a,0xfb5ba0a0
+    .long 0xf6a45252,0x4d763b3b,0x61b7d6d6,0xce7db3b3
+    .long 0x7b522929,0x3edde3e3,0x715e2f2f,0x97138484
+    .long 0xf5a65353,0x68b9d1d1,0x00000000,0x2cc1eded
+    .long 0x60402020,0x1fe3fcfc,0xc879b1b1,0xedb65b5b
+    .long 0xbed46a6a,0x468dcbcb,0xd967bebe,0x4b723939
+    .long 0xde944a4a,0xd4984c4c,0xe8b05858,0x4a85cfcf
+    .long 0x6bbbd0d0,0x2ac5efef,0xe54faaaa,0x16edfbfb
+    .long 0xc5864343,0xd79a4d4d,0x55663333,0x94118585
+    .long 0xcf8a4545,0x10e9f9f9,0x06040202,0x81fe7f7f
+    .long 0xf0a05050,0x44783c3c,0xba259f9f,0xe34ba8a8
+    .long 0xf3a25151,0xfe5da3a3,0xc0804040,0x8a058f8f
+    .long 0xad3f9292,0xbc219d9d,0x48703838,0x04f1f5f5
+    .long 0xdf63bcbc,0xc177b6b6,0x75afdada,0x63422121
+    .long 0x30201010,0x1ae5ffff,0x0efdf3f3,0x6dbfd2d2
+    .long 0x4c81cdcd,0x14180c0c,0x35261313,0x2fc3ecec
+    .long 0xe1be5f5f,0xa2359797,0xcc884444,0x392e1717
+    .long 0x5793c4c4,0xf255a7a7,0x82fc7e7e,0x477a3d3d
+    .long 0xacc86464,0xe7ba5d5d,0x2b321919,0x95e67373
+    .long 0xa0c06060,0x98198181,0xd19e4f4f,0x7fa3dcdc
+    .long 0x66442222,0x7e542a2a,0xab3b9090,0x830b8888
+    .long 0xca8c4646,0x29c7eeee,0xd36bb8b8,0x3c281414
+    .long 0x79a7dede,0xe2bc5e5e,0x1d160b0b,0x76addbdb
+    .long 0x3bdbe0e0,0x56643232,0x4e743a3a,0x1e140a0a
+    .long 0xdb924949,0x0a0c0606,0x6c482424,0xe4b85c5c
+    .long 0x5d9fc2c2,0x6ebdd3d3,0xef43acac,0xa6c46262
+    .long 0xa8399191,0xa4319595,0x37d3e4e4,0x8bf27979
+    .long 0x32d5e7e7,0x438bc8c8,0x596e3737,0xb7da6d6d
+    .long 0x8c018d8d,0x64b1d5d5,0xd29c4e4e,0xe049a9a9
+    .long 0xb4d86c6c,0xfaac5656,0x07f3f4f4,0x25cfeaea
+    .long 0xafca6565,0x8ef47a7a,0xe947aeae,0x18100808
+    .long 0xd56fbaba,0x88f07878,0x6f4a2525,0x725c2e2e
+    .long 0x24381c1c,0xf157a6a6,0xc773b4b4,0x5197c6c6
+    .long 0x23cbe8e8,0x7ca1dddd,0x9ce87474,0x213e1f1f
+    .long 0xdd964b4b,0xdc61bdbd,0x860d8b8b,0x850f8a8a
+    .long 0x90e07070,0x427c3e3e,0xc471b5b5,0xaacc6666
+    .long 0xd8904848,0x05060303,0x01f7f6f6,0x121c0e0e
+    .long 0xa3c26161,0x5f6a3535,0xf9ae5757,0xd069b9b9
+    .long 0x91178686,0x5899c1c1,0x273a1d1d,0xb9279e9e
+    .long 0x38d9e1e1,0x13ebf8f8,0xb32b9898,0x33221111
+    .long 0xbbd26969,0x70a9d9d9,0x89078e8e,0xa7339494
+    .long 0xb62d9b9b,0x223c1e1e,0x92158787,0x20c9e9e9
+    .long 0x4987cece,0xffaa5555,0x78502828,0x7aa5dfdf
+    .long 0x8f038c8c,0xf859a1a1,0x80098989,0x171a0d0d
+    .long 0xda65bfbf,0x31d7e6e6,0xc6844242,0xb8d06868
+    .long 0xc3824141,0xb0299999,0x775a2d2d,0x111e0f0f
+    .long 0xcb7bb0b0,0xfca85454,0xd66dbbbb,0x3a2c1616
+/*
+ * L_AES_GCMSIV_ctr_base_te
+ *
+ * A single word holding the address of L_AES_GCMSIV_ctr_base_te_data.
+ * AES_GCMSIV_ctr_base forms the address of this word with
+ * "adr r0, L_AES_GCMSIV_ctr_base_te" and then loads the table base with
+ * "ldr r0, [r0]".
+ *
+ * NOTE(review): .size declares 12 bytes, but only one .long (4 bytes) is
+ * emitted for this symbol - confirm whether the generator intends 4 here.
+ */
+#ifndef __APPLE__
+    .text
+    .type L_AES_GCMSIV_ctr_base_te, %object
+    .size L_AES_GCMSIV_ctr_base_te, 12
+#else
+    .section __DATA,__data
+#endif /* __APPLE__ */
+    # 8-byte aligned, 64-bit aligned
+#ifndef __APPLE__
+    .align 3
+#else
+    .p2align 3
+#endif /* __APPLE__ */
+L_AES_GCMSIV_ctr_base_te:
+    .long L_AES_GCMSIV_ctr_base_te_data
+    .text
+    .align 4
+    .globl AES_GCMSIV_ctr_base
+    .type AES_GCMSIV_ctr_base, %function
+AES_GCMSIV_ctr_base:
+    push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+    ldr r12, [sp, #36]
+    ldr r8, [sp, #40]
+    mov lr, r0
+    adr r0, L_AES_GCMSIV_ctr_base_te
+    ldr r0, [r0]
+    ldm r8, {r4, r5, r6, r7}
+#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
+    eor r10, r4, r4, ror #16
+    eor r11, r5, r5, ror #16
+    bic r10, r10, #0xff0000
+    bic r11, r11, #0xff0000
+    ror r4, r4, #8
+    ror r5, r5, #8
+    eor r4, r4, r10, lsr #8
+    eor r5, r5, r11, lsr #8
+    eor r10, r6, r6, ror #16
+    eor r11, r7, r7, ror #16
+    bic r10, r10, #0xff0000
+    bic r11, r11, #0xff0000
+    ror r6, r6, #8
+    ror r7, r7, #8
+    eor r6, r6, r10, lsr #8
+    eor r7, r7, r11, lsr #8
+#else
+    rev r4, r4
+    rev r5, r5
+    rev r6, r6
+    rev r7, r7
+#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
+    stm r8, {r4, r5, r6, r7}
+    push {r3, r8}
+    cmp r12, #10
+    beq L_AES_GCMSIV_ctr_base_start_block_128
+    cmp r12, #12
+    beq 
L_AES_GCMSIV_ctr_base_start_block_192 +L_AES_GCMSIV_ctr_base_loop_block_256: + push {r1, r2, lr} + ldr lr, [sp, #16] + rev r8, r4 + add r8, r8, #1 + rev r8, r8 + mov r9, r5 + mov r10, r6 + mov r11, r7 + stm lr, {r8, r9, r10, r11} + ldm r3!, {r8, r9, r10, r11} + # Round: 0 - XOR in key schedule + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + mov r1, #6 +#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + bl AES_encrypt_block +#else +L_AES_GCMSIV_ctr_base_block_nr_256: +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r5, #8 + lsr r8, r8, #24 +#else + uxtb r8, r5, ror #16 +#endif +#else + ubfx r8, r5, #16, #8 +#endif + lsr r11, r4, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r6, #16 + lsr lr, lr, #24 +#else + uxtb lr, r6, ror #8 +#endif +#else + ubfx lr, r6, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r7, #24 + lsr r2, r2, #24 +#else + uxtb r2, r7 +#endif +#else + ubfx r2, r7, #0, #8 +#endif + ldr r8, [r0, r8, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r9, r6, #8 + lsr r9, r9, #24 +#else + uxtb r9, r6, ror #16 +#endif +#else + ubfx r9, r6, #16, #8 +#endif + eor r8, r8, r11, ror #24 + lsr r11, r5, #24 + eor r8, r8, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r7, #16 + lsr lr, lr, #24 +#else + uxtb lr, r7, ror #8 +#endif +#else + ubfx lr, r7, #8, #8 +#endif + eor r8, r8, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r4, #24 + lsr r2, r2, #24 +#else + uxtb r2, r4 
+#endif +#else + ubfx r2, r4, #0, #8 +#endif + ldr r9, [r0, r9, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r7, #8 + lsr r10, r10, #24 +#else + uxtb r10, r7, ror #16 +#endif +#else + ubfx r10, r7, #16, #8 +#endif + eor r9, r9, r11, ror #24 + lsr r11, r6, #24 + eor r9, r9, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r4, #16 + lsr lr, lr, #24 +#else + uxtb lr, r4, ror #8 +#endif +#else + ubfx lr, r4, #8, #8 +#endif + eor r9, r9, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #24 + lsr r2, r2, #24 +#else + uxtb r2, r5 +#endif +#else + ubfx r2, r5, #0, #8 +#endif + ldr r10, [r0, r10, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r6, #24 + lsr r6, r6, #24 +#else + uxtb r6, r6 +#endif +#else + ubfx r6, r6, #0, #8 +#endif + eor r10, r10, r11, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r11, r4, #8 + lsr r11, r11, #24 +#else + uxtb r11, r4, ror #16 +#endif +#else + ubfx r11, r4, #16, #8 +#endif + eor r10, r10, lr, ror #8 + lsr lr, r7, #24 + eor r10, r10, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #16 + lsr r2, r2, #24 +#else + uxtb r2, r5, ror #8 +#endif +#else + ubfx r2, r5, #8, #8 +#endif + ldr r6, [r0, r6, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r6, ror #24 + ldm r3!, {r4, r5, r6, r7} + eor r11, r11, lr, ror #24 + 
eor r11, r11, r2, ror #8 + # XOR in Key Schedule + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r9, #8 + lsr r4, r4, #24 +#else + uxtb r4, r9, ror #16 +#endif +#else + ubfx r4, r9, #16, #8 +#endif + lsr r7, r8, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r10, #16 + lsr lr, lr, #24 +#else + uxtb lr, r10, ror #8 +#endif +#else + ubfx lr, r10, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r11, #24 + lsr r2, r2, #24 +#else + uxtb r2, r11 +#endif +#else + ubfx r2, r11, #0, #8 +#endif + ldr r4, [r0, r4, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r10, #8 + lsr r5, r5, #24 +#else + uxtb r5, r10, ror #16 +#endif +#else + ubfx r5, r10, #16, #8 +#endif + eor r4, r4, r7, ror #24 + lsr r7, r9, #24 + eor r4, r4, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r11, #16 + lsr lr, lr, #24 +#else + uxtb lr, r11, ror #8 +#endif +#else + ubfx lr, r11, #8, #8 +#endif + eor r4, r4, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r8, #24 + lsr r2, r2, #24 +#else + uxtb r2, r8 +#endif +#else + ubfx r2, r8, #0, #8 +#endif + ldr r5, [r0, r5, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r11, #8 + lsr r6, r6, #24 +#else + uxtb r6, r11, ror #16 +#endif +#else + ubfx r6, 
r11, #16, #8 +#endif + eor r5, r5, r7, ror #24 + lsr r7, r10, #24 + eor r5, r5, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r8, #16 + lsr lr, lr, #24 +#else + uxtb lr, r8, ror #8 +#endif +#else + ubfx lr, r8, #8, #8 +#endif + eor r5, r5, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r9, #24 + lsr r2, r2, #24 +#else + uxtb r2, r9 +#endif +#else + ubfx r2, r9, #0, #8 +#endif + ldr r6, [r0, r6, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r10, #24 + lsr r10, r10, #24 +#else + uxtb r10, r10 +#endif +#else + ubfx r10, r10, #0, #8 +#endif + eor r6, r6, r7, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r8, #8 + lsr r7, r7, #24 +#else + uxtb r7, r8, ror #16 +#endif +#else + ubfx r7, r8, #16, #8 +#endif + eor r6, r6, lr, ror #8 + lsr lr, r11, #24 + eor r6, r6, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r9, #16 + lsr r2, r2, #24 +#else + uxtb r2, r9, ror #8 +#endif +#else + ubfx r2, r9, #8, #8 +#endif + ldr r10, [r0, r10, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r10, ror #24 + ldm r3!, {r8, r9, r10, r11} + eor r7, r7, lr, ror #24 + eor r7, r7, r2, ror #8 + # XOR in Key Schedule + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + subs r1, r1, #1 + bne L_AES_GCMSIV_ctr_base_block_nr_256 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r5, #8 + lsr r8, r8, #24 +#else + uxtb r8, r5, ror #16 +#endif +#else 
+ ubfx r8, r5, #16, #8 +#endif + lsr r11, r4, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r6, #16 + lsr lr, lr, #24 +#else + uxtb lr, r6, ror #8 +#endif +#else + ubfx lr, r6, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r7, #24 + lsr r2, r2, #24 +#else + uxtb r2, r7 +#endif +#else + ubfx r2, r7, #0, #8 +#endif + ldr r8, [r0, r8, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r9, r6, #8 + lsr r9, r9, #24 +#else + uxtb r9, r6, ror #16 +#endif +#else + ubfx r9, r6, #16, #8 +#endif + eor r8, r8, r11, ror #24 + lsr r11, r5, #24 + eor r8, r8, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r7, #16 + lsr lr, lr, #24 +#else + uxtb lr, r7, ror #8 +#endif +#else + ubfx lr, r7, #8, #8 +#endif + eor r8, r8, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r4, #24 + lsr r2, r2, #24 +#else + uxtb r2, r4 +#endif +#else + ubfx r2, r4, #0, #8 +#endif + ldr r9, [r0, r9, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r7, #8 + lsr r10, r10, #24 +#else + uxtb r10, r7, ror #16 +#endif +#else + ubfx r10, r7, #16, #8 +#endif + eor r9, r9, r11, ror #24 + lsr r11, r6, #24 + eor r9, r9, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r4, #16 + lsr lr, lr, #24 +#else + uxtb lr, r4, ror #8 +#endif +#else + ubfx lr, r4, #8, #8 +#endif + 
eor r9, r9, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #24 + lsr r2, r2, #24 +#else + uxtb r2, r5 +#endif +#else + ubfx r2, r5, #0, #8 +#endif + ldr r10, [r0, r10, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r6, #24 + lsr r6, r6, #24 +#else + uxtb r6, r6 +#endif +#else + ubfx r6, r6, #0, #8 +#endif + eor r10, r10, r11, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r11, r4, #8 + lsr r11, r11, #24 +#else + uxtb r11, r4, ror #16 +#endif +#else + ubfx r11, r4, #16, #8 +#endif + eor r10, r10, lr, ror #8 + lsr lr, r7, #24 + eor r10, r10, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #16 + lsr r2, r2, #24 +#else + uxtb r2, r5, ror #8 +#endif +#else + ubfx r2, r5, #8, #8 +#endif + ldr r6, [r0, r6, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r6, ror #24 + ldm r3!, {r4, r5, r6, r7} + eor r11, r11, lr, ror #24 + eor r11, r11, r2, ror #8 + # XOR in Key Schedule + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r11, #24 + lsr r4, r4, #24 +#else + uxtb r4, r11 +#endif +#else + ubfx r4, r11, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r10, #16 + lsr r7, r7, #24 +#else + uxtb r7, r10, ror #8 +#endif +#else + ubfx r7, r10, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) 
+ lsl lr, r9, #8 + lsr lr, lr, #24 +#else + uxtb lr, r9, ror #16 +#endif +#else + ubfx lr, r9, #16, #8 +#endif + lsr r2, r8, #24 + ldrb r4, [r0, r4, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r8, #24 + lsr r5, r5, #24 +#else + uxtb r5, r8 +#endif +#else + ubfx r5, r8, #0, #8 +#endif + eor r4, r4, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r11, #16 + lsr r7, r7, #24 +#else + uxtb r7, r11, ror #8 +#endif +#else + ubfx r7, r11, #8, #8 +#endif + eor r4, r4, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r10, #8 + lsr lr, lr, #24 +#else + uxtb lr, r10, ror #16 +#endif +#else + ubfx lr, r10, #16, #8 +#endif + eor r4, r4, r2, lsl #24 + lsr r2, r9, #24 + ldrb r5, [r0, r5, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r9, #24 + lsr r6, r6, #24 +#else + uxtb r6, r9 +#endif +#else + ubfx r6, r9, #0, #8 +#endif + eor r5, r5, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r8, #16 + lsr r7, r7, #24 +#else + uxtb r7, r8, ror #8 +#endif +#else + ubfx r7, r8, #8, #8 +#endif + eor r5, r5, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r11, #8 + lsr lr, lr, #24 +#else + uxtb lr, r11, ror #16 +#endif +#else + ubfx lr, r11, #16, #8 +#endif + eor r5, r5, r2, lsl #24 + lsr r2, r10, #24 + ldrb r6, [r0, r6, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] + 
lsr r11, r11, #24 + eor r6, r6, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r10, #24 + lsr r7, r7, #24 +#else + uxtb r7, r10 +#endif +#else + ubfx r7, r10, #0, #8 +#endif + eor r6, r6, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r9, #16 + lsr lr, lr, #24 +#else + uxtb lr, r9, ror #8 +#endif +#else + ubfx lr, r9, #8, #8 +#endif + eor r6, r6, r2, lsl #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r8, #8 + lsr r2, r2, #24 +#else + uxtb r2, r8, ror #16 +#endif +#else + ubfx r2, r8, #16, #8 +#endif + ldrb r11, [r0, r11, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] + eor lr, lr, r11, lsl #16 + ldm r3, {r8, r9, r10, r11} + eor r7, r7, lr, lsl #8 + eor r7, r7, r2, lsl #16 + # XOR in Key Schedule + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 +#endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + pop {r1, r2, lr} + ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, [lr, #8] + ldr r11, [lr, #12] + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldr r8, [sp, #4] + str r4, [r1] + str r5, [r1, #4] + str r6, [r1, #8] + str r7, [r1, #12] + ldm r8, {r4, r5, r6, r7} + subs r2, r2, #16 + add lr, 
lr, #16 + add r1, r1, #16 + bne L_AES_GCMSIV_ctr_base_loop_block_256 + b L_AES_GCMSIV_ctr_base_end +L_AES_GCMSIV_ctr_base_start_block_192: +L_AES_GCMSIV_ctr_base_loop_block_192: + push {r1, r2, lr} + ldr lr, [sp, #16] + rev r8, r4 + add r8, r8, #1 + rev r8, r8 + mov r9, r5 + mov r10, r6 + mov r11, r7 + stm lr, {r8, r9, r10, r11} + ldm r3!, {r8, r9, r10, r11} + # Round: 0 - XOR in key schedule + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + mov r1, #5 +#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + bl AES_encrypt_block +#else +L_AES_GCMSIV_ctr_base_block_nr_192: +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r5, #8 + lsr r8, r8, #24 +#else + uxtb r8, r5, ror #16 +#endif +#else + ubfx r8, r5, #16, #8 +#endif + lsr r11, r4, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r6, #16 + lsr lr, lr, #24 +#else + uxtb lr, r6, ror #8 +#endif +#else + ubfx lr, r6, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r7, #24 + lsr r2, r2, #24 +#else + uxtb r2, r7 +#endif +#else + ubfx r2, r7, #0, #8 +#endif + ldr r8, [r0, r8, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r9, r6, #8 + lsr r9, r9, #24 +#else + uxtb r9, r6, ror #16 +#endif +#else + ubfx r9, r6, #16, #8 +#endif + eor r8, r8, r11, ror #24 + lsr r11, r5, #24 + eor r8, r8, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r7, #16 + lsr lr, lr, #24 +#else + uxtb lr, r7, ror #8 +#endif +#else + ubfx lr, r7, #8, #8 +#endif + eor r8, r8, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r4, #24 + lsr r2, r2, #24 +#else + uxtb r2, r4 +#endif +#else + ubfx r2, r4, #0, #8 +#endif + ldr r9, [r0, r9, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r7, #8 + lsr r10, r10, #24 +#else + uxtb r10, r7, ror #16 +#endif +#else + ubfx r10, r7, #16, #8 +#endif + eor r9, r9, r11, ror #24 + lsr r11, r6, #24 + eor r9, r9, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r4, #16 + lsr lr, lr, #24 +#else + uxtb lr, r4, ror #8 +#endif +#else + ubfx lr, r4, #8, #8 +#endif + eor r9, r9, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #24 + lsr r2, r2, #24 +#else + uxtb r2, r5 +#endif +#else + ubfx r2, r5, #0, #8 +#endif + ldr r10, [r0, r10, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r6, #24 + lsr r6, r6, #24 +#else + uxtb r6, r6 +#endif +#else + ubfx r6, r6, #0, #8 +#endif + eor r10, r10, r11, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r11, r4, #8 + lsr r11, r11, #24 +#else + uxtb r11, r4, ror #16 +#endif +#else + ubfx r11, r4, #16, #8 +#endif + eor r10, r10, lr, ror #8 + lsr lr, r7, #24 + eor r10, r10, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #16 + lsr r2, r2, #24 +#else + uxtb r2, r5, ror #8 +#endif +#else + ubfx r2, r5, #8, #8 +#endif + ldr r6, [r0, r6, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r11, [r0, r11, lsl #2] + 
ldr r2, [r0, r2, lsl #2] + eor lr, lr, r6, ror #24 + ldm r3!, {r4, r5, r6, r7} + eor r11, r11, lr, ror #24 + eor r11, r11, r2, ror #8 + # XOR in Key Schedule + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r9, #8 + lsr r4, r4, #24 +#else + uxtb r4, r9, ror #16 +#endif +#else + ubfx r4, r9, #16, #8 +#endif + lsr r7, r8, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r10, #16 + lsr lr, lr, #24 +#else + uxtb lr, r10, ror #8 +#endif +#else + ubfx lr, r10, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r11, #24 + lsr r2, r2, #24 +#else + uxtb r2, r11 +#endif +#else + ubfx r2, r11, #0, #8 +#endif + ldr r4, [r0, r4, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r10, #8 + lsr r5, r5, #24 +#else + uxtb r5, r10, ror #16 +#endif +#else + ubfx r5, r10, #16, #8 +#endif + eor r4, r4, r7, ror #24 + lsr r7, r9, #24 + eor r4, r4, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r11, #16 + lsr lr, lr, #24 +#else + uxtb lr, r11, ror #8 +#endif +#else + ubfx lr, r11, #8, #8 +#endif + eor r4, r4, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r8, #24 + lsr r2, r2, #24 +#else + uxtb r2, r8 +#endif +#else + ubfx r2, r8, #0, #8 +#endif + ldr r5, [r0, r5, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 6) + lsl r6, r11, #8 + lsr r6, r6, #24 +#else + uxtb r6, r11, ror #16 +#endif +#else + ubfx r6, r11, #16, #8 +#endif + eor r5, r5, r7, ror #24 + lsr r7, r10, #24 + eor r5, r5, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r8, #16 + lsr lr, lr, #24 +#else + uxtb lr, r8, ror #8 +#endif +#else + ubfx lr, r8, #8, #8 +#endif + eor r5, r5, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r9, #24 + lsr r2, r2, #24 +#else + uxtb r2, r9 +#endif +#else + ubfx r2, r9, #0, #8 +#endif + ldr r6, [r0, r6, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r10, #24 + lsr r10, r10, #24 +#else + uxtb r10, r10 +#endif +#else + ubfx r10, r10, #0, #8 +#endif + eor r6, r6, r7, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r8, #8 + lsr r7, r7, #24 +#else + uxtb r7, r8, ror #16 +#endif +#else + ubfx r7, r8, #16, #8 +#endif + eor r6, r6, lr, ror #8 + lsr lr, r11, #24 + eor r6, r6, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r9, #16 + lsr r2, r2, #24 +#else + uxtb r2, r9, ror #8 +#endif +#else + ubfx r2, r9, #8, #8 +#endif + ldr r10, [r0, r10, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r10, ror #24 + ldm r3!, {r8, r9, r10, r11} + eor r7, r7, lr, ror #24 + eor r7, r7, r2, ror #8 + # XOR in Key Schedule + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + subs r1, r1, #1 + bne L_AES_GCMSIV_ctr_base_block_nr_192 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r5, #8 + lsr r8, r8, #24 +#else + uxtb r8, r5, ror #16 +#endif +#else + ubfx r8, r5, #16, #8 +#endif + lsr r11, r4, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r6, #16 + lsr lr, lr, #24 +#else + uxtb lr, r6, ror #8 +#endif +#else + ubfx lr, r6, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r7, #24 + lsr r2, r2, #24 +#else + uxtb r2, r7 +#endif +#else + ubfx r2, r7, #0, #8 +#endif + ldr r8, [r0, r8, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r9, r6, #8 + lsr r9, r9, #24 +#else + uxtb r9, r6, ror #16 +#endif +#else + ubfx r9, r6, #16, #8 +#endif + eor r8, r8, r11, ror #24 + lsr r11, r5, #24 + eor r8, r8, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r7, #16 + lsr lr, lr, #24 +#else + uxtb lr, r7, ror #8 +#endif +#else + ubfx lr, r7, #8, #8 +#endif + eor r8, r8, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r4, #24 + lsr r2, r2, #24 +#else + uxtb r2, r4 +#endif +#else + ubfx r2, r4, #0, #8 +#endif + ldr r9, [r0, r9, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r7, #8 + lsr r10, r10, #24 +#else + uxtb r10, r7, ror #16 +#endif +#else + ubfx r10, r7, #16, #8 +#endif + eor r9, r9, r11, ror #24 + lsr r11, r6, #24 + eor r9, r9, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 6) + lsl lr, r4, #16 + lsr lr, lr, #24 +#else + uxtb lr, r4, ror #8 +#endif +#else + ubfx lr, r4, #8, #8 +#endif + eor r9, r9, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #24 + lsr r2, r2, #24 +#else + uxtb r2, r5 +#endif +#else + ubfx r2, r5, #0, #8 +#endif + ldr r10, [r0, r10, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r6, #24 + lsr r6, r6, #24 +#else + uxtb r6, r6 +#endif +#else + ubfx r6, r6, #0, #8 +#endif + eor r10, r10, r11, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r11, r4, #8 + lsr r11, r11, #24 +#else + uxtb r11, r4, ror #16 +#endif +#else + ubfx r11, r4, #16, #8 +#endif + eor r10, r10, lr, ror #8 + lsr lr, r7, #24 + eor r10, r10, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #16 + lsr r2, r2, #24 +#else + uxtb r2, r5, ror #8 +#endif +#else + ubfx r2, r5, #8, #8 +#endif + ldr r6, [r0, r6, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r6, ror #24 + ldm r3!, {r4, r5, r6, r7} + eor r11, r11, lr, ror #24 + eor r11, r11, r2, ror #8 + # XOR in Key Schedule + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r11, #24 + lsr r4, r4, #24 +#else + uxtb r4, r11 +#endif +#else + ubfx r4, r11, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r10, #16 + lsr r7, r7, #24 +#else + uxtb r7, r10, ror #8 +#endif +#else + ubfx r7, 
r10, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r9, #8 + lsr lr, lr, #24 +#else + uxtb lr, r9, ror #16 +#endif +#else + ubfx lr, r9, #16, #8 +#endif + lsr r2, r8, #24 + ldrb r4, [r0, r4, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r8, #24 + lsr r5, r5, #24 +#else + uxtb r5, r8 +#endif +#else + ubfx r5, r8, #0, #8 +#endif + eor r4, r4, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r11, #16 + lsr r7, r7, #24 +#else + uxtb r7, r11, ror #8 +#endif +#else + ubfx r7, r11, #8, #8 +#endif + eor r4, r4, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r10, #8 + lsr lr, lr, #24 +#else + uxtb lr, r10, ror #16 +#endif +#else + ubfx lr, r10, #16, #8 +#endif + eor r4, r4, r2, lsl #24 + lsr r2, r9, #24 + ldrb r5, [r0, r5, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r9, #24 + lsr r6, r6, #24 +#else + uxtb r6, r9 +#endif +#else + ubfx r6, r9, #0, #8 +#endif + eor r5, r5, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r8, #16 + lsr r7, r7, #24 +#else + uxtb r7, r8, ror #8 +#endif +#else + ubfx r7, r8, #8, #8 +#endif + eor r5, r5, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r11, #8 + lsr lr, lr, #24 +#else + uxtb lr, r11, ror #16 +#endif +#else + ubfx lr, r11, #16, #8 +#endif + eor r5, r5, r2, lsl #24 + 
lsr r2, r10, #24 + ldrb r6, [r0, r6, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] + lsr r11, r11, #24 + eor r6, r6, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r10, #24 + lsr r7, r7, #24 +#else + uxtb r7, r10 +#endif +#else + ubfx r7, r10, #0, #8 +#endif + eor r6, r6, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r9, #16 + lsr lr, lr, #24 +#else + uxtb lr, r9, ror #8 +#endif +#else + ubfx lr, r9, #8, #8 +#endif + eor r6, r6, r2, lsl #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r8, #8 + lsr r2, r2, #24 +#else + uxtb r2, r8, ror #16 +#endif +#else + ubfx r2, r8, #16, #8 +#endif + ldrb r11, [r0, r11, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] + eor lr, lr, r11, lsl #16 + ldm r3, {r8, r9, r10, r11} + eor r7, r7, lr, lsl #8 + eor r7, r7, r2, lsl #16 + # XOR in Key Schedule + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 +#endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + pop {r1, r2, lr} + ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, [lr, #8] + ldr r11, [lr, #12] + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldr r8, [sp, 
#4] + str r4, [r1] + str r5, [r1, #4] + str r6, [r1, #8] + str r7, [r1, #12] + ldm r8, {r4, r5, r6, r7} + subs r2, r2, #16 + add lr, lr, #16 + add r1, r1, #16 + bne L_AES_GCMSIV_ctr_base_loop_block_192 + b L_AES_GCMSIV_ctr_base_end +L_AES_GCMSIV_ctr_base_start_block_128: +L_AES_GCMSIV_ctr_base_loop_block_128: + push {r1, r2, lr} + ldr lr, [sp, #16] + rev r8, r4 + add r8, r8, #1 + rev r8, r8 + mov r9, r5 + mov r10, r6 + mov r11, r7 + stm lr, {r8, r9, r10, r11} + ldm r3!, {r8, r9, r10, r11} + # Round: 0 - XOR in key schedule + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + mov r1, #4 +#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + bl AES_encrypt_block +#else +L_AES_GCMSIV_ctr_base_block_nr_128: +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r5, #8 + lsr r8, r8, #24 +#else + uxtb r8, r5, ror #16 +#endif +#else + ubfx r8, r5, #16, #8 +#endif + lsr r11, r4, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r6, #16 + lsr lr, lr, #24 +#else + uxtb lr, r6, ror #8 +#endif +#else + ubfx lr, r6, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r7, #24 + lsr r2, r2, #24 +#else + uxtb r2, r7 +#endif +#else + ubfx r2, r7, #0, #8 +#endif + ldr r8, [r0, r8, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r9, r6, #8 + lsr r9, r9, #24 +#else + uxtb r9, r6, ror #16 +#endif +#else + ubfx r9, r6, #16, #8 +#endif + eor r8, r8, r11, ror #24 + lsr r11, r5, #24 + eor r8, r8, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r7, #16 + lsr lr, lr, #24 +#else + uxtb lr, r7, ror #8 +#endif 
+#else + ubfx lr, r7, #8, #8 +#endif + eor r8, r8, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r4, #24 + lsr r2, r2, #24 +#else + uxtb r2, r4 +#endif +#else + ubfx r2, r4, #0, #8 +#endif + ldr r9, [r0, r9, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r7, #8 + lsr r10, r10, #24 +#else + uxtb r10, r7, ror #16 +#endif +#else + ubfx r10, r7, #16, #8 +#endif + eor r9, r9, r11, ror #24 + lsr r11, r6, #24 + eor r9, r9, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r4, #16 + lsr lr, lr, #24 +#else + uxtb lr, r4, ror #8 +#endif +#else + ubfx lr, r4, #8, #8 +#endif + eor r9, r9, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #24 + lsr r2, r2, #24 +#else + uxtb r2, r5 +#endif +#else + ubfx r2, r5, #0, #8 +#endif + ldr r10, [r0, r10, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r6, #24 + lsr r6, r6, #24 +#else + uxtb r6, r6 +#endif +#else + ubfx r6, r6, #0, #8 +#endif + eor r10, r10, r11, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r11, r4, #8 + lsr r11, r11, #24 +#else + uxtb r11, r4, ror #16 +#endif +#else + ubfx r11, r4, #16, #8 +#endif + eor r10, r10, lr, ror #8 + lsr lr, r7, #24 + eor r10, r10, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #16 + lsr r2, r2, #24 +#else + uxtb r2, r5, ror #8 
+#endif +#else + ubfx r2, r5, #8, #8 +#endif + ldr r6, [r0, r6, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r6, ror #24 + ldm r3!, {r4, r5, r6, r7} + eor r11, r11, lr, ror #24 + eor r11, r11, r2, ror #8 + # XOR in Key Schedule + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r9, #8 + lsr r4, r4, #24 +#else + uxtb r4, r9, ror #16 +#endif +#else + ubfx r4, r9, #16, #8 +#endif + lsr r7, r8, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r10, #16 + lsr lr, lr, #24 +#else + uxtb lr, r10, ror #8 +#endif +#else + ubfx lr, r10, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r11, #24 + lsr r2, r2, #24 +#else + uxtb r2, r11 +#endif +#else + ubfx r2, r11, #0, #8 +#endif + ldr r4, [r0, r4, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r10, #8 + lsr r5, r5, #24 +#else + uxtb r5, r10, ror #16 +#endif +#else + ubfx r5, r10, #16, #8 +#endif + eor r4, r4, r7, ror #24 + lsr r7, r9, #24 + eor r4, r4, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r11, #16 + lsr lr, lr, #24 +#else + uxtb lr, r11, ror #8 +#endif +#else + ubfx lr, r11, #8, #8 +#endif + eor r4, r4, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r8, #24 + lsr r2, r2, #24 +#else + uxtb r2, r8 +#endif +#else + ubfx r2, r8, #0, #8 +#endif + ldr r5, [r0, r5, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl 
#2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r11, #8 + lsr r6, r6, #24 +#else + uxtb r6, r11, ror #16 +#endif +#else + ubfx r6, r11, #16, #8 +#endif + eor r5, r5, r7, ror #24 + lsr r7, r10, #24 + eor r5, r5, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r8, #16 + lsr lr, lr, #24 +#else + uxtb lr, r8, ror #8 +#endif +#else + ubfx lr, r8, #8, #8 +#endif + eor r5, r5, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r9, #24 + lsr r2, r2, #24 +#else + uxtb r2, r9 +#endif +#else + ubfx r2, r9, #0, #8 +#endif + ldr r6, [r0, r6, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r10, #24 + lsr r10, r10, #24 +#else + uxtb r10, r10 +#endif +#else + ubfx r10, r10, #0, #8 +#endif + eor r6, r6, r7, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r8, #8 + lsr r7, r7, #24 +#else + uxtb r7, r8, ror #16 +#endif +#else + ubfx r7, r8, #16, #8 +#endif + eor r6, r6, lr, ror #8 + lsr lr, r11, #24 + eor r6, r6, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r9, #16 + lsr r2, r2, #24 +#else + uxtb r2, r9, ror #8 +#endif +#else + ubfx r2, r9, #8, #8 +#endif + ldr r10, [r0, r10, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r10, ror #24 + ldm r3!, {r8, r9, r10, r11} + eor r7, r7, lr, ror #24 + eor r7, r7, r2, ror #8 + # XOR in Key Schedule + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + subs r1, r1, 
#1 + bne L_AES_GCMSIV_ctr_base_block_nr_128 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r5, #8 + lsr r8, r8, #24 +#else + uxtb r8, r5, ror #16 +#endif +#else + ubfx r8, r5, #16, #8 +#endif + lsr r11, r4, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r6, #16 + lsr lr, lr, #24 +#else + uxtb lr, r6, ror #8 +#endif +#else + ubfx lr, r6, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r7, #24 + lsr r2, r2, #24 +#else + uxtb r2, r7 +#endif +#else + ubfx r2, r7, #0, #8 +#endif + ldr r8, [r0, r8, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r9, r6, #8 + lsr r9, r9, #24 +#else + uxtb r9, r6, ror #16 +#endif +#else + ubfx r9, r6, #16, #8 +#endif + eor r8, r8, r11, ror #24 + lsr r11, r5, #24 + eor r8, r8, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r7, #16 + lsr lr, lr, #24 +#else + uxtb lr, r7, ror #8 +#endif +#else + ubfx lr, r7, #8, #8 +#endif + eor r8, r8, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r4, #24 + lsr r2, r2, #24 +#else + uxtb r2, r4 +#endif +#else + ubfx r2, r4, #0, #8 +#endif + ldr r9, [r0, r9, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r7, #8 + lsr r10, r10, #24 +#else + uxtb r10, r7, ror #16 +#endif +#else + ubfx r10, r7, #16, #8 +#endif + eor r9, r9, r11, ror #24 + lsr r11, r6, #24 + eor r9, r9, 
lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r4, #16 + lsr lr, lr, #24 +#else + uxtb lr, r4, ror #8 +#endif +#else + ubfx lr, r4, #8, #8 +#endif + eor r9, r9, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #24 + lsr r2, r2, #24 +#else + uxtb r2, r5 +#endif +#else + ubfx r2, r5, #0, #8 +#endif + ldr r10, [r0, r10, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r6, #24 + lsr r6, r6, #24 +#else + uxtb r6, r6 +#endif +#else + ubfx r6, r6, #0, #8 +#endif + eor r10, r10, r11, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r11, r4, #8 + lsr r11, r11, #24 +#else + uxtb r11, r4, ror #16 +#endif +#else + ubfx r11, r4, #16, #8 +#endif + eor r10, r10, lr, ror #8 + lsr lr, r7, #24 + eor r10, r10, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #16 + lsr r2, r2, #24 +#else + uxtb r2, r5, ror #8 +#endif +#else + ubfx r2, r5, #8, #8 +#endif + ldr r6, [r0, r6, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r6, ror #24 + ldm r3!, {r4, r5, r6, r7} + eor r11, r11, lr, ror #24 + eor r11, r11, r2, ror #8 + # XOR in Key Schedule + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r11, #24 + lsr r4, r4, #24 +#else + uxtb r4, r11 +#endif +#else + ubfx r4, r11, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 6) + lsl r7, r10, #16 + lsr r7, r7, #24 +#else + uxtb r7, r10, ror #8 +#endif +#else + ubfx r7, r10, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r9, #8 + lsr lr, lr, #24 +#else + uxtb lr, r9, ror #16 +#endif +#else + ubfx lr, r9, #16, #8 +#endif + lsr r2, r8, #24 + ldrb r4, [r0, r4, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r8, #24 + lsr r5, r5, #24 +#else + uxtb r5, r8 +#endif +#else + ubfx r5, r8, #0, #8 +#endif + eor r4, r4, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r11, #16 + lsr r7, r7, #24 +#else + uxtb r7, r11, ror #8 +#endif +#else + ubfx r7, r11, #8, #8 +#endif + eor r4, r4, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r10, #8 + lsr lr, lr, #24 +#else + uxtb lr, r10, ror #16 +#endif +#else + ubfx lr, r10, #16, #8 +#endif + eor r4, r4, r2, lsl #24 + lsr r2, r9, #24 + ldrb r5, [r0, r5, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r9, #24 + lsr r6, r6, #24 +#else + uxtb r6, r9 +#endif +#else + ubfx r6, r9, #0, #8 +#endif + eor r5, r5, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r8, #16 + lsr r7, r7, #24 +#else + uxtb r7, r8, ror #8 +#endif +#else + ubfx r7, r8, #8, #8 +#endif + eor r5, r5, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r11, #8 + lsr 
lr, lr, #24 +#else + uxtb lr, r11, ror #16 +#endif +#else + ubfx lr, r11, #16, #8 +#endif + eor r5, r5, r2, lsl #24 + lsr r2, r10, #24 + ldrb r6, [r0, r6, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] + lsr r11, r11, #24 + eor r6, r6, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r10, #24 + lsr r7, r7, #24 +#else + uxtb r7, r10 +#endif +#else + ubfx r7, r10, #0, #8 +#endif + eor r6, r6, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r9, #16 + lsr lr, lr, #24 +#else + uxtb lr, r9, ror #8 +#endif +#else + ubfx lr, r9, #8, #8 +#endif + eor r6, r6, r2, lsl #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r8, #8 + lsr r2, r2, #24 +#else + uxtb r2, r8, ror #16 +#endif +#else + ubfx r2, r8, #16, #8 +#endif + ldrb r11, [r0, r11, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] + eor lr, lr, r11, lsl #16 + ldm r3, {r8, r9, r10, r11} + eor r7, r7, lr, lsl #8 + eor r7, r7, r2, lsl #16 + # XOR in Key Schedule + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 +#endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + pop {r1, r2, lr} + ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, 
[lr, #8] + ldr r11, [lr, #12] + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldr r8, [sp, #4] + str r4, [r1] + str r5, [r1, #4] + str r6, [r1, #8] + str r7, [r1, #12] + ldm r8, {r4, r5, r6, r7} + subs r2, r2, #16 + add lr, lr, #16 + add r1, r1, #16 + bne L_AES_GCMSIV_ctr_base_loop_block_128 +L_AES_GCMSIV_ctr_base_end: + pop {r3, r8} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r10, r4, r4, ror #16 + eor r11, r5, r5, ror #16 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + eor r4, r4, r10, lsr #8 + eor r5, r5, r11, lsr #8 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r6, r6, #8 + ror r7, r7, #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + stm r8, {r4, r5, r6, r7} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size AES_GCMSIV_ctr_base,.-AES_GCMSIV_ctr_base +#endif /* WOLFSSL_AESGCM_SIV */ #endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ #endif /* !NO_AES */ #endif /* !__aarch64__ && !WOLFSSL_ARMASM_THUMB2 */ diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c index 52adcfc5f2..dcf8a361b4 100644 --- a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c @@ -8460,6 +8460,384 @@ WC_OMIT_FRAME_POINTER void AES_XTS_decrypt_AARCH32(const byte* in, byte* out, #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AES_XTS */ +#ifdef WOLFSSL_AESGCM_SIV /* AES_GCMSIV_polyval_crypto: folds `blocks` whole 16-byte blocks read from `data` into the 16-byte accumulator at `s`, using vmull.p64 carry-less multiplies by the 16-byte value at `h`. On entry three further multipliers are derived from `h`; blocks are then consumed four per iteration, and the remaining (blocks & 3) one per iteration. `s` is byte-reversed when loaded and again when stored, so its in-memory byte order is the reverse of the order in which the data blocks are used. `h` is only read. Presumably this is the POLYVAL step of AES-GCM-SIV (RFC 8452) - the folding constants were not checked against the RFC in this review. NOTE(review): when WOLFSSL_NO_VAR_ASSIGN_REG is defined the operands are input-only ("r") although the asm post-increments `data` and modifies `blocks` - confirm this is acceptable for the supported compilers. */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +WC_OMIT_FRAME_POINTER void AES_GCMSIV_polyval_crypto(unsigned char* s_p, + const unsigned char* h_p, const unsigned char* data_p, + unsigned int blocks_p) +#else +WC_OMIT_FRAME_POINTER void AES_GCMSIV_polyval_crypto(unsigned char* s, + const unsigned char* h, const unsigned char* data, unsigned int blocks) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ 
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register unsigned char* s __asm__ ("r0") = (unsigned char*)s_p; + register const unsigned char* h __asm__ ("r1") = (const unsigned char*)h_p; + register const unsigned char* data __asm__ ("r2") = + (const unsigned char*)data_p; + register unsigned int blocks __asm__ ("r3") = (unsigned int)blocks_p; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "veor.8 q2, q2, q2\n\t" /* q2 = 0, the zero operand of every vext-based shift below */ + "vld1.8 {q1}, [%[h]]\n\t" + "vld1.8 {q0}, [%[s]]\n\t" + "vrev64.8 q0, q0\n\t" + "vext.8 q0, q0, q0, #8\n\t" /* q1 = h; q0 = s with all 16 bytes reversed */ + "vmull.p64 q7, d2, d2\n\t" + "vmull.p64 q11, d2, d3\n\t" + "vmull.p64 q12, d3, d2\n\t" + "vmull.p64 q8, d3, d3\n\t" + "veor.8 q11, q11, q12\n\t" + "vext.8 q12, q2, q11, #8\n\t" + "vext.8 q11, q11, q2, #8\n\t" + "veor.8 q7, q7, q12\n\t" + "veor.8 q8, q8, q11\n\t" /* q8:q7 = 256-bit carry-less product q1 x q1, assembled from the four 64x64 partial products */ + "vshr.u32 q6, q7, #31\n\t" + "vshr.u32 q9, q8, #31\n\t" + "vshl.i32 q7, q7, #1\n\t" + "vshl.i32 q8, q8, #1\n\t" + "vext.8 q10, q6, q2, #12\n\t" + "vext.8 q9, q2, q9, #12\n\t" + "vext.8 q6, q2, q6, #12\n\t" + "veor.8 q7, q7, q6\n\t" + "veor.8 q8, q8, q9\n\t" + "veor.8 q8, q8, q10\n\t" + "vshl.i32 q6, q7, #31\n\t" + "vshl.i32 q9, q7, #30\n\t" + "vshl.i32 q10, q7, #25\n\t" + "veor.8 q6, q6, q9\n\t" + "veor.8 q6, q6, q10\n\t" + "vext.8 q9, q6, q2, #4\n\t" + "vext.8 q6, q2, q6, #4\n\t" + "veor.8 q7, q7, q6\n\t" + "vshr.u32 q11, q7, #1\n\t" + "vshr.u32 q12, q7, #2\n\t" + "vshr.u32 q13, q7, #7\n\t" + "veor.8 q11, q11, q12\n\t" + "veor.8 q11, q11, q13\n\t" + "veor.8 q11, q11, q9\n\t" + "veor.8 q7, q7, q11\n\t" + "veor.8 q3, q8, q7\n\t" /* q3 = that product after the one-bit left shift and the folding steps above; the same multiply/shift/fold sequence is repeated twice more to form q4 from q3 x q1 and q5 from q3 x q3 */ + "vmull.p64 q7, d6, d2\n\t" + "vmull.p64 q11, d6, d3\n\t" + "vmull.p64 q12, d7, d2\n\t" + "vmull.p64 q8, d7, d3\n\t" + "veor.8 q11, q11, q12\n\t" + "vext.8 q12, q2, q11, #8\n\t" + "vext.8 q11, q11, q2, #8\n\t" + "veor.8 q7, q7, q12\n\t" + "veor.8 q8, q8, q11\n\t" + "vshr.u32 q6, q7, #31\n\t" + "vshr.u32 q9, q8, #31\n\t" + "vshl.i32 q7, q7, #1\n\t" + "vshl.i32 q8, q8, #1\n\t" + "vext.8 q10, q6, q2, #12\n\t" + "vext.8 q9, q2, q9, #12\n\t" + "vext.8 
q6, q2, q6, #12\n\t" + "veor.8 q7, q7, q6\n\t" + "veor.8 q8, q8, q9\n\t" + "veor.8 q8, q8, q10\n\t" + "vshl.i32 q6, q7, #31\n\t" + "vshl.i32 q9, q7, #30\n\t" + "vshl.i32 q10, q7, #25\n\t" + "veor.8 q6, q6, q9\n\t" + "veor.8 q6, q6, q10\n\t" + "vext.8 q9, q6, q2, #4\n\t" + "vext.8 q6, q2, q6, #4\n\t" + "veor.8 q7, q7, q6\n\t" + "vshr.u32 q11, q7, #1\n\t" + "vshr.u32 q12, q7, #2\n\t" + "vshr.u32 q13, q7, #7\n\t" + "veor.8 q11, q11, q12\n\t" + "veor.8 q11, q11, q13\n\t" + "veor.8 q11, q11, q9\n\t" + "veor.8 q7, q7, q11\n\t" + "veor.8 q4, q8, q7\n\t" + "vmull.p64 q7, d6, d6\n\t" + "vmull.p64 q11, d6, d7\n\t" + "vmull.p64 q12, d7, d6\n\t" + "vmull.p64 q8, d7, d7\n\t" + "veor.8 q11, q11, q12\n\t" + "vext.8 q12, q2, q11, #8\n\t" + "vext.8 q11, q11, q2, #8\n\t" + "veor.8 q7, q7, q12\n\t" + "veor.8 q8, q8, q11\n\t" + "vshr.u32 q6, q7, #31\n\t" + "vshr.u32 q9, q8, #31\n\t" + "vshl.i32 q7, q7, #1\n\t" + "vshl.i32 q8, q8, #1\n\t" + "vext.8 q10, q6, q2, #12\n\t" + "vext.8 q9, q2, q9, #12\n\t" + "vext.8 q6, q2, q6, #12\n\t" + "veor.8 q7, q7, q6\n\t" + "veor.8 q8, q8, q9\n\t" + "veor.8 q8, q8, q10\n\t" + "vshl.i32 q6, q7, #31\n\t" + "vshl.i32 q9, q7, #30\n\t" + "vshl.i32 q10, q7, #25\n\t" + "veor.8 q6, q6, q9\n\t" + "veor.8 q6, q6, q10\n\t" + "vext.8 q9, q6, q2, #4\n\t" + "vext.8 q6, q2, q6, #4\n\t" + "veor.8 q7, q7, q6\n\t" + "vshr.u32 q11, q7, #1\n\t" + "vshr.u32 q12, q7, #2\n\t" + "vshr.u32 q13, q7, #7\n\t" + "veor.8 q11, q11, q12\n\t" + "veor.8 q11, q11, q13\n\t" + "veor.8 q11, q11, q9\n\t" + "veor.8 q7, q7, q11\n\t" + "veor.8 q5, q8, q7\n\t" + "lsr r12, %[blocks], #2\n\t" + "cmp r12, #0\n\t" + "beq L_AES_GCMSIV_polyval_crypto_rem_start_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_crypto_group_%=:\n\t" + "vld1.8 {q6}, [%[data]]!\n\t" + "veor.8 q6, q6, q0\n\t" + "vmull.p64 q9, d12, d10\n\t" + "vmull.p64 q11, d12, d11\n\t" + "vmull.p64 q12, d13, d10\n\t" + "vmull.p64 q10, d13, d11\n\t" + "veor.8 q11, q11, q12\n\t" + "vext.8 q12, q2, q11, #8\n\t" + "vext.8 q11, q11, q2, #8\n\t" + 
/* 4-block group loop, continued: the two XORs below complete q10:q9 = (block0 ^ q0) x q5; the next three products are added to it unreduced */ "veor.8 q9, q9, q12\n\t" + "veor.8 q10, q10, q11\n\t" + "vld1.8 {q6}, [%[data]]!\n\t" /* block1 x q4 (d8, d9) */ + "vmull.p64 q7, d12, d8\n\t" + "vmull.p64 q11, d12, d9\n\t" + "vmull.p64 q12, d13, d8\n\t" + "vmull.p64 q8, d13, d9\n\t" + "veor.8 q11, q11, q12\n\t" + "vext.8 q12, q2, q11, #8\n\t" + "vext.8 q11, q11, q2, #8\n\t" + "veor.8 q7, q7, q12\n\t" + "veor.8 q8, q8, q11\n\t" + "veor.8 q9, q9, q7\n\t" + "veor.8 q10, q10, q8\n\t" + "vld1.8 {q6}, [%[data]]!\n\t" /* block2 x q3 (d6, d7) */ + "vmull.p64 q7, d12, d6\n\t" + "vmull.p64 q11, d12, d7\n\t" + "vmull.p64 q12, d13, d6\n\t" + "vmull.p64 q8, d13, d7\n\t" + "veor.8 q11, q11, q12\n\t" + "vext.8 q12, q2, q11, #8\n\t" + "vext.8 q11, q11, q2, #8\n\t" + "veor.8 q7, q7, q12\n\t" + "veor.8 q8, q8, q11\n\t" + "veor.8 q9, q9, q7\n\t" + "veor.8 q10, q10, q8\n\t" + "vld1.8 {q6}, [%[data]]!\n\t" /* block3 x q1 (d2, d3) */ + "vmull.p64 q7, d12, d2\n\t" + "vmull.p64 q11, d12, d3\n\t" + "vmull.p64 q12, d13, d2\n\t" + "vmull.p64 q8, d13, d3\n\t" + "veor.8 q11, q11, q12\n\t" + "vext.8 q12, q2, q11, #8\n\t" + "vext.8 q11, q11, q2, #8\n\t" + "veor.8 q7, q7, q12\n\t" + "veor.8 q8, q8, q11\n\t" + "veor.8 q9, q9, q7\n\t" + "veor.8 q10, q10, q8\n\t" + "vshr.u32 q6, q9, #31\n\t" + "vshr.u32 q7, q10, #31\n\t" + "vshl.i32 q9, q9, #1\n\t" + "vshl.i32 q10, q10, #1\n\t" + "vext.8 q8, q6, q2, #12\n\t" + "vext.8 q7, q2, q7, #12\n\t" + "vext.8 q6, q2, q6, #12\n\t" + "veor.8 q9, q9, q6\n\t" + "veor.8 q10, q10, q7\n\t" + "veor.8 q10, q10, q8\n\t" + "vshl.i32 q6, q9, #31\n\t" + "vshl.i32 q7, q9, #30\n\t" + "vshl.i32 q8, q9, #25\n\t" + "veor.8 q6, q6, q7\n\t" + "veor.8 q6, q6, q8\n\t" + "vext.8 q7, q6, q2, #4\n\t" + "vext.8 q6, q2, q6, #4\n\t" + "veor.8 q9, q9, q6\n\t" + "vshr.u32 q11, q9, #1\n\t" + "vshr.u32 q12, q9, #2\n\t" + "vshr.u32 q13, q9, #7\n\t" + "veor.8 q11, q11, q12\n\t" + "veor.8 q11, q11, q13\n\t" + "veor.8 q11, q11, q7\n\t" + "veor.8 q9, q9, q11\n\t" + "veor.8 q0, q10, q9\n\t" /* q0 = the four summed products after a single shift/fold pass */ + "subs r12, r12, #1\n\t" /* r12 counts the 4-block groups still to do */ + "bne L_AES_GCMSIV_polyval_crypto_group_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_crypto_rem_start_%=:\n\t" + 
"and %[blocks], %[blocks], #3\n\t" /* blocks & 3 are left over after the 4-block groups */ + "cmp %[blocks], #0\n\t" + "beq L_AES_GCMSIV_polyval_crypto_done_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_crypto_rem_%=:\n\t" + "vld1.8 {q6}, [%[data]]!\n\t" + "veor.8 q0, q0, q6\n\t" /* single-block step: (q0 ^ block) x q1, followed by the same shift/fold pass */ + "vmull.p64 q9, d0, d2\n\t" + "vmull.p64 q11, d0, d3\n\t" + "vmull.p64 q12, d1, d2\n\t" + "vmull.p64 q10, d1, d3\n\t" + "veor.8 q11, q11, q12\n\t" + "vext.8 q12, q2, q11, #8\n\t" + "vext.8 q11, q11, q2, #8\n\t" + "veor.8 q9, q9, q12\n\t" + "veor.8 q10, q10, q11\n\t" + "vshr.u32 q6, q9, #31\n\t" + "vshr.u32 q7, q10, #31\n\t" + "vshl.i32 q9, q9, #1\n\t" + "vshl.i32 q10, q10, #1\n\t" + "vext.8 q8, q6, q2, #12\n\t" + "vext.8 q7, q2, q7, #12\n\t" + "vext.8 q6, q2, q6, #12\n\t" + "veor.8 q9, q9, q6\n\t" + "veor.8 q10, q10, q7\n\t" + "veor.8 q10, q10, q8\n\t" + "vshl.i32 q6, q9, #31\n\t" + "vshl.i32 q7, q9, #30\n\t" + "vshl.i32 q8, q9, #25\n\t" + "veor.8 q6, q6, q7\n\t" + "veor.8 q6, q6, q8\n\t" + "vext.8 q7, q6, q2, #4\n\t" + "vext.8 q6, q2, q6, #4\n\t" + "veor.8 q9, q9, q6\n\t" + "vshr.u32 q11, q9, #1\n\t" + "vshr.u32 q12, q9, #2\n\t" + "vshr.u32 q13, q9, #7\n\t" + "veor.8 q11, q11, q12\n\t" + "veor.8 q11, q11, q13\n\t" + "veor.8 q11, q11, q7\n\t" + "veor.8 q9, q9, q11\n\t" + "veor.8 q0, q10, q9\n\t" + "subs %[blocks], %[blocks], #1\n\t" + "bne L_AES_GCMSIV_polyval_crypto_rem_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_crypto_done_%=:\n\t" + "vrev64.8 q0, q0\n\t" + "vext.8 q0, q0, q0, #8\n\t" /* reverse the 16 bytes again so s is written back in its stored byte order */ + "vst1.8 {q0}, [%[s]]\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [s] "+r" (s), [h] "+r" (h), [data] "+r" (data), + [blocks] "+r" (blocks) + : +#else + : + : [s] "r" (s), [h] "r" (h), [data] "r" (data), [blocks] "r" (blocks) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + : "memory", "cc", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13" + ); +} + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +WC_OMIT_FRAME_POINTER void AES_GCMSIV_ctr_crypto(const unsigned char* in_p, + unsigned char* out_p, unsigned long length_p, const unsigned char* 
KS_p, + int nr_p, unsigned char* ctr_p) +#else +WC_OMIT_FRAME_POINTER void AES_GCMSIV_ctr_crypto(const unsigned char* in, + unsigned char* out, unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ /* AES_GCMSIV_ctr_crypto: XORs (length >> 4) whole 16-byte blocks of `in` with AES-encrypted counter blocks (aese/aesmc instructions) and writes the result to `out`; two blocks are produced per iteration, followed by at most one single block. Any trailing (length & 15) bytes are not processed here. `KS` is read as nr + 1 consecutive 16-byte round keys: nr - 1 with aese+aesmc, one with aese only, and the last XORed in directly. The 16-byte counter block is loaded from `ctr`; only its first 32-bit word is incremented (plain add, no carry into the other words), and the block is stored back to `ctr` on exit with its first two words rewritten from r7 and r8. NOTE(review): r12 and lr are written by the asm but are not in the clobber list; they are tied to the `nr` and `ctr` operands only when WOLFSSL_NO_VAR_ASSIGN_REG is not defined, and in the other configuration every operand is input-only ("r") although `in`, `out` and `length` are modified - confirm that configuration is safe. */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const unsigned char* in __asm__ ("r0") = + (const unsigned char*)in_p; + register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p; + register unsigned long length __asm__ ("r2") = (unsigned long)length_p; + register const unsigned char* KS __asm__ ("r3") = + (const unsigned char*)KS_p; + register int nr __asm__ ("r12") = (int)nr_p; + register unsigned char* ctr __asm__ ("lr") = (unsigned char*)ctr_p; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "push {%[nr], %[ctr]}\n\t" + "ldr r12, [sp]\n\t" + "ldr lr, [sp, #4]\n\t" /* reload the two words just pushed: r12 = [sp], lr = [sp, #4] (nr and ctr when the operands are bound to r12 and lr) */ + "vld1.8 {q0}, [lr]\n\t" + "lsr r5, %[length], #4\n\t" /* r5 = whole 16-byte blocks still to process */ + "vmov r7, r8, d0\n\t" + "vmov r9, r10, d1\n\t" /* counter block kept as words r7..r10; r7 is the word that gets incremented */ + "\n" + "L_AES_GCMSIV_ctr_crypto_loop2_%=:\n\t" + "cmp r5, #2\n\t" + "blt L_AES_GCMSIV_ctr_crypto_tail_%=\n\t" + "vmov d2, r7, r8\n\t" + "vmov d3, r9, r10\n\t" + "add %[length], r7, #1\n\t" /* the length register is reused as scratch for counter word + 1 */ + "vmov d4, %[length], r8\n\t" + "vmov d5, r9, r10\n\t" + "add r7, r7, #2\n\t" /* q1 = counter block, q2 = counter block + 1; counter advanced by two */ + "mov r4, %[KS]\n\t" + "sub r6, r12, #1\n\t" /* r6 = nr - 1 rounds that include aesmc */ + "\n" + "L_AES_GCMSIV_ctr_crypto_rounds2_%=:\n\t" + "vld1.32 {q5}, [r4]!\n\t" + "aese.8 q1, q5\n\t" + "aesmc.8 q1, q1\n\t" + "aese.8 q2, q5\n\t" + "aesmc.8 q2, q2\n\t" + "subs r6, r6, #1\n\t" + "bne L_AES_GCMSIV_ctr_crypto_rounds2_%=\n\t" + "vld1.32 {q5}, [r4]!\n\t" + "aese.8 q1, q5\n\t" + "aese.8 q2, q5\n\t" + "vld1.32 {q5}, [r4]\n\t" + "veor.8 q1, q1, q5\n\t" + "veor.8 q2, q2, q5\n\t" /* final round key is XORed in directly */ + "vld1.8 {q3-q4}, [%[in]]!\n\t" + "veor.8 q3, q3, q1\n\t" + "veor.8 q4, q4, q2\n\t" + "vst1.8 {q3-q4}, [%[out]]!\n\t" + "sub r5, r5, #2\n\t" + "b L_AES_GCMSIV_ctr_crypto_loop2_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_crypto_tail_%=:\n\t" /* at most one whole block is left at this point */ + "cmp r5, #0\n\t" + "beq L_AES_GCMSIV_ctr_crypto_done_%=\n\t" + "vmov d2, r7, 
r8\n\t" + "vmov d3, r9, r10\n\t" + "add r7, r7, #1\n\t" + "mov r4, %[KS]\n\t" + "sub r6, r12, #1\n\t" + "\n" + "L_AES_GCMSIV_ctr_crypto_rounds1_%=:\n\t" + "vld1.32 {q5}, [r4]!\n\t" + "aese.8 q1, q5\n\t" + "aesmc.8 q1, q1\n\t" + "subs r6, r6, #1\n\t" + "bne L_AES_GCMSIV_ctr_crypto_rounds1_%=\n\t" + "vld1.32 {q5}, [r4]!\n\t" + "aese.8 q1, q5\n\t" + "vld1.32 {q5}, [r4]\n\t" + "veor.8 q1, q1, q5\n\t" + "vld1.8 {q3}, [%[in]]!\n\t" + "veor.8 q3, q3, q1\n\t" + "vst1.8 {q3}, [%[out]]!\n\t" + "\n" + "L_AES_GCMSIV_ctr_crypto_done_%=:\n\t" + "vmov d0, r7, r8\n\t" + "vst1.8 {q0}, [lr]\n\t" + "pop {%[nr], %[ctr]}\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [length] "+r" (length), + [KS] "+r" (KS), [nr] "+r" (nr), [ctr] "+r" (ctr) + : +#else + : + : [in] "r" (in), [out] "r" (out), [length] "r" (length), [KS] "r" (KS), + [nr] "r" (nr), [ctr] "r" (ctr) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "q0", "q1", + "q2", "q3", "q4", "q5" + ); +} + +#endif /* WOLFSSL_AESGCM_SIV */ #else #ifdef HAVE_AES_DECRYPT XALIGNED(8) static const word32 L_AES_ARM32_td_data[] = { @@ -26895,6 +27273,3175 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in, } #endif /* HAVE_AESGCM */ +#ifdef WOLFSSL_AESGCM_SIV +XALIGNED(8) static const word32 L_AES_GCMSIV_polyval_base_r[] = { + 0x00000000, 0x1c200000, 0x38400000, 0x24600000, + 0x70800000, 0x6ca00000, 0x48c00000, 0x54e00000, + 0xe1000000, 0xfd200000, 0xd9400000, 0xc5600000, + 0x91800000, 0x8da00000, 0xa9c00000, 0xb5e00000, +}; + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +WC_OMIT_FRAME_POINTER void AES_GCMSIV_polyval_base(unsigned char* s_p, + const unsigned char* m_p, const unsigned char* data_p, + unsigned int blocks_p) +#else +WC_OMIT_FRAME_POINTER void AES_GCMSIV_polyval_base(unsigned char* s, + const unsigned char* m, const unsigned char* data, unsigned int blocks) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + 
register unsigned char* s __asm__ ("r0") = (unsigned char*)s_p; + register const unsigned char* m __asm__ ("r1") = (const unsigned char*)m_p; + register const unsigned char* data __asm__ ("r2") = + (const unsigned char*)data_p; + register unsigned int blocks __asm__ ("r3") = (unsigned int)blocks_p; + register word32* L_AES_ARM32_te_gcm_c __asm__ ("r12") = + (word32*)L_AES_ARM32_te_gcm; + register word32* L_AES_GCMSIV_polyval_base_r_c __asm__ ("lr") = + (word32*)&L_AES_GCMSIV_polyval_base_r; +#else + register word32* L_AES_ARM32_te_gcm_c = (word32*)L_AES_ARM32_te_gcm; + register word32* L_AES_GCMSIV_polyval_base_r_c = + (word32*)&L_AES_GCMSIV_polyval_base_r; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + /* Body of AES_GCMSIV_polyval_base. For each of `blocks` 16-byte blocks at `data`: XOR the block, byte-reversed, into the 16-byte state at `s`, then recompute the state with 32 table lookups, one per 4-bit nibble of the state, into `m`, which is indexed as 16 entries of 16 bytes. State words are taken from offset 12 down to offset 0; within a word the nibble order is bits 24-27, 28-31, 16-19, 20-23, 8-11, 12-15, 0-3, 4-7. After every lookup except the last, the 128-bit accumulator is shifted right by 4 bits and a word of L_AES_GCMSIV_polyval_base_r, selected by the 4 bits shifted out, is XORed into its top word. The accumulator words are then byte-reversed and stored back to `s`. Presumably this is the table-driven POLYVAL for builds without vmull.p64 - confirm against the caller. NOTE(review): r12 and lr are written by the asm but are not in the clobber list; they are tied to operands only when WOLFSSL_NO_VAR_ASSIGN_REG is not defined, and in the other configuration `data` and `blocks` are input-only ("r") although they are modified - confirm that configuration is safe. */ + __asm__ __volatile__ ( + "push {%[L_AES_ARM32_te_gcm], %[L_AES_GCMSIV_polyval_base_r]}\n\t" /* within this asm the te_gcm operand is referenced only by this push and the matching pop */ + "mov r6, %[L_AES_GCMSIV_polyval_base_r]\n\t" /* r6 = base of L_AES_GCMSIV_polyval_base_r; frees r12 and lr for use as accumulator words */ + "cmp %[blocks], #0\n\t" + "beq L_AES_GCMSIV_polyval_base_done_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_base_loop_%=:\n\t" /* s ^= block with byte and word order reversed: data word at offset 12 -> s offset 0, 8 -> 4, 4 -> 8, 0 -> 12 */ + "ldr r10, [%[data], #12]\n\t" + "rev r10, r10\n\t" + "ldr r8, [%[s]]\n\t" + "eor r8, r8, r10\n\t" + "str r8, [%[s]]\n\t" + "ldr r10, [%[data], #8]\n\t" + "rev r10, r10\n\t" + "ldr r8, [%[s], #4]\n\t" + "eor r8, r8, r10\n\t" + "str r8, [%[s], #4]\n\t" + "ldr r10, [%[data], #4]\n\t" + "rev r10, r10\n\t" + "ldr r8, [%[s], #8]\n\t" + "eor r8, r8, r10\n\t" + "str r8, [%[s], #8]\n\t" + "ldr r10, [%[data]]\n\t" + "rev r10, r10\n\t" + "ldr r8, [%[s], #12]\n\t" + "eor r8, r8, r10\n\t" + "str r8, [%[s], #12]\n\t" + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" /* 128-bit accumulator cleared; for the 4-bit shifts lr is its top word, then r12, r5, and r4 at the bottom */ + "ldr r7, [%[s], #12]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #4\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #24, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" /* accumulator ^= 16-byte entry m[nibble]: entry words 0..3 go to r12, lr, r4, r5 */ + "and r8, r4, #15\n\t" /* r8 = the 4 bits about to be shifted out; the accumulator is then shifted right by 4 and the table word r6[r8] is XORed into lr */ + "lsr 
r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsr r8, r7, #28\n\t" +#else + "ubfx r8, r7, #28, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #12\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #16, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #8\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #20, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr 
r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #20\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #8, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #16\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #12, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #28\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #0, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, 
r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #24\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #4, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" + "ldr r7, [%[s], #8]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #4\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #24, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsr r8, r7, #28\n\t" +#else + "ubfx r8, r7, #28, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, 
r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #12\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #16, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #8\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #20, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #20\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #8, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, 
lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #16\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #12, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #28\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #0, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #24\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #4, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, 
#4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" + "ldr r7, [%[s], #4]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #4\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #24, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsr r8, r7, #28\n\t" +#else + "ubfx r8, r7, #28, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #12\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #16, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, 
lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #8\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #20, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #20\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #8, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #16\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #12, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, 
r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #28\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #0, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #24\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #4, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" + "ldr r7, [%[s]]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #4\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #24, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" 
/* nibble steps for the state word at offset 0 continue here, with bits 28-31 next; the final step (bits 4-7) is not followed by a shift */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsr r8, r7, #28\n\t" +#else + "ubfx r8, r7, #28, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #12\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #16, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #8\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #20, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) 
+ "lsl r8, r7, #20\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #8, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #16\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #12, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #28\n\t" + "lsr r8, r8, #28\n\t" +#else + "ubfx r8, r7, #0, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "and r8, r4, #15\n\t" + "lsr r4, r4, #4\n\t" + "orr r4, r4, r5, lsl #28\n\t" + "lsr r5, r5, #4\n\t" + "orr r5, r5, r12, lsl #28\n\t" + "lsr r12, r12, #4\n\t" + "orr r12, r12, lr, lsl #28\n\t" + "lsr lr, lr, #4\n\t" + "ldr r10, [r6, r8, lsl #2]\n\t" + "eor lr, lr, r10\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r8, r7, #24\n\t" + "lsr 
r8, r8, #28\n\t" +#else + "ubfx r8, r7, #4, #4\n\t" +#endif + "add r9, %[m], r8, lsl #4\n\t" + "ldr r10, [r9]\n\t" + "eor r12, r12, r10\n\t" + "ldr r10, [r9, #4]\n\t" + "eor lr, lr, r10\n\t" + "ldr r10, [r9, #8]\n\t" + "eor r4, r4, r10\n\t" + "ldr r10, [r9, #12]\n\t" + "eor r5, r5, r10\n\t" + "rev lr, lr\n\t" + "rev r12, r12\n\t" + "rev r5, r5\n\t" + "rev r4, r4\n\t" + "str lr, [%[s]]\n\t" + "str r12, [%[s], #4]\n\t" + "str r5, [%[s], #8]\n\t" + "str r4, [%[s], #12]\n\t" + "subs %[blocks], %[blocks], #1\n\t" + "add %[data], %[data], #16\n\t" + "bne L_AES_GCMSIV_polyval_base_loop_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_base_done_%=:\n\t" + "pop {%[L_AES_ARM32_te_gcm], %[L_AES_GCMSIV_polyval_base_r]}\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [s] "+r" (s), [m] "+r" (m), [data] "+r" (data), + [blocks] "+r" (blocks), + [L_AES_ARM32_te_gcm] "+r" (L_AES_ARM32_te_gcm_c), + [L_AES_GCMSIV_polyval_base_r] "+r" (L_AES_GCMSIV_polyval_base_r_c) + : +#else + : + : [s] "r" (s), [m] "r" (m), [data] "r" (data), [blocks] "r" (blocks), + [L_AES_ARM32_te_gcm] "r" (L_AES_ARM32_te_gcm_c), + [L_AES_GCMSIV_polyval_base_r] "r" (L_AES_GCMSIV_polyval_base_r_c) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); +} + +XALIGNED(8) static const word32 L_AES_GCMSIV_ctr_base_te_data[] = { + 0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b, + 0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5, + 0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b, + 0x19e7fefe, 0x62b5d7d7, 0xe64dabab, 0x9aec7676, + 0x458fcaca, 0x9d1f8282, 0x4089c9c9, 0x87fa7d7d, + 0x15effafa, 0xebb25959, 0xc98e4747, 0x0bfbf0f0, + 0xec41adad, 0x67b3d4d4, 0xfd5fa2a2, 0xea45afaf, + 0xbf239c9c, 0xf753a4a4, 0x96e47272, 0x5b9bc0c0, + 0xc275b7b7, 0x1ce1fdfd, 0xae3d9393, 0x6a4c2626, + 0x5a6c3636, 0x417e3f3f, 0x02f5f7f7, 0x4f83cccc, + 0x5c683434, 0xf451a5a5, 0x34d1e5e5, 0x08f9f1f1, + 0x93e27171, 0x73abd8d8, 0x53623131, 0x3f2a1515, + 0x0c080404, 0x5295c7c7, 0x65462323, 0x5e9dc3c3, + 0x28301818, 
0xa1379696, 0x0f0a0505, 0xb52f9a9a, + 0x090e0707, 0x36241212, 0x9b1b8080, 0x3ddfe2e2, + 0x26cdebeb, 0x694e2727, 0xcd7fb2b2, 0x9fea7575, + 0x1b120909, 0x9e1d8383, 0x74582c2c, 0x2e341a1a, + 0x2d361b1b, 0xb2dc6e6e, 0xeeb45a5a, 0xfb5ba0a0, + 0xf6a45252, 0x4d763b3b, 0x61b7d6d6, 0xce7db3b3, + 0x7b522929, 0x3edde3e3, 0x715e2f2f, 0x97138484, + 0xf5a65353, 0x68b9d1d1, 0x00000000, 0x2cc1eded, + 0x60402020, 0x1fe3fcfc, 0xc879b1b1, 0xedb65b5b, + 0xbed46a6a, 0x468dcbcb, 0xd967bebe, 0x4b723939, + 0xde944a4a, 0xd4984c4c, 0xe8b05858, 0x4a85cfcf, + 0x6bbbd0d0, 0x2ac5efef, 0xe54faaaa, 0x16edfbfb, + 0xc5864343, 0xd79a4d4d, 0x55663333, 0x94118585, + 0xcf8a4545, 0x10e9f9f9, 0x06040202, 0x81fe7f7f, + 0xf0a05050, 0x44783c3c, 0xba259f9f, 0xe34ba8a8, + 0xf3a25151, 0xfe5da3a3, 0xc0804040, 0x8a058f8f, + 0xad3f9292, 0xbc219d9d, 0x48703838, 0x04f1f5f5, + 0xdf63bcbc, 0xc177b6b6, 0x75afdada, 0x63422121, + 0x30201010, 0x1ae5ffff, 0x0efdf3f3, 0x6dbfd2d2, + 0x4c81cdcd, 0x14180c0c, 0x35261313, 0x2fc3ecec, + 0xe1be5f5f, 0xa2359797, 0xcc884444, 0x392e1717, + 0x5793c4c4, 0xf255a7a7, 0x82fc7e7e, 0x477a3d3d, + 0xacc86464, 0xe7ba5d5d, 0x2b321919, 0x95e67373, + 0xa0c06060, 0x98198181, 0xd19e4f4f, 0x7fa3dcdc, + 0x66442222, 0x7e542a2a, 0xab3b9090, 0x830b8888, + 0xca8c4646, 0x29c7eeee, 0xd36bb8b8, 0x3c281414, + 0x79a7dede, 0xe2bc5e5e, 0x1d160b0b, 0x76addbdb, + 0x3bdbe0e0, 0x56643232, 0x4e743a3a, 0x1e140a0a, + 0xdb924949, 0x0a0c0606, 0x6c482424, 0xe4b85c5c, + 0x5d9fc2c2, 0x6ebdd3d3, 0xef43acac, 0xa6c46262, + 0xa8399191, 0xa4319595, 0x37d3e4e4, 0x8bf27979, + 0x32d5e7e7, 0x438bc8c8, 0x596e3737, 0xb7da6d6d, + 0x8c018d8d, 0x64b1d5d5, 0xd29c4e4e, 0xe049a9a9, + 0xb4d86c6c, 0xfaac5656, 0x07f3f4f4, 0x25cfeaea, + 0xafca6565, 0x8ef47a7a, 0xe947aeae, 0x18100808, + 0xd56fbaba, 0x88f07878, 0x6f4a2525, 0x725c2e2e, + 0x24381c1c, 0xf157a6a6, 0xc773b4b4, 0x5197c6c6, + 0x23cbe8e8, 0x7ca1dddd, 0x9ce87474, 0x213e1f1f, + 0xdd964b4b, 0xdc61bdbd, 0x860d8b8b, 0x850f8a8a, + 0x90e07070, 0x427c3e3e, 0xc471b5b5, 0xaacc6666, + 0xd8904848, 
0x05060303, 0x01f7f6f6, 0x121c0e0e, + 0xa3c26161, 0x5f6a3535, 0xf9ae5757, 0xd069b9b9, + 0x91178686, 0x5899c1c1, 0x273a1d1d, 0xb9279e9e, + 0x38d9e1e1, 0x13ebf8f8, 0xb32b9898, 0x33221111, + 0xbbd26969, 0x70a9d9d9, 0x89078e8e, 0xa7339494, + 0xb62d9b9b, 0x223c1e1e, 0x92158787, 0x20c9e9e9, + 0x4987cece, 0xffaa5555, 0x78502828, 0x7aa5dfdf, + 0x8f038c8c, 0xf859a1a1, 0x80098989, 0x171a0d0d, + 0xda65bfbf, 0x31d7e6e6, 0xc6844242, 0xb8d06868, + 0xc3824141, 0xb0299999, 0x775a2d2d, 0x111e0f0f, + 0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616, +}; + +static const word32* L_AES_GCMSIV_ctr_base_te = L_AES_GCMSIV_ctr_base_te_data; +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +WC_OMIT_FRAME_POINTER void AES_GCMSIV_ctr_base(const unsigned char* in_p, + unsigned char* out_p, unsigned long length_p, const unsigned char* KS_p, + int nr_p, unsigned char* ctr_p) +#else +WC_OMIT_FRAME_POINTER void AES_GCMSIV_ctr_base(const unsigned char* in, + unsigned char* out, unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const unsigned char* in __asm__ ("r0") = + (const unsigned char*)in_p; + register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p; + register unsigned long length __asm__ ("r2") = (unsigned long)length_p; + register const unsigned char* KS __asm__ ("r3") = + (const unsigned char*)KS_p; + register int nr __asm__ ("r12") = (int)nr_p; + register unsigned char* ctr __asm__ ("lr") = (unsigned char*)ctr_p; + register word32* L_AES_GCMSIV_ctr_base_te_c __asm__ ("r4") = + (word32*)L_AES_GCMSIV_ctr_base_te; +#else + register word32* L_AES_GCMSIV_ctr_base_te_c = + (word32*)L_AES_GCMSIV_ctr_base_te; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "push {%[L_AES_GCMSIV_ctr_base_te]}\n\t" + "push {%[nr], %[ctr]}\n\t" + "ldr r12, [sp]\n\t" + "ldr r8, [sp, #4]\n\t" + "mov lr, %[in]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r10, r4, r4, ror #16\n\t" + "eor r11, r5, r5, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "eor r4, r4, r10, lsr #8\n\t" + "eor r5, r5, r11, lsr #8\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "stm r8, {r4, r5, r6, r7}\n\t" + "push {%[KS], r8}\n\t" + "cmp r12, #10\n\t" + "beq L_AES_GCMSIV_ctr_base_start_block_128_%=\n\t" + "cmp r12, #12\n\t" + "beq L_AES_GCMSIV_ctr_base_start_block_192_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_base_loop_block_256_%=:\n\t" + "push {r1, %[length], lr}\n\t" + "ldr lr, [sp, #16]\n\t" + "rev r8, r4\n\t" + "add r8, r8, #1\n\t" + "rev r8, r8\n\t" + "mov r9, r5\n\t" + "mov r10, r6\n\t" + "mov r11, r7\n\t" + "stm lr, {r8, r9, r10, r11}\n\t" + "ldm %[KS]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #6\n\t" +#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + "bl AES_encrypt_block\n\t" +#else + "\n" + "L_AES_GCMSIV_ctr_base_block_nr_256_%=:\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r5, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r5, ror #16\n\t" +#endif +#else + "ubfx r8, r5, #16, #8\n\t" +#endif + "lsr r11, r4, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r6, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r6, ror #8\n\t" +#endif +#else + "ubfx lr, r6, #8, #8\n\t" +#endif +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r7, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r7\n\t" +#endif +#else + "ubfx r2, r7, #0, #8\n\t" +#endif + "ldr r8, [r0, r8, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r9, r6, #8\n\t" + "lsr r9, r9, #24\n\t" +#else + "uxtb r9, r6, ror #16\n\t" +#endif +#else + "ubfx r9, r6, #16, #8\n\t" +#endif + "eor r8, r8, r11, ror #24\n\t" + "lsr r11, r5, #24\n\t" + "eor r8, r8, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r7, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r7, ror #8\n\t" +#endif +#else + "ubfx lr, r7, #8, #8\n\t" +#endif + "eor r8, r8, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r4, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r4\n\t" +#endif +#else + "ubfx r2, r4, #0, #8\n\t" +#endif + "ldr r9, [r0, r9, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r7, #8\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r7, ror #16\n\t" +#endif +#else + "ubfx r10, r7, #16, #8\n\t" +#endif + "eor r9, r9, r11, ror #24\n\t" + "lsr r11, r6, #24\n\t" + "eor r9, r9, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r4, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r4, ror #8\n\t" +#endif +#else + "ubfx lr, r4, #8, #8\n\t" +#endif + "eor r9, r9, r2, ror #16\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5\n\t" +#endif +#else + "ubfx r2, r5, #0, #8\n\t" +#endif + "ldr r10, [r0, r10, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r6, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r6\n\t" +#endif +#else + "ubfx r6, r6, #0, #8\n\t" +#endif + "eor r10, r10, r11, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r11, r4, #8\n\t" + "lsr r11, r11, #24\n\t" +#else + "uxtb r11, r4, ror #16\n\t" +#endif +#else + "ubfx r11, r4, #16, #8\n\t" +#endif + "eor r10, r10, lr, ror #8\n\t" + "lsr lr, r7, #24\n\t" + "eor r10, r10, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5, ror #8\n\t" +#endif +#else + "ubfx r2, r5, #8, #8\n\t" +#endif + "ldr r6, [r0, r6, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r6, ror #24\n\t" + "ldm %[KS]!, {r4, r5, r6, r7}\n\t" + "eor r11, r11, lr, ror #24\n\t" + "eor r11, r11, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r9, #8\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r9, ror #16\n\t" +#endif +#else + "ubfx r4, r9, #16, #8\n\t" +#endif + "lsr r7, r8, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 6) + "lsl lr, r10, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r10, ror #8\n\t" +#endif +#else + "ubfx lr, r10, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r11, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r11\n\t" +#endif +#else + "ubfx r2, r11, #0, #8\n\t" +#endif + "ldr r4, [r0, r4, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r10, #8\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r10, ror #16\n\t" +#endif +#else + "ubfx r5, r10, #16, #8\n\t" +#endif + "eor r4, r4, r7, ror #24\n\t" + "lsr r7, r9, #24\n\t" + "eor r4, r4, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r11, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r11, ror #8\n\t" +#endif +#else + "ubfx lr, r11, #8, #8\n\t" +#endif + "eor r4, r4, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r8, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r8\n\t" +#endif +#else + "ubfx r2, r8, #0, #8\n\t" +#endif + "ldr r5, [r0, r5, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r11, #8\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r11, ror #16\n\t" +#endif +#else + "ubfx r6, r11, #16, #8\n\t" +#endif + "eor r5, r5, r7, ror #24\n\t" + "lsr r7, r10, #24\n\t" + "eor r5, r5, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r8, 
#16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r8, ror #8\n\t" +#endif +#else + "ubfx lr, r8, #8, #8\n\t" +#endif + "eor r5, r5, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r9, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r9\n\t" +#endif +#else + "ubfx r2, r9, #0, #8\n\t" +#endif + "ldr r6, [r0, r6, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r10, #24\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r10\n\t" +#endif +#else + "ubfx r10, r10, #0, #8\n\t" +#endif + "eor r6, r6, r7, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r8, #8\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r8, ror #16\n\t" +#endif +#else + "ubfx r7, r8, #16, #8\n\t" +#endif + "eor r6, r6, lr, ror #8\n\t" + "lsr lr, r11, #24\n\t" + "eor r6, r6, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r9, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r9, ror #8\n\t" +#endif +#else + "ubfx r2, r9, #8, #8\n\t" +#endif + "ldr r10, [r0, r10, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r10, ror #24\n\t" + "ldm %[KS]!, {r8, r9, r10, r11}\n\t" + "eor r7, r7, lr, ror #24\n\t" + "eor r7, r7, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "subs r1, r1, #1\n\t" + "bne L_AES_GCMSIV_ctr_base_block_nr_256_%=\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r5, #8\n\t" + "lsr 
r8, r8, #24\n\t" +#else + "uxtb r8, r5, ror #16\n\t" +#endif +#else + "ubfx r8, r5, #16, #8\n\t" +#endif + "lsr r11, r4, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r6, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r6, ror #8\n\t" +#endif +#else + "ubfx lr, r6, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r7, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r7\n\t" +#endif +#else + "ubfx r2, r7, #0, #8\n\t" +#endif + "ldr r8, [r0, r8, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r9, r6, #8\n\t" + "lsr r9, r9, #24\n\t" +#else + "uxtb r9, r6, ror #16\n\t" +#endif +#else + "ubfx r9, r6, #16, #8\n\t" +#endif + "eor r8, r8, r11, ror #24\n\t" + "lsr r11, r5, #24\n\t" + "eor r8, r8, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r7, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r7, ror #8\n\t" +#endif +#else + "ubfx lr, r7, #8, #8\n\t" +#endif + "eor r8, r8, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r4, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r4\n\t" +#endif +#else + "ubfx r2, r4, #0, #8\n\t" +#endif + "ldr r9, [r0, r9, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r7, #8\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r7, ror #16\n\t" +#endif +#else + "ubfx r10, r7, #16, #8\n\t" +#endif + 
"eor r9, r9, r11, ror #24\n\t" + "lsr r11, r6, #24\n\t" + "eor r9, r9, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r4, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r4, ror #8\n\t" +#endif +#else + "ubfx lr, r4, #8, #8\n\t" +#endif + "eor r9, r9, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5\n\t" +#endif +#else + "ubfx r2, r5, #0, #8\n\t" +#endif + "ldr r10, [r0, r10, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r6, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r6\n\t" +#endif +#else + "ubfx r6, r6, #0, #8\n\t" +#endif + "eor r10, r10, r11, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r11, r4, #8\n\t" + "lsr r11, r11, #24\n\t" +#else + "uxtb r11, r4, ror #16\n\t" +#endif +#else + "ubfx r11, r4, #16, #8\n\t" +#endif + "eor r10, r10, lr, ror #8\n\t" + "lsr lr, r7, #24\n\t" + "eor r10, r10, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5, ror #8\n\t" +#endif +#else + "ubfx r2, r5, #8, #8\n\t" +#endif + "ldr r6, [r0, r6, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r6, ror #24\n\t" + "ldm %[KS]!, {r4, r5, r6, r7}\n\t" + "eor r11, r11, lr, ror #24\n\t" + "eor r11, r11, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, 
r7\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r11, #24\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r11\n\t" +#endif +#else + "ubfx r4, r11, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r10, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r10, ror #8\n\t" +#endif +#else + "ubfx r7, r10, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r9, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r9, ror #16\n\t" +#endif +#else + "ubfx lr, r9, #16, #8\n\t" +#endif + "lsr r2, r8, #24\n\t" + "ldrb r4, [r0, r4, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r8, #24\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r8\n\t" +#endif +#else + "ubfx r5, r8, #0, #8\n\t" +#endif + "eor r4, r4, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r11, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r11, ror #8\n\t" +#endif +#else + "ubfx r7, r11, #8, #8\n\t" +#endif + "eor r4, r4, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r10, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r10, ror #16\n\t" +#endif +#else + "ubfx lr, r10, #16, #8\n\t" +#endif + "eor r4, r4, r2, lsl #24\n\t" + "lsr r2, r9, #24\n\t" + "ldrb r5, [r0, r5, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 6) + "lsl r6, r9, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r9\n\t" +#endif +#else + "ubfx r6, r9, #0, #8\n\t" +#endif + "eor r5, r5, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r8, ror #8\n\t" +#endif +#else + "ubfx r7, r8, #8, #8\n\t" +#endif + "eor r5, r5, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r11, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r11, ror #16\n\t" +#endif +#else + "ubfx lr, r11, #16, #8\n\t" +#endif + "eor r5, r5, r2, lsl #24\n\t" + "lsr r2, r10, #24\n\t" + "ldrb r6, [r0, r6, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" + "lsr r11, r11, #24\n\t" + "eor r6, r6, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r10, #24\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r10\n\t" +#endif +#else + "ubfx r7, r10, #0, #8\n\t" +#endif + "eor r6, r6, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r9, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r9, ror #8\n\t" +#endif +#else + "ubfx lr, r9, #8, #8\n\t" +#endif + "eor r6, r6, r2, lsl #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r8, #8\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r8, ror #16\n\t" +#endif +#else + "ubfx r2, r8, #16, #8\n\t" +#endif + "ldrb r11, [r0, r11, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r11, lsl #16\n\t" + "ldm %[KS], {r8, r9, r10, r11}\n\t" + "eor r7, r7, lr, lsl 
#8\n\t" + "eor r7, r7, r2, lsl #16\n\t" + /* XOR in Key Schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" +#endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + "pop {r1, %[length], lr}\n\t" + "ldr %[KS], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "ldr r8, [sp, #4]\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" + "subs %[length], %[length], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_GCMSIV_ctr_base_loop_block_256_%=\n\t" + "b L_AES_GCMSIV_ctr_base_end_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_base_start_block_192_%=:\n\t" + "\n" + "L_AES_GCMSIV_ctr_base_loop_block_192_%=:\n\t" + "push {r1, %[length], lr}\n\t" + "ldr lr, [sp, #16]\n\t" + "rev r8, r4\n\t" + "add r8, r8, #1\n\t" + "rev r8, r8\n\t" + "mov r9, r5\n\t" + "mov r10, r6\n\t" + "mov r11, r7\n\t" + "stm lr, {r8, r9, r10, r11}\n\t" + "ldm %[KS]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #5\n\t" 
+#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + "bl AES_encrypt_block\n\t" +#else + "\n" + "L_AES_GCMSIV_ctr_base_block_nr_192_%=:\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r5, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r5, ror #16\n\t" +#endif +#else + "ubfx r8, r5, #16, #8\n\t" +#endif + "lsr r11, r4, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r6, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r6, ror #8\n\t" +#endif +#else + "ubfx lr, r6, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r7, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r7\n\t" +#endif +#else + "ubfx r2, r7, #0, #8\n\t" +#endif + "ldr r8, [r0, r8, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r9, r6, #8\n\t" + "lsr r9, r9, #24\n\t" +#else + "uxtb r9, r6, ror #16\n\t" +#endif +#else + "ubfx r9, r6, #16, #8\n\t" +#endif + "eor r8, r8, r11, ror #24\n\t" + "lsr r11, r5, #24\n\t" + "eor r8, r8, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r7, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r7, ror #8\n\t" +#endif +#else + "ubfx lr, r7, #8, #8\n\t" +#endif + "eor r8, r8, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r4, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r4\n\t" +#endif +#else + "ubfx r2, r4, #0, #8\n\t" +#endif + "ldr r9, [r0, r9, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, 
[r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r7, #8\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r7, ror #16\n\t" +#endif +#else + "ubfx r10, r7, #16, #8\n\t" +#endif + "eor r9, r9, r11, ror #24\n\t" + "lsr r11, r6, #24\n\t" + "eor r9, r9, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r4, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r4, ror #8\n\t" +#endif +#else + "ubfx lr, r4, #8, #8\n\t" +#endif + "eor r9, r9, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5\n\t" +#endif +#else + "ubfx r2, r5, #0, #8\n\t" +#endif + "ldr r10, [r0, r10, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r6, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r6\n\t" +#endif +#else + "ubfx r6, r6, #0, #8\n\t" +#endif + "eor r10, r10, r11, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r11, r4, #8\n\t" + "lsr r11, r11, #24\n\t" +#else + "uxtb r11, r4, ror #16\n\t" +#endif +#else + "ubfx r11, r4, #16, #8\n\t" +#endif + "eor r10, r10, lr, ror #8\n\t" + "lsr lr, r7, #24\n\t" + "eor r10, r10, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5, ror #8\n\t" +#endif +#else + "ubfx r2, r5, #8, #8\n\t" +#endif + "ldr r6, [r0, r6, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr 
r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r6, ror #24\n\t" + "ldm %[KS]!, {r4, r5, r6, r7}\n\t" + "eor r11, r11, lr, ror #24\n\t" + "eor r11, r11, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r9, #8\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r9, ror #16\n\t" +#endif +#else + "ubfx r4, r9, #16, #8\n\t" +#endif + "lsr r7, r8, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r10, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r10, ror #8\n\t" +#endif +#else + "ubfx lr, r10, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r11, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r11\n\t" +#endif +#else + "ubfx r2, r11, #0, #8\n\t" +#endif + "ldr r4, [r0, r4, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r10, #8\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r10, ror #16\n\t" +#endif +#else + "ubfx r5, r10, #16, #8\n\t" +#endif + "eor r4, r4, r7, ror #24\n\t" + "lsr r7, r9, #24\n\t" + "eor r4, r4, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r11, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r11, ror #8\n\t" +#endif +#else + "ubfx lr, r11, #8, #8\n\t" +#endif + "eor r4, r4, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r8, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r8\n\t" 
+#endif +#else + "ubfx r2, r8, #0, #8\n\t" +#endif + "ldr r5, [r0, r5, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r11, #8\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r11, ror #16\n\t" +#endif +#else + "ubfx r6, r11, #16, #8\n\t" +#endif + "eor r5, r5, r7, ror #24\n\t" + "lsr r7, r10, #24\n\t" + "eor r5, r5, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r8, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r8, ror #8\n\t" +#endif +#else + "ubfx lr, r8, #8, #8\n\t" +#endif + "eor r5, r5, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r9, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r9\n\t" +#endif +#else + "ubfx r2, r9, #0, #8\n\t" +#endif + "ldr r6, [r0, r6, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r10, #24\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r10\n\t" +#endif +#else + "ubfx r10, r10, #0, #8\n\t" +#endif + "eor r6, r6, r7, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r8, #8\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r8, ror #16\n\t" +#endif +#else + "ubfx r7, r8, #16, #8\n\t" +#endif + "eor r6, r6, lr, ror #8\n\t" + "lsr lr, r11, #24\n\t" + "eor r6, r6, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r9, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r9, ror #8\n\t" +#endif 
+#else + "ubfx r2, r9, #8, #8\n\t" +#endif + "ldr r10, [r0, r10, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r10, ror #24\n\t" + "ldm %[KS]!, {r8, r9, r10, r11}\n\t" + "eor r7, r7, lr, ror #24\n\t" + "eor r7, r7, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "subs r1, r1, #1\n\t" + "bne L_AES_GCMSIV_ctr_base_block_nr_192_%=\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r5, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r5, ror #16\n\t" +#endif +#else + "ubfx r8, r5, #16, #8\n\t" +#endif + "lsr r11, r4, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r6, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r6, ror #8\n\t" +#endif +#else + "ubfx lr, r6, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r7, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r7\n\t" +#endif +#else + "ubfx r2, r7, #0, #8\n\t" +#endif + "ldr r8, [r0, r8, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r9, r6, #8\n\t" + "lsr r9, r9, #24\n\t" +#else + "uxtb r9, r6, ror #16\n\t" +#endif +#else + "ubfx r9, r6, #16, #8\n\t" +#endif + "eor r8, r8, r11, ror #24\n\t" + "lsr r11, r5, #24\n\t" + "eor r8, r8, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r7, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r7, ror #8\n\t" +#endif +#else + "ubfx lr, r7, #8, #8\n\t" +#endif + "eor 
r8, r8, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r4, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r4\n\t" +#endif +#else + "ubfx r2, r4, #0, #8\n\t" +#endif + "ldr r9, [r0, r9, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r7, #8\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r7, ror #16\n\t" +#endif +#else + "ubfx r10, r7, #16, #8\n\t" +#endif + "eor r9, r9, r11, ror #24\n\t" + "lsr r11, r6, #24\n\t" + "eor r9, r9, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r4, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r4, ror #8\n\t" +#endif +#else + "ubfx lr, r4, #8, #8\n\t" +#endif + "eor r9, r9, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5\n\t" +#endif +#else + "ubfx r2, r5, #0, #8\n\t" +#endif + "ldr r10, [r0, r10, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r6, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r6\n\t" +#endif +#else + "ubfx r6, r6, #0, #8\n\t" +#endif + "eor r10, r10, r11, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r11, r4, #8\n\t" + "lsr r11, r11, #24\n\t" +#else + "uxtb r11, r4, ror #16\n\t" +#endif +#else + "ubfx r11, r4, #16, #8\n\t" +#endif + "eor r10, r10, lr, ror #8\n\t" + "lsr lr, r7, #24\n\t" + "eor r10, 
r10, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5, ror #8\n\t" +#endif +#else + "ubfx r2, r5, #8, #8\n\t" +#endif + "ldr r6, [r0, r6, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r6, ror #24\n\t" + "ldm %[KS]!, {r4, r5, r6, r7}\n\t" + "eor r11, r11, lr, ror #24\n\t" + "eor r11, r11, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r11, #24\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r11\n\t" +#endif +#else + "ubfx r4, r11, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r10, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r10, ror #8\n\t" +#endif +#else + "ubfx r7, r10, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r9, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r9, ror #16\n\t" +#endif +#else + "ubfx lr, r9, #16, #8\n\t" +#endif + "lsr r2, r8, #24\n\t" + "ldrb r4, [r0, r4, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r8, #24\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r8\n\t" +#endif +#else + "ubfx r5, r8, #0, #8\n\t" +#endif + "eor r4, r4, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r11, #16\n\t" + "lsr r7, r7, 
#24\n\t" +#else + "uxtb r7, r11, ror #8\n\t" +#endif +#else + "ubfx r7, r11, #8, #8\n\t" +#endif + "eor r4, r4, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r10, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r10, ror #16\n\t" +#endif +#else + "ubfx lr, r10, #16, #8\n\t" +#endif + "eor r4, r4, r2, lsl #24\n\t" + "lsr r2, r9, #24\n\t" + "ldrb r5, [r0, r5, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r9, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r9\n\t" +#endif +#else + "ubfx r6, r9, #0, #8\n\t" +#endif + "eor r5, r5, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r8, ror #8\n\t" +#endif +#else + "ubfx r7, r8, #8, #8\n\t" +#endif + "eor r5, r5, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r11, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r11, ror #16\n\t" +#endif +#else + "ubfx lr, r11, #16, #8\n\t" +#endif + "eor r5, r5, r2, lsl #24\n\t" + "lsr r2, r10, #24\n\t" + "ldrb r6, [r0, r6, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" + "lsr r11, r11, #24\n\t" + "eor r6, r6, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r10, #24\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r10\n\t" +#endif +#else + "ubfx r7, r10, #0, #8\n\t" +#endif + "eor r6, r6, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 6) + "lsl lr, r9, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r9, ror #8\n\t" +#endif +#else + "ubfx lr, r9, #8, #8\n\t" +#endif + "eor r6, r6, r2, lsl #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r8, #8\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r8, ror #16\n\t" +#endif +#else + "ubfx r2, r8, #16, #8\n\t" +#endif + "ldrb r11, [r0, r11, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r11, lsl #16\n\t" + "ldm %[KS], {r8, r9, r10, r11}\n\t" + "eor r7, r7, lr, lsl #8\n\t" + "eor r7, r7, r2, lsl #16\n\t" + /* XOR in Key Schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" +#endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + "pop {r1, %[length], lr}\n\t" + "ldr %[KS], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "ldr r8, [sp, #4]\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" + "subs %[length], %[length], 
#16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_GCMSIV_ctr_base_loop_block_192_%=\n\t" + "b L_AES_GCMSIV_ctr_base_end_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_base_start_block_128_%=:\n\t" + "\n" + "L_AES_GCMSIV_ctr_base_loop_block_128_%=:\n\t" + "push {r1, %[length], lr}\n\t" + "ldr lr, [sp, #16]\n\t" + "rev r8, r4\n\t" + "add r8, r8, #1\n\t" + "rev r8, r8\n\t" + "mov r9, r5\n\t" + "mov r10, r6\n\t" + "mov r11, r7\n\t" + "stm lr, {r8, r9, r10, r11}\n\t" + "ldm %[KS]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #4\n\t" +#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + "bl AES_encrypt_block\n\t" +#else + "\n" + "L_AES_GCMSIV_ctr_base_block_nr_128_%=:\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r5, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r5, ror #16\n\t" +#endif +#else + "ubfx r8, r5, #16, #8\n\t" +#endif + "lsr r11, r4, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r6, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r6, ror #8\n\t" +#endif +#else + "ubfx lr, r6, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r7, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r7\n\t" +#endif +#else + "ubfx r2, r7, #0, #8\n\t" +#endif + "ldr r8, [r0, r8, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r9, r6, #8\n\t" + "lsr r9, r9, #24\n\t" +#else + "uxtb r9, r6, ror #16\n\t" +#endif +#else + "ubfx r9, r6, #16, #8\n\t" +#endif + "eor r8, r8, r11, ror #24\n\t" + "lsr r11, r5, 
#24\n\t" + "eor r8, r8, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r7, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r7, ror #8\n\t" +#endif +#else + "ubfx lr, r7, #8, #8\n\t" +#endif + "eor r8, r8, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r4, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r4\n\t" +#endif +#else + "ubfx r2, r4, #0, #8\n\t" +#endif + "ldr r9, [r0, r9, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r7, #8\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r7, ror #16\n\t" +#endif +#else + "ubfx r10, r7, #16, #8\n\t" +#endif + "eor r9, r9, r11, ror #24\n\t" + "lsr r11, r6, #24\n\t" + "eor r9, r9, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r4, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r4, ror #8\n\t" +#endif +#else + "ubfx lr, r4, #8, #8\n\t" +#endif + "eor r9, r9, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5\n\t" +#endif +#else + "ubfx r2, r5, #0, #8\n\t" +#endif + "ldr r10, [r0, r10, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r6, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r6\n\t" +#endif +#else + "ubfx r6, r6, #0, #8\n\t" +#endif + "eor r10, r10, r11, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) 
&& (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r11, r4, #8\n\t" + "lsr r11, r11, #24\n\t" +#else + "uxtb r11, r4, ror #16\n\t" +#endif +#else + "ubfx r11, r4, #16, #8\n\t" +#endif + "eor r10, r10, lr, ror #8\n\t" + "lsr lr, r7, #24\n\t" + "eor r10, r10, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5, ror #8\n\t" +#endif +#else + "ubfx r2, r5, #8, #8\n\t" +#endif + "ldr r6, [r0, r6, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r6, ror #24\n\t" + "ldm %[KS]!, {r4, r5, r6, r7}\n\t" + "eor r11, r11, lr, ror #24\n\t" + "eor r11, r11, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r9, #8\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r9, ror #16\n\t" +#endif +#else + "ubfx r4, r9, #16, #8\n\t" +#endif + "lsr r7, r8, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r10, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r10, ror #8\n\t" +#endif +#else + "ubfx lr, r10, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r11, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r11\n\t" +#endif +#else + "ubfx r2, r11, #0, #8\n\t" +#endif + "ldr r4, [r0, r4, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r10, #8\n\t" + "lsr 
r5, r5, #24\n\t" +#else + "uxtb r5, r10, ror #16\n\t" +#endif +#else + "ubfx r5, r10, #16, #8\n\t" +#endif + "eor r4, r4, r7, ror #24\n\t" + "lsr r7, r9, #24\n\t" + "eor r4, r4, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r11, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r11, ror #8\n\t" +#endif +#else + "ubfx lr, r11, #8, #8\n\t" +#endif + "eor r4, r4, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r8, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r8\n\t" +#endif +#else + "ubfx r2, r8, #0, #8\n\t" +#endif + "ldr r5, [r0, r5, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r11, #8\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r11, ror #16\n\t" +#endif +#else + "ubfx r6, r11, #16, #8\n\t" +#endif + "eor r5, r5, r7, ror #24\n\t" + "lsr r7, r10, #24\n\t" + "eor r5, r5, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r8, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r8, ror #8\n\t" +#endif +#else + "ubfx lr, r8, #8, #8\n\t" +#endif + "eor r5, r5, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r9, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r9\n\t" +#endif +#else + "ubfx r2, r9, #0, #8\n\t" +#endif + "ldr r6, [r0, r6, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r10, #24\n\t" + "lsr r10, r10, 
#24\n\t" +#else + "uxtb r10, r10\n\t" +#endif +#else + "ubfx r10, r10, #0, #8\n\t" +#endif + "eor r6, r6, r7, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r8, #8\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r8, ror #16\n\t" +#endif +#else + "ubfx r7, r8, #16, #8\n\t" +#endif + "eor r6, r6, lr, ror #8\n\t" + "lsr lr, r11, #24\n\t" + "eor r6, r6, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r9, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r9, ror #8\n\t" +#endif +#else + "ubfx r2, r9, #8, #8\n\t" +#endif + "ldr r10, [r0, r10, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r7, [r0, r7, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r10, ror #24\n\t" + "ldm %[KS]!, {r8, r9, r10, r11}\n\t" + "eor r7, r7, lr, ror #24\n\t" + "eor r7, r7, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "subs r1, r1, #1\n\t" + "bne L_AES_GCMSIV_ctr_base_block_nr_128_%=\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r5, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r5, ror #16\n\t" +#endif +#else + "ubfx r8, r5, #16, #8\n\t" +#endif + "lsr r11, r4, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r6, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r6, ror #8\n\t" +#endif +#else + "ubfx lr, r6, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r7, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r7\n\t" +#endif +#else + "ubfx r2, r7, #0, #8\n\t" +#endif + "ldr r8, [r0, r8, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl 
#2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r9, r6, #8\n\t" + "lsr r9, r9, #24\n\t" +#else + "uxtb r9, r6, ror #16\n\t" +#endif +#else + "ubfx r9, r6, #16, #8\n\t" +#endif + "eor r8, r8, r11, ror #24\n\t" + "lsr r11, r5, #24\n\t" + "eor r8, r8, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r7, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r7, ror #8\n\t" +#endif +#else + "ubfx lr, r7, #8, #8\n\t" +#endif + "eor r8, r8, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r4, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r4\n\t" +#endif +#else + "ubfx r2, r4, #0, #8\n\t" +#endif + "ldr r9, [r0, r9, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r7, #8\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r7, ror #16\n\t" +#endif +#else + "ubfx r10, r7, #16, #8\n\t" +#endif + "eor r9, r9, r11, ror #24\n\t" + "lsr r11, r6, #24\n\t" + "eor r9, r9, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r4, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r4, ror #8\n\t" +#endif +#else + "ubfx lr, r4, #8, #8\n\t" +#endif + "eor r9, r9, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5\n\t" +#endif +#else + "ubfx r2, r5, #0, #8\n\t" +#endif + "ldr r10, [r0, r10, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" 
+ "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r6, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r6\n\t" +#endif +#else + "ubfx r6, r6, #0, #8\n\t" +#endif + "eor r10, r10, r11, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r11, r4, #8\n\t" + "lsr r11, r11, #24\n\t" +#else + "uxtb r11, r4, ror #16\n\t" +#endif +#else + "ubfx r11, r4, #16, #8\n\t" +#endif + "eor r10, r10, lr, ror #8\n\t" + "lsr lr, r7, #24\n\t" + "eor r10, r10, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5, ror #8\n\t" +#endif +#else + "ubfx r2, r5, #8, #8\n\t" +#endif + "ldr r6, [r0, r6, lsl #2]\n\t" + "ldr lr, [r0, lr, lsl #2]\n\t" + "ldr r11, [r0, r11, lsl #2]\n\t" + "ldr r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r6, ror #24\n\t" + "ldm %[KS]!, {r4, r5, r6, r7}\n\t" + "eor r11, r11, lr, ror #24\n\t" + "eor r11, r11, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r11, #24\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r11\n\t" +#endif +#else + "ubfx r4, r11, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r10, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r10, ror #8\n\t" +#endif +#else + "ubfx r7, r10, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r9, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb 
lr, r9, ror #16\n\t" +#endif +#else + "ubfx lr, r9, #16, #8\n\t" +#endif + "lsr r2, r8, #24\n\t" + "ldrb r4, [r0, r4, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r8, #24\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r8\n\t" +#endif +#else + "ubfx r5, r8, #0, #8\n\t" +#endif + "eor r4, r4, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r11, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r11, ror #8\n\t" +#endif +#else + "ubfx r7, r11, #8, #8\n\t" +#endif + "eor r4, r4, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r10, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r10, ror #16\n\t" +#endif +#else + "ubfx lr, r10, #16, #8\n\t" +#endif + "eor r4, r4, r2, lsl #24\n\t" + "lsr r2, r9, #24\n\t" + "ldrb r5, [r0, r5, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r9, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r9\n\t" +#endif +#else + "ubfx r6, r9, #0, #8\n\t" +#endif + "eor r5, r5, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r8, ror #8\n\t" +#endif +#else + "ubfx r7, r8, #8, #8\n\t" +#endif + "eor r5, r5, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r11, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r11, ror #16\n\t" +#endif +#else + 
"ubfx lr, r11, #16, #8\n\t" +#endif + "eor r5, r5, r2, lsl #24\n\t" + "lsr r2, r10, #24\n\t" + "ldrb r6, [r0, r6, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" + "lsr r11, r11, #24\n\t" + "eor r6, r6, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r10, #24\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r10\n\t" +#endif +#else + "ubfx r7, r10, #0, #8\n\t" +#endif + "eor r6, r6, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r9, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r9, ror #8\n\t" +#endif +#else + "ubfx lr, r9, #8, #8\n\t" +#endif + "eor r6, r6, r2, lsl #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r8, #8\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r8, ror #16\n\t" +#endif +#else + "ubfx r2, r8, #16, #8\n\t" +#endif + "ldrb r11, [r0, r11, lsl #2]\n\t" + "ldrb r7, [r0, r7, lsl #2]\n\t" + "ldrb lr, [r0, lr, lsl #2]\n\t" + "ldrb r2, [r0, r2, lsl #2]\n\t" + "eor lr, lr, r11, lsl #16\n\t" + "ldm %[KS], {r8, r9, r10, r11}\n\t" + "eor r7, r7, lr, lsl #8\n\t" + "eor r7, r7, r2, lsl #16\n\t" + /* XOR in Key Schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" +#endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + "pop {r1, %[length], lr}\n\t" + "ldr %[KS], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor 
r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "ldr r8, [sp, #4]\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" + "subs %[length], %[length], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_GCMSIV_ctr_base_loop_block_128_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_base_end_%=:\n\t" + "pop {%[KS], r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r10, r4, r4, ror #16\n\t" + "eor r11, r5, r5, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "eor r4, r4, r10, lsr #8\n\t" + "eor r5, r5, r11, lsr #8\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "stm r8, {r4, r5, r6, r7}\n\t" + "pop {%[nr], %[ctr]}\n\t" + "pop {%[L_AES_GCMSIV_ctr_base_te]}\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [length] "+r" (length), + [KS] "+r" (KS), [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_GCMSIV_ctr_base_te] "+r" (L_AES_GCMSIV_ctr_base_te_c) + : +#else + : + : [in] "r" (in), [out] "r" (out), [length] "r" (length), [KS] "r" (KS), + [nr] "r" (nr), [ctr] "r" (ctr), + [L_AES_GCMSIV_ctr_base_te] "r" 
(L_AES_GCMSIV_ctr_base_te_c) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + : "memory", "cc", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +#endif /* WOLFSSL_AESGCM_SIV */ #endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ #endif /* !NO_AES */ #endif /* !__aarch64__ && !WOLFSSL_ARMASM_THUMB2 */ diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm.S b/wolfcrypt/src/port/arm/armv8-aes-asm.S index 7a3d2ef74b..325d1cf975 100644 --- a/wolfcrypt/src/port/arm/armv8-aes-asm.S +++ b/wolfcrypt/src/port/arm/armv8-aes-asm.S @@ -5736,7 +5736,7 @@ L_aes_gcm_encrypt_arm64_crypto_nonce_end_bytes: # Done GHASH L_aes_gcm_encrypt_arm64_crypto_nonce_partial_done: eor x14, x14, x14 - ubfiz x24, x4, #3, #32 + lsl x24, x4, #3 mov v28.d[0], x14 mov v28.d[1], x24 rev64 v28.16b, v28.16b @@ -7099,10 +7099,10 @@ L_aes_gcm_encrypt_arm64_crypto_192_start_zero: # Done GHASH L_aes_gcm_encrypt_arm64_crypto_192_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 eor v26.16b, v26.16b, v28.16b @@ -8637,10 +8637,10 @@ L_aes_gcm_encrypt_arm64_crypto_256_start_zero: # Done GHASH L_aes_gcm_encrypt_arm64_crypto_256_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 aese v14.16b, v0.16b @@ -9944,10 +9944,10 @@ L_aes_gcm_encrypt_arm64_crypto_128_start_zero: # Done GHASH L_aes_gcm_encrypt_arm64_crypto_128_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 eor v26.16b, v26.16b, v28.16b @@ -10537,7 +10537,7 @@ L_aes_gcm_decrypt_arm64_crypto_nonce_end_bytes: # Done GHASH L_aes_gcm_decrypt_arm64_crypto_nonce_partial_done: eor x14, x14, x14 - ubfiz x24, x4, #3, #32 + lsl x24, x4, #3 mov v28.d[0], x14 mov v28.d[1], x24 rev64 v28.16b, v28.16b @@ -11893,10 
+11893,10 @@ L_aes_gcm_decrypt_arm64_crypto_192_out_start_byte: L_aes_gcm_decrypt_arm64_crypto_192_out_end_bytes: L_aes_gcm_decrypt_arm64_crypto_192_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 eor v26.16b, v26.16b, v28.16b @@ -11945,7 +11945,6 @@ L_aes_gcm_decrypt_arm64_crypto_192_partial_done: ld1 {v28.16b}, [x5] b L_aes_gcm_decrypt_arm64_crypto_192_tag_loaded L_aes_gcm_decrypt_arm64_crypto_192_part_tag: - ubfiz x6, x6, #0, #32 eor v28.16b, v28.16b, v28.16b mov x17, x6 st1 {v28.2d}, [x11] @@ -13447,10 +13446,10 @@ L_aes_gcm_decrypt_arm64_crypto_256_out_start_byte: L_aes_gcm_decrypt_arm64_crypto_256_out_end_bytes: L_aes_gcm_decrypt_arm64_crypto_256_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 aese v14.16b, v0.16b @@ -13507,7 +13506,6 @@ L_aes_gcm_decrypt_arm64_crypto_256_partial_done: ld1 {v28.16b}, [x5] b L_aes_gcm_decrypt_arm64_crypto_256_tag_loaded L_aes_gcm_decrypt_arm64_crypto_256_part_tag: - ubfiz x6, x6, #0, #32 eor v28.16b, v28.16b, v28.16b mov x17, x6 st1 {v28.2d}, [x11] @@ -14770,10 +14768,10 @@ L_aes_gcm_decrypt_arm64_crypto_128_out_start_byte: L_aes_gcm_decrypt_arm64_crypto_128_out_end_bytes: L_aes_gcm_decrypt_arm64_crypto_128_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 eor v26.16b, v26.16b, v28.16b @@ -14818,7 +14816,6 @@ L_aes_gcm_decrypt_arm64_crypto_128_partial_done: ld1 {v28.16b}, [x5] b L_aes_gcm_decrypt_arm64_crypto_128_tag_loaded L_aes_gcm_decrypt_arm64_crypto_128_part_tag: - ubfiz x6, x6, #0, #32 eor v28.16b, v28.16b, v28.16b mov x17, x6 st1 {v28.2d}, [x11] @@ -15367,7 +15364,7 @@ L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_end_bytes: # Done GHASH 
L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_partial_done: eor x14, x14, x14 - ubfiz x24, x4, #3, #32 + lsl x24, x4, #3 mov v28.d[0], x14 mov v28.d[1], x24 rev64 v28.16b, v28.16b @@ -16701,10 +16698,10 @@ L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_zero: # Done GHASH L_aes_gcm_encrypt_arm64_crypto_eor3_192_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 eor v26.16b, v26.16b, v28.16b @@ -18210,10 +18207,10 @@ L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_zero: # Done GHASH L_aes_gcm_encrypt_arm64_crypto_eor3_256_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 aese v14.16b, v0.16b @@ -19488,10 +19485,10 @@ L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_zero: # Done GHASH L_aes_gcm_encrypt_arm64_crypto_eor3_128_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 eor v26.16b, v26.16b, v28.16b @@ -20059,7 +20056,7 @@ L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_end_bytes: # Done GHASH L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_partial_done: eor x14, x14, x14 - ubfiz x24, x4, #3, #32 + lsl x24, x4, #3 mov v28.d[0], x14 mov v28.d[1], x24 rev64 v28.16b, v28.16b @@ -21386,10 +21383,10 @@ L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_byte: L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_end_bytes: L_aes_gcm_decrypt_arm64_crypto_eor3_192_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 eor v26.16b, v26.16b, v28.16b @@ -21437,7 +21434,6 @@ L_aes_gcm_decrypt_arm64_crypto_eor3_192_partial_done: ld1 {v28.16b}, [x5] b L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_loaded 
L_aes_gcm_decrypt_arm64_crypto_eor3_192_part_tag: - ubfiz x6, x6, #0, #32 eor v28.16b, v28.16b, v28.16b mov x17, x6 st1 {v28.2d}, [x11] @@ -22911,10 +22907,10 @@ L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_byte: L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_end_bytes: L_aes_gcm_decrypt_arm64_crypto_eor3_256_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 aese v14.16b, v0.16b @@ -22970,7 +22966,6 @@ L_aes_gcm_decrypt_arm64_crypto_eor3_256_partial_done: ld1 {v28.16b}, [x5] b L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_loaded L_aes_gcm_decrypt_arm64_crypto_eor3_256_part_tag: - ubfiz x6, x6, #0, #32 eor v28.16b, v28.16b, v28.16b mov x17, x6 st1 {v28.2d}, [x11] @@ -24205,10 +24200,10 @@ L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_byte: L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_end_bytes: L_aes_gcm_decrypt_arm64_crypto_eor3_128_partial_done: ld1 {v14.2d}, [x12] - ubfiz x8, x8, #3, #32 + lsl x8, x8, #3 rbit x8, x8 mov v28.d[0], x8 - ubfiz x2, x2, #3, #32 + lsl x2, x2, #3 rbit x2, x2 mov v28.d[1], x2 eor v26.16b, v26.16b, v28.16b @@ -24252,7 +24247,6 @@ L_aes_gcm_decrypt_arm64_crypto_eor3_128_partial_done: ld1 {v28.16b}, [x5] b L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_loaded L_aes_gcm_decrypt_arm64_crypto_eor3_128_part_tag: - ubfiz x6, x6, #0, #32 eor v28.16b, v28.16b, v28.16b mov x17, x6 st1 {v28.2d}, [x11] @@ -24426,7 +24420,7 @@ L_aes_gcm_init_arm64_crypto_end_bytes: # Done GHASH L_aes_gcm_init_arm64_crypto_partial_done: eor x7, x7, x7 - ubfiz x13, x3, #3, #32 + lsl x13, x3, #3 mov v7.d[0], x7 mov v7.d[1], x13 rev64 v7.16b, v7.16b @@ -28788,10 +28782,10 @@ _AES_GCM_encrypt_final_AARCH64: ld1 {v4.2d}, [x5] ushr v6.2d, v6.2d, #56 ld1 {v7.2d}, [x6] - ubfiz x4, x4, #3, #32 + lsl x4, x4, #3 rbit x4, x4 mov v0.d[0], x4 - ubfiz x3, x3, #3, #32 + lsl x3, x3, #3 rbit x3, x3 mov v0.d[1], x3 eor v5.16b, v5.16b, v0.16b @@ -32674,10 +32668,10 
@@ _AES_GCM_decrypt_final_AARCH64:
         ld1 {v4.2d}, [x5]
         ushr v6.2d, v6.2d, #56
         ld1 {v7.2d}, [x6]
-        ubfiz x4, x4, #3, #32
+        lsl x4, x4, #3
         rbit x4, x4
         mov v0.d[0], x4
-        ubfiz x3, x3, #3, #32
+        lsl x3, x3, #3
         rbit x3, x3
         mov v0.d[1], x3
         eor v5.16b, v5.16b, v0.16b
@@ -32702,7 +32696,6 @@ _AES_GCM_decrypt_final_AARCH64:
         ld1 {v0.16b}, [x1]
         b L_aes_gcm_decrypt_final_arm64_crypto_tag_loaded
 L_aes_gcm_decrypt_final_arm64_crypto_part_tag:
-        ubfiz x2, x2, #0, #32
         eor v0.16b, v0.16b, v0.16b
         mov x10, x2
         st1 {v0.2d}, [x0]
@@ -32863,7 +32856,7 @@ L_aes_gcm_init_arm64_crypto_eor3_end_bytes:
         # Done GHASH
 L_aes_gcm_init_arm64_crypto_eor3_partial_done:
         eor x7, x7, x7
-        ubfiz x13, x3, #3, #32
+        lsl x13, x3, #3
         mov v7.d[0], x7
         mov v7.d[1], x13
         rev64 v7.16b, v7.16b
@@ -37121,10 +37114,10 @@ _AES_GCM_encrypt_final_AARCH64_EOR3:
         ld1 {v4.2d}, [x5]
         ushr v6.2d, v6.2d, #56
         ld1 {v7.2d}, [x6]
-        ubfiz x4, x4, #3, #32
+        lsl x4, x4, #3
         rbit x4, x4
         mov v0.d[0], x4
-        ubfiz x3, x3, #3, #32
+        lsl x3, x3, #3
         rbit x3, x3
         mov v0.d[1], x3
         eor v5.16b, v5.16b, v0.16b
@@ -40922,10 +40915,10 @@ _AES_GCM_decrypt_final_AARCH64_EOR3:
         ld1 {v4.2d}, [x5]
         ushr v6.2d, v6.2d, #56
         ld1 {v7.2d}, [x6]
-        ubfiz x4, x4, #3, #32
+        lsl x4, x4, #3
         rbit x4, x4
         mov v0.d[0], x4
-        ubfiz x3, x3, #3, #32
+        lsl x3, x3, #3
         rbit x3, x3
         mov v0.d[1], x3
         eor v5.16b, v5.16b, v0.16b
@@ -40949,7 +40942,6 @@ _AES_GCM_decrypt_final_AARCH64_EOR3:
         ld1 {v0.16b}, [x1]
         b L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_loaded
 L_aes_gcm_decrypt_final_arm64_crypto_eor3_part_tag:
-        ubfiz x2, x2, #0, #32
         eor v0.16b, v0.16b, v0.16b
         mov x10, x2
         st1 {v0.2d}, [x0]
@@ -43124,6 +43116,1073 @@ L_aes_xts_decrypt_arm64_crypto_done:
 #endif /* __APPLE__ */
 #endif /* HAVE_AES_DECRYPT */
 #endif /* WOLFSSL_AES_XTS */
+#ifdef WOLFSSL_AESGCM_SIV
+#ifndef __APPLE__
+.text
+.globl AES_GCMSIV_polyval_pmull
+.type AES_GCMSIV_polyval_pmull,@function
+.align 2
+AES_GCMSIV_polyval_pmull:
+#else
+.section __TEXT,__text
+.globl _AES_GCMSIV_polyval_pmull
+.p2align 2
+_AES_GCMSIV_polyval_pmull:
+#endif /* __APPLE__ */
+        stp x29, x30, [sp, #-80]! /* polyval: x0 = 16-byte accumulator (read, then written back), x1 = 16-byte multiplier */
+        add x29, sp, #0 /* x2 = input data, w3 = number of 16-byte blocks to fold in */
+        stp d8, d9, [x29, #16] /* v8-v15 are used below, so d8-d15 are saved here and restored before ret */
+        stp d10, d11, [x29, #32]
+        stp d12, d13, [x29, #48]
+        stp d14, d15, [x29, #64]
+        movi v2.16b, #0 /* v2 = all-zero vector, shifted in by the ext instructions below */
+        ld1 {v1.16b}, [x1] /* v1 = multiplier */
+        ld1 {v0.16b}, [x0] /* v0 = running accumulator */
+        rev64 v0.16b, v0.16b /* rev64 + ext #8 together reverse all 16 bytes of v0 */
+        ext v0.16b, v0.16b, v0.16b, #8
+        pmull v18.1q, v1.1d, v1.1d /* v3 = v1 * v1: 128x128 carry-less multiply into v19:v18, then shift and reduce */
+        pmull2 v19.1q, v1.2d, v1.2d
+        ext v22.16b, v1.16b, v1.16b, #8 /* swap the halves of the first operand to form the two cross products */
+        pmull v23.1q, v22.1d, v1.1d
+        pmull2 v22.1q, v22.2d, v1.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v18.16b, v18.16b, v23.16b
+        eor v19.16b, v19.16b, v22.16b
+        ushr v24.4s, v18.4s, #31 /* shift the 256-bit value v19:v18 left by one bit, carrying across 32-bit lanes */
+        ushr v25.4s, v19.4s, #31
+        shl v18.4s, v18.4s, #1
+        shl v19.4s, v19.4s, #1
+        ext v26.16b, v24.16b, v2.16b, #12
+        ext v25.16b, v2.16b, v25.16b, #12
+        ext v24.16b, v2.16b, v24.16b, #12
+        orr v18.16b, v18.16b, v24.16b
+        orr v19.16b, v19.16b, v25.16b
+        orr v19.16b, v19.16b, v26.16b
+        shl v24.4s, v18.4s, #31 /* reduction: fold the low half v18 into the high half v19 (shift amounts 31/30/25, then 1/2/7) */
+        shl v25.4s, v18.4s, #30
+        shl v26.4s, v18.4s, #25
+        eor v24.16b, v24.16b, v25.16b
+        eor v24.16b, v24.16b, v26.16b
+        ext v25.16b, v24.16b, v2.16b, #4
+        ext v24.16b, v2.16b, v24.16b, #4
+        eor v18.16b, v18.16b, v24.16b
+        ushr v24.4s, v18.4s, #1
+        ushr v26.4s, v18.4s, #2
+        ushr v27.4s, v18.4s, #7
+        eor v24.16b, v24.16b, v26.16b
+        eor v24.16b, v24.16b, v27.16b
+        eor v24.16b, v24.16b, v25.16b
+        eor v18.16b, v18.16b, v24.16b
+        eor v3.16b, v19.16b, v18.16b /* v3 = 2nd power of v1 */
+        pmull v18.1q, v3.1d, v1.1d /* v4 = v3 * v1 */
+        pmull2 v19.1q, v3.2d, v1.2d
+        ext v22.16b, v3.16b, v3.16b, #8
+        pmull v23.1q, v22.1d, v1.1d
+        pmull2 v22.1q, v22.2d, v1.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v18.16b, v18.16b, v23.16b
+        eor v19.16b, v19.16b, v22.16b
+        ushr v24.4s, v18.4s, #31
+        ushr v25.4s, v19.4s, #31
+        shl v18.4s, v18.4s, #1
+        shl v19.4s, v19.4s, #1
+        ext v26.16b, v24.16b, v2.16b, #12
+        ext v25.16b, v2.16b, v25.16b, #12
+        ext v24.16b, v2.16b, v24.16b, #12
+        orr v18.16b, v18.16b, v24.16b
+        orr v19.16b, v19.16b, v25.16b
+        orr v19.16b, v19.16b, v26.16b
+        shl v24.4s, v18.4s, #31
+        shl v25.4s, v18.4s, #30
+        shl v26.4s, v18.4s, #25
+        eor v24.16b, v24.16b, v25.16b
+        eor v24.16b, v24.16b, v26.16b
+        ext v25.16b, v24.16b, v2.16b, #4
+        ext v24.16b, v2.16b, v24.16b, #4
+        eor v18.16b, v18.16b, v24.16b
+        ushr v24.4s, v18.4s, #1
+        ushr v26.4s, v18.4s, #2
+        ushr v27.4s, v18.4s, #7
+        eor v24.16b, v24.16b, v26.16b
+        eor v24.16b, v24.16b, v27.16b
+        eor v24.16b, v24.16b, v25.16b
+        eor v18.16b, v18.16b, v24.16b
+        eor v4.16b, v19.16b, v18.16b /* v4 = 3rd power of v1 */
+        pmull v18.1q, v3.1d, v3.1d /* v5 = v3 * v3 */
+        pmull2 v19.1q, v3.2d, v3.2d
+        ext v22.16b, v3.16b, v3.16b, #8
+        pmull v23.1q, v22.1d, v3.1d
+        pmull2 v22.1q, v22.2d, v3.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v18.16b, v18.16b, v23.16b
+        eor v19.16b, v19.16b, v22.16b
+        ushr v24.4s, v18.4s, #31
+        ushr v25.4s, v19.4s, #31
+        shl v18.4s, v18.4s, #1
+        shl v19.4s, v19.4s, #1
+        ext v26.16b, v24.16b, v2.16b, #12
+        ext v25.16b, v2.16b, v25.16b, #12
+        ext v24.16b, v2.16b, v24.16b, #12
+        orr v18.16b, v18.16b, v24.16b
+        orr v19.16b, v19.16b, v25.16b
+        orr v19.16b, v19.16b, v26.16b
+        shl v24.4s, v18.4s, #31
+        shl v25.4s, v18.4s, #30
+        shl v26.4s, v18.4s, #25
+        eor v24.16b, v24.16b, v25.16b
+        eor v24.16b, v24.16b, v26.16b
+        ext v25.16b, v24.16b, v2.16b, #4
+        ext v24.16b, v2.16b, v24.16b, #4
+        eor v18.16b, v18.16b, v24.16b
+        ushr v24.4s, v18.4s, #1
+        ushr v26.4s, v18.4s, #2
+        ushr v27.4s, v18.4s, #7
+        eor v24.16b, v24.16b, v26.16b
+        eor v24.16b, v24.16b, v27.16b
+        eor v24.16b, v24.16b, v25.16b
+        eor v18.16b, v18.16b, v24.16b
+        eor v5.16b, v19.16b, v18.16b /* v5 = 4th power of v1 */
+        pmull v18.1q, v5.1d, v1.1d /* v6 = v5 * v1 */
+        pmull2 v19.1q, v5.2d, v1.2d
+        ext v22.16b, v5.16b, v5.16b, #8
+        pmull v23.1q, v22.1d, v1.1d
+        pmull2 v22.1q, v22.2d, v1.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v18.16b, v18.16b, v23.16b
+        eor v19.16b, v19.16b, v22.16b
+        ushr v24.4s, v18.4s, #31
+        ushr v25.4s, v19.4s, #31
+        shl v18.4s, v18.4s, #1
+        shl v19.4s, v19.4s, #1
+        ext v26.16b, v24.16b, v2.16b, #12
+        ext v25.16b, v2.16b, v25.16b, #12
+        ext v24.16b, v2.16b, v24.16b, #12
+        orr v18.16b, v18.16b, v24.16b
+        orr v19.16b, v19.16b, v25.16b
+        orr v19.16b, v19.16b, v26.16b
+        shl v24.4s, v18.4s, #31
+        shl v25.4s, v18.4s, #30
+        shl v26.4s, v18.4s, #25
+        eor v24.16b, v24.16b, v25.16b
+        eor v24.16b, v24.16b, v26.16b
+        ext v25.16b, v24.16b, v2.16b, #4
+        ext v24.16b, v2.16b, v24.16b, #4
+        eor v18.16b, v18.16b, v24.16b
+        ushr v24.4s, v18.4s, #1
+        ushr v26.4s, v18.4s, #2
+        ushr v27.4s, v18.4s, #7
+        eor v24.16b, v24.16b, v26.16b
+        eor v24.16b, v24.16b, v27.16b
+        eor v24.16b, v24.16b, v25.16b
+        eor v18.16b, v18.16b, v24.16b
+        eor v6.16b, v19.16b, v18.16b /* v6 = 5th power of v1 */
+        pmull v18.1q, v5.1d, v3.1d /* v7 = v5 * v3 */
+        pmull2 v19.1q, v5.2d, v3.2d
+        ext v22.16b, v5.16b, v5.16b, #8
+        pmull v23.1q, v22.1d, v3.1d
+        pmull2 v22.1q, v22.2d, v3.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v18.16b, v18.16b, v23.16b
+        eor v19.16b, v19.16b, v22.16b
+        ushr v24.4s, v18.4s, #31
+        ushr v25.4s, v19.4s, #31
+        shl v18.4s, v18.4s, #1
+        shl v19.4s, v19.4s, #1
+        ext v26.16b, v24.16b, v2.16b, #12
+        ext v25.16b, v2.16b, v25.16b, #12
+        ext v24.16b, v2.16b, v24.16b, #12
+        orr v18.16b, v18.16b, v24.16b
+        orr v19.16b, v19.16b, v25.16b
+        orr v19.16b, v19.16b, v26.16b
+        shl v24.4s, v18.4s, #31
+        shl v25.4s, v18.4s, #30
+        shl v26.4s, v18.4s, #25
+        eor v24.16b, v24.16b, v25.16b
+        eor v24.16b, v24.16b, v26.16b
+        ext v25.16b, v24.16b, v2.16b, #4
+        ext v24.16b, v2.16b, v24.16b, #4
+        eor v18.16b, v18.16b, v24.16b
+        ushr v24.4s, v18.4s, #1
+        ushr v26.4s, v18.4s, #2
+        ushr v27.4s, v18.4s, #7
+        eor v24.16b, v24.16b, v26.16b
+        eor v24.16b, v24.16b, v27.16b
+        eor v24.16b, v24.16b, v25.16b
+        eor v18.16b, v18.16b, v24.16b
+        eor v7.16b, v19.16b, v18.16b /* v7 = 6th power of v1 */
+        pmull v18.1q, v5.1d, v4.1d /* v8 = v5 * v4 */
+        pmull2 v19.1q, v5.2d, v4.2d
+        ext v22.16b, v5.16b, v5.16b, #8
+        pmull v23.1q, v22.1d, v4.1d
+        pmull2 v22.1q, v22.2d, v4.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v18.16b, v18.16b, v23.16b
+        eor v19.16b, v19.16b, v22.16b
+        ushr v24.4s, v18.4s, #31
+        ushr v25.4s, v19.4s, #31
+        shl v18.4s, v18.4s, #1
+        shl v19.4s, v19.4s, #1
+        ext v26.16b, v24.16b, v2.16b, #12
+        ext v25.16b, v2.16b, v25.16b, #12
+        ext v24.16b, v2.16b, v24.16b, #12
+        orr v18.16b, v18.16b, v24.16b
+        orr v19.16b, v19.16b, v25.16b
+        orr v19.16b, v19.16b, v26.16b
+        shl v24.4s, v18.4s, #31
+        shl v25.4s, v18.4s, #30
+        shl v26.4s, v18.4s, #25
+        eor v24.16b, v24.16b, v25.16b
+        eor v24.16b, v24.16b, v26.16b
+        ext v25.16b, v24.16b, v2.16b, #4
+        ext v24.16b, v2.16b, v24.16b, #4
+        eor v18.16b, v18.16b, v24.16b
+        ushr v24.4s, v18.4s, #1
+        ushr v26.4s, v18.4s, #2
+        ushr v27.4s, v18.4s, #7
+        eor v24.16b, v24.16b, v26.16b
+        eor v24.16b, v24.16b, v27.16b
+        eor v24.16b, v24.16b, v25.16b
+        eor v18.16b, v18.16b, v24.16b
+        eor v8.16b, v19.16b, v18.16b /* v8 = 7th power of v1 */
+        pmull v18.1q, v5.1d, v5.1d /* v9 = v5 * v5 */
+        pmull2 v19.1q, v5.2d, v5.2d
+        ext v22.16b, v5.16b, v5.16b, #8
+        pmull v23.1q, v22.1d, v5.1d
+        pmull2 v22.1q, v22.2d, v5.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v18.16b, v18.16b, v23.16b
+        eor v19.16b, v19.16b, v22.16b
+        ushr v24.4s, v18.4s, #31
+        ushr v25.4s, v19.4s, #31
+        shl v18.4s, v18.4s, #1
+        shl v19.4s, v19.4s, #1
+        ext v26.16b, v24.16b, v2.16b, #12
+        ext v25.16b, v2.16b, v25.16b, #12
+        ext v24.16b, v2.16b, v24.16b, #12
+        orr v18.16b, v18.16b, v24.16b
+        orr v19.16b, v19.16b, v25.16b
+        orr v19.16b, v19.16b, v26.16b
+        shl v24.4s, v18.4s, #31
+        shl v25.4s, v18.4s, #30
+        shl v26.4s, v18.4s, #25
+        eor v24.16b, v24.16b, v25.16b
+        eor v24.16b, v24.16b, v26.16b
+        ext v25.16b, v24.16b, v2.16b, #4
+        ext v24.16b, v2.16b, v24.16b, #4
+        eor v18.16b, v18.16b, v24.16b
+        ushr v24.4s, v18.4s, #1
+        ushr v26.4s, v18.4s, #2
+        ushr v27.4s, v18.4s, #7
+        eor v24.16b, v24.16b, v26.16b
+        eor v24.16b, v24.16b, v27.16b
+        eor v24.16b, v24.16b, v25.16b
+        eor v18.16b, v18.16b, v24.16b
+        eor v9.16b, v19.16b, v18.16b /* v9 = 8th power of v1 */
+L_AES_GCMSIV_polyval_pmull_loop8:
+        cmp w3, #8 /* 8-block path while at least 8 blocks remain (blt is a signed compare) */
+        blt L_AES_GCMSIV_polyval_pmull_done8
+        ld1 {v10.16b, v11.16b, v12.16b, v13.16b}, [x2], #0x40 /* v10-v17 = next 8 blocks; x2 advances by 128 bytes in total */
+        ld1 {v14.16b, v15.16b, v16.16b, v17.16b}, [x2], #0x40
+        eor v10.16b, v10.16b, v0.16b /* fold the accumulator into the first block */
+        pmull v18.1q, v10.1d, v9.1d /* v19:v18 = block 0 * v9 (unreduced) */
+        pmull2 v19.1q, v10.2d, v9.2d
+        ext v22.16b, v10.16b, v10.16b, #8
+        pmull v23.1q, v22.1d, v9.1d
+        pmull2 v22.1q, v22.2d, v9.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v18.16b, v18.16b, v23.16b
+        eor v19.16b, v19.16b, v22.16b
+        pmull v20.1q, v11.1d, v8.1d /* v19:v18 ^= block 1 * v8 */
+        pmull2 v21.1q, v11.2d, v8.2d
+        ext v22.16b, v11.16b, v11.16b, #8
+        pmull v23.1q, v22.1d, v8.1d
+        pmull2 v22.1q, v22.2d, v8.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v20.16b, v20.16b, v23.16b
+        eor v21.16b, v21.16b, v22.16b
+        eor v18.16b, v18.16b, v20.16b
+        eor v19.16b, v19.16b, v21.16b
+        pmull v20.1q, v12.1d, v7.1d /* v19:v18 ^= block 2 * v7 */
+        pmull2 v21.1q, v12.2d, v7.2d
+        ext v22.16b, v12.16b, v12.16b, #8
+        pmull v23.1q, v22.1d, v7.1d
+        pmull2 v22.1q, v22.2d, v7.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v20.16b, v20.16b, v23.16b
+        eor v21.16b, v21.16b, v22.16b
+        eor v18.16b, v18.16b, v20.16b
+        eor v19.16b, v19.16b, v21.16b
+        pmull v20.1q, v13.1d, v6.1d /* v19:v18 ^= block 3 * v6 */
+        pmull2 v21.1q, v13.2d, v6.2d
+        ext v22.16b, v13.16b, v13.16b, #8
+        pmull v23.1q, v22.1d, v6.1d
+        pmull2 v22.1q, v22.2d, v6.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v20.16b, v20.16b, v23.16b
+        eor v21.16b, v21.16b, v22.16b
+        eor v18.16b, v18.16b, v20.16b
+        eor v19.16b, v19.16b, v21.16b
+        pmull v20.1q, v14.1d, v5.1d /* v19:v18 ^= block 4 * v5 */
+        pmull2 v21.1q, v14.2d, v5.2d
+        ext v22.16b, v14.16b, v14.16b, #8
+        pmull v23.1q, v22.1d, v5.1d
+        pmull2 v22.1q, v22.2d, v5.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v20.16b, v20.16b, v23.16b
+        eor v21.16b, v21.16b, v22.16b
+        eor v18.16b, v18.16b, v20.16b
+        eor v19.16b, v19.16b, v21.16b
+        pmull v20.1q, v15.1d, v4.1d /* v19:v18 ^= block 5 * v4 */
+        pmull2 v21.1q, v15.2d, v4.2d
+        ext v22.16b, v15.16b, v15.16b, #8
+        pmull v23.1q, v22.1d, v4.1d
+        pmull2 v22.1q, v22.2d, v4.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v20.16b, v20.16b, v23.16b
+        eor v21.16b, v21.16b, v22.16b
+        eor v18.16b, v18.16b, v20.16b
+        eor v19.16b, v19.16b, v21.16b
+        pmull v20.1q, v16.1d, v3.1d /* v19:v18 ^= block 6 * v3 */
+        pmull2 v21.1q, v16.2d, v3.2d
+        ext v22.16b, v16.16b, v16.16b, #8
+        pmull v23.1q, v22.1d, v3.1d
+        pmull2 v22.1q, v22.2d, v3.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v20.16b, v20.16b, v23.16b
+        eor v21.16b, v21.16b, v22.16b
+        eor v18.16b, v18.16b, v20.16b
+        eor v19.16b, v19.16b, v21.16b
+        pmull v20.1q, v17.1d, v1.1d /* v19:v18 ^= block 7 * v1 */
+        pmull2 v21.1q, v17.2d, v1.2d
+        ext v22.16b, v17.16b, v17.16b, #8
+        pmull v23.1q, v22.1d, v1.1d
+        pmull2 v22.1q, v22.2d, v1.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v20.16b, v20.16b, v23.16b
+        eor v21.16b, v21.16b, v22.16b
+        eor v18.16b, v18.16b, v20.16b
+        eor v19.16b, v19.16b, v21.16b
+        ushr v24.4s, v18.4s, #31 /* one shift + reduction for the sum of all 8 products */
+        ushr v25.4s, v19.4s, #31
+        shl v18.4s, v18.4s, #1
+        shl v19.4s, v19.4s, #1
+        ext v26.16b, v24.16b, v2.16b, #12
+        ext v25.16b, v2.16b, v25.16b, #12
+        ext v24.16b, v2.16b, v24.16b, #12
+        orr v18.16b, v18.16b, v24.16b
+        orr v19.16b, v19.16b, v25.16b
+        orr v19.16b, v19.16b, v26.16b
+        shl v24.4s, v18.4s, #31
+        shl v25.4s, v18.4s, #30
+        shl v26.4s, v18.4s, #25
+        eor v24.16b, v24.16b, v25.16b
+        eor v24.16b, v24.16b, v26.16b
+        ext v25.16b, v24.16b, v2.16b, #4
+        ext v24.16b, v2.16b, v24.16b, #4
+        eor v18.16b, v18.16b, v24.16b
+        ushr v24.4s, v18.4s, #1
+        ushr v26.4s, v18.4s, #2
+        ushr v27.4s, v18.4s, #7
+        eor v24.16b, v24.16b, v26.16b
+        eor v24.16b, v24.16b, v27.16b
+        eor v24.16b, v24.16b, v25.16b
+        eor v18.16b, v18.16b, v24.16b
+        eor v0.16b, v19.16b, v18.16b /* v0 = updated accumulator */
+        sub w3, w3, #8
+        b L_AES_GCMSIV_polyval_pmull_loop8
+L_AES_GCMSIV_polyval_pmull_done8:
+L_AES_GCMSIV_polyval_pmull_loop4:
+        cmp w3, #4 /* 4-block path: same scheme using v5, v4, v3, v1 */
+        blt L_AES_GCMSIV_polyval_pmull_done4
+        ld1 {v10.16b, v11.16b, v12.16b, v13.16b}, [x2], #0x40
+        eor v10.16b, v10.16b, v0.16b
+        pmull v18.1q, v10.1d, v5.1d
+        pmull2 v19.1q, v10.2d, v5.2d
+        ext v22.16b, v10.16b, v10.16b, #8
+        pmull v23.1q, v22.1d, v5.1d
+        pmull2 v22.1q, v22.2d, v5.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v18.16b, v18.16b, v23.16b
+        eor v19.16b, v19.16b, v22.16b
+        pmull v20.1q, v11.1d, v4.1d
+        pmull2 v21.1q, v11.2d, v4.2d
+        ext v22.16b, v11.16b, v11.16b, #8
+        pmull v23.1q, v22.1d, v4.1d
+        pmull2 v22.1q, v22.2d, v4.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v20.16b, v20.16b, v23.16b
+        eor v21.16b, v21.16b, v22.16b
+        eor v18.16b, v18.16b, v20.16b
+        eor v19.16b, v19.16b, v21.16b
+        pmull v20.1q, v12.1d, v3.1d
+        pmull2 v21.1q, v12.2d, v3.2d
+        ext v22.16b, v12.16b, v12.16b, #8
+        pmull v23.1q, v22.1d, v3.1d
+        pmull2 v22.1q, v22.2d, v3.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v20.16b, v20.16b, v23.16b
+        eor v21.16b, v21.16b, v22.16b
+        eor v18.16b, v18.16b, v20.16b
+        eor v19.16b, v19.16b, v21.16b
+        pmull v20.1q, v13.1d, v1.1d
+        pmull2 v21.1q, v13.2d, v1.2d
+        ext v22.16b, v13.16b, v13.16b, #8
+        pmull v23.1q, v22.1d, v1.1d
+        pmull2 v22.1q, v22.2d, v1.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v20.16b, v20.16b, v23.16b
+        eor v21.16b, v21.16b, v22.16b
+        eor v18.16b, v18.16b, v20.16b
+        eor v19.16b, v19.16b, v21.16b
+        ushr v24.4s, v18.4s, #31
+        ushr v25.4s, v19.4s, #31
+        shl v18.4s, v18.4s, #1
+        shl v19.4s, v19.4s, #1
+        ext v26.16b, v24.16b, v2.16b, #12
+        ext v25.16b, v2.16b, v25.16b, #12
+        ext v24.16b, v2.16b, v24.16b, #12
+        orr v18.16b, v18.16b, v24.16b
+        orr v19.16b, v19.16b, v25.16b
+        orr v19.16b, v19.16b, v26.16b
+        shl v24.4s, v18.4s, #31
+        shl v25.4s, v18.4s, #30
+        shl v26.4s, v18.4s, #25
+        eor v24.16b, v24.16b, v25.16b
+        eor v24.16b, v24.16b, v26.16b
+        ext v25.16b, v24.16b, v2.16b, #4
+        ext v24.16b, v2.16b, v24.16b, #4
+        eor v18.16b, v18.16b, v24.16b
+        ushr v24.4s, v18.4s, #1
+        ushr v26.4s, v18.4s, #2
+        ushr v27.4s, v18.4s, #7
+        eor v24.16b, v24.16b, v26.16b
+        eor v24.16b, v24.16b, v27.16b
+        eor v24.16b, v24.16b, v25.16b
+        eor v18.16b, v18.16b, v24.16b
+        eor v0.16b, v19.16b, v18.16b
+        sub w3, w3, #4
+        b L_AES_GCMSIV_polyval_pmull_loop4
+L_AES_GCMSIV_polyval_pmull_done4:
+L_AES_GCMSIV_polyval_pmull_rem:
+        cbz w3, L_AES_GCMSIV_polyval_pmull_done /* remaining blocks, one at a time */
+        ld1 {v10.16b}, [x2], #16
+        eor v0.16b, v0.16b, v10.16b /* accumulator ^= block, then multiply by v1 */
+        pmull v18.1q, v0.1d, v1.1d
+        pmull2 v19.1q, v0.2d, v1.2d
+        ext v22.16b, v0.16b, v0.16b, #8
+        pmull v23.1q, v22.1d, v1.1d
+        pmull2 v22.1q, v22.2d, v1.2d
+        eor v22.16b, v22.16b, v23.16b
+        ext v23.16b, v2.16b, v22.16b, #8
+        ext v22.16b, v22.16b, v2.16b, #8
+        eor v18.16b, v18.16b, v23.16b
+        eor v19.16b, v19.16b, v22.16b
+        ushr v24.4s, v18.4s, #31
+        ushr v25.4s, v19.4s, #31
+        shl v18.4s, v18.4s, #1
+        shl v19.4s, v19.4s, #1
+        ext v26.16b, v24.16b, v2.16b, #12
+        ext v25.16b, v2.16b, v25.16b, #12
+        ext v24.16b, v2.16b, v24.16b, #12
+        orr v18.16b, v18.16b, v24.16b
+        orr v19.16b, v19.16b, v25.16b
+        orr v19.16b, v19.16b, v26.16b
+        shl v24.4s, v18.4s, #31
+        shl v25.4s, v18.4s, #30
+        shl v26.4s, v18.4s, #25
+        eor v24.16b, v24.16b, v25.16b
+        eor v24.16b, v24.16b, v26.16b
+        ext v25.16b, v24.16b, v2.16b, #4
+        ext v24.16b, v2.16b, v24.16b, #4
+        eor v18.16b, v18.16b, v24.16b
+        ushr v24.4s, v18.4s, #1
+        ushr v26.4s, v18.4s, #2
+        ushr v27.4s, v18.4s, #7
+        eor v24.16b, v24.16b, v26.16b
+        eor v24.16b, v24.16b, v27.16b
+        eor v24.16b, v24.16b, v25.16b
+        eor v18.16b, v18.16b, v24.16b
+        eor v0.16b, v19.16b, v18.16b
+        subs w3, w3, #1
+        bne L_AES_GCMSIV_polyval_pmull_rem
+L_AES_GCMSIV_polyval_pmull_done:
+        rev64 v0.16b, v0.16b /* undo the 16-byte reversal done on entry, then write the accumulator back to x0 */
+        ext v0.16b, v0.16b, v0.16b, #8
+        st1 {v0.16b}, [x0]
+        ldp d8, d9, [x29, #16]
+        ldp d10, d11, [x29, #32]
+        ldp d12, d13, [x29, #48]
+        ldp d14, d15, [x29, #64]
+        ldp x29, x30, [sp], #0x50
+        ret
+#ifndef __APPLE__
+    .size AES_GCMSIV_polyval_pmull,.-AES_GCMSIV_polyval_pmull
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCMSIV_ctr_aarch64
+.type AES_GCMSIV_ctr_aarch64,@function
+.align 2
+AES_GCMSIV_ctr_aarch64:
+#else
+.section __TEXT,__text
+.globl _AES_GCMSIV_ctr_aarch64
+.p2align 2
+_AES_GCMSIV_ctr_aarch64:
+#endif /* __APPLE__ */
+        stp x29, x30, [sp, #-80]! /* ctr: x0 = input, x1 = output, x2 = length in bytes, x3 = round keys */
+        add x29, sp, #0 /* w4 = round count (selects the key-size path), x5 = 16-byte counter block (read, then written back) */
+        stp d8, d9, [x29, #16] /* v8-v15 are used below, so d8-d15 are saved here and restored before ret */
+        stp d10, d11, [x29, #32]
+        stp d12, d13, [x29, #48]
+        stp d14, d15, [x29, #64]
+        ld1 {v15.2d}, [x5] /* v15 = counter block */
+        mov w7, v15.s[0] /* w7 = 32-bit lane 0 of the counter block; the only part that is incremented */
+        lsr x6, x2, #4 /* x6 = number of whole 16-byte blocks; a tail of x2 & 15 bytes is not processed here */
+        ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x3], #0x40 /* v0-v7 = first 8 round keys */
+        ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x3], #0x40
+        cmp w4, #12 /* w4 < 12 -> 128 path, w4 > 12 -> 256 path, w4 == 12 falls through to the 192 path */
+        blt L_AES_GCMSIV_ctr_aarch64_start_128
+        bgt L_AES_GCMSIV_ctr_aarch64_start_256
+        # AES_GCMSIV_CTR_192
+#ifndef NO_AES_192
+        ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x3], #0x40 /* round keys 8-12 (13 round keys in total) */
+        ld1 {v12.2d}, [x3]
+L_AES_GCMSIV_ctr_aarch64_192_loop4:
+        cmp x6, #4
+        blt L_AES_GCMSIV_ctr_aarch64_192_done4
+        mov v16.16b, v15.16b /* v16-v19 = copies of the counter block with lane 0 = w7, w7+1, w7+2, w7+3 */
+        mov v16.s[0], w7
+        mov v17.16b, v15.16b
+        add w8, w7, #1
+        mov v17.s[0], w8
+        mov v18.16b, v15.16b
+        add w8, w7, #2
+        mov v18.s[0], w8
+        mov v19.16b, v15.16b
+        add w8, w7, #3
+        mov v19.s[0], w8
+        add w7, w7, #4 /* 32-bit add: the counter word wraps modulo 2^32 */
+        ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x0], #0x40 /* v20-v23 = next 64 bytes of input */
+        aese v16.16b, v0.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v0.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v0.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v0.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v1.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v1.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v1.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v1.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v2.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v2.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v2.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v2.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v3.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v3.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v3.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v3.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v4.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v4.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v4.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v4.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v5.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v5.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v5.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v5.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v6.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v6.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v6.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v6.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v7.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v7.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v7.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v7.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v8.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v8.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v8.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v8.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v9.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v9.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v9.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v9.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v10.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v10.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v10.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v10.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v11.16b /* last round: no aesmc, then XOR with the final round key v12 */
+        eor v16.16b, v16.16b, v12.16b
+        aese v17.16b, v11.16b
+        eor v17.16b, v17.16b, v12.16b
+        aese v18.16b, v11.16b
+        eor v18.16b, v18.16b, v12.16b
+        aese v19.16b, v11.16b
+        eor v19.16b, v19.16b, v12.16b
+        eor v20.16b, v20.16b, v16.16b /* output = input XOR encrypted counter blocks */
+        eor v21.16b, v21.16b, v17.16b
+        eor v22.16b, v22.16b, v18.16b
+        eor v23.16b, v23.16b, v19.16b
+        st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
+        sub x6, x6, #4
+        cmp x6, #4
+        bge L_AES_GCMSIV_ctr_aarch64_192_loop4
+L_AES_GCMSIV_ctr_aarch64_192_done4:
+L_AES_GCMSIV_ctr_aarch64_192_loop1:
+        cbz x6, L_AES_GCMSIV_ctr_aarch64_192_done1 /* remaining whole blocks, one at a time */
+        mov v16.16b, v15.16b
+        mov v16.s[0], w7
+        add w7, w7, #1
+        ld1 {v20.16b}, [x0], #16
+        aese v16.16b, v0.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v1.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v2.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v3.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v4.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v5.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v6.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v7.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v8.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v9.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v10.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v11.16b
+        eor v16.16b, v16.16b, v12.16b
+        eor v20.16b, v20.16b, v16.16b
+        st1 {v20.16b}, [x1], #16
+        sub x6, x6, #1
+        b L_AES_GCMSIV_ctr_aarch64_192_loop1
+L_AES_GCMSIV_ctr_aarch64_192_done1:
+#endif /* !NO_AES_192 */
+        b L_AES_GCMSIV_ctr_aarch64_done
+        # AES_GCMSIV_CTR_256
+L_AES_GCMSIV_ctr_aarch64_start_256:
+#ifndef NO_AES_256
+        ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x3], #0x40 /* round keys 8-14 (15 round keys in total) */
+        ld1 {v12.2d, v13.2d}, [x3], #32
+        ld1 {v14.2d}, [x3]
+L_AES_GCMSIV_ctr_aarch64_256_loop4:
+        cmp x6, #4
+        blt L_AES_GCMSIV_ctr_aarch64_256_done4
+        mov v16.16b, v15.16b
+        mov v16.s[0], w7
+        mov v17.16b, v15.16b
+        add w8, w7, #1
+        mov v17.s[0], w8
+        mov v18.16b, v15.16b
+        add w8, w7, #2
+        mov v18.s[0], w8
+        mov v19.16b, v15.16b
+        add w8, w7, #3
+        mov v19.s[0], w8
+        add w7, w7, #4
+        ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x0], #0x40
+        aese v16.16b, v0.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v0.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v0.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v0.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v1.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v1.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v1.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v1.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v2.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v2.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v2.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v2.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v3.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v3.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v3.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v3.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v4.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v4.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v4.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v4.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v5.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v5.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v5.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v5.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v6.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v6.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v6.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v6.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v7.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v7.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v7.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v7.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v8.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v8.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v8.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v8.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v9.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v9.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v9.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v9.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v10.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v10.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v10.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v10.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v11.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v11.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v11.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v11.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v12.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v12.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v12.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v12.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v13.16b /* last round: no aesmc, then XOR with the final round key v14 */
+        eor v16.16b, v16.16b, v14.16b
+        aese v17.16b, v13.16b
+        eor v17.16b, v17.16b, v14.16b
+        aese v18.16b, v13.16b
+        eor v18.16b, v18.16b, v14.16b
+        aese v19.16b, v13.16b
+        eor v19.16b, v19.16b, v14.16b
+        eor v20.16b, v20.16b, v16.16b
+        eor v21.16b, v21.16b, v17.16b
+        eor v22.16b, v22.16b, v18.16b
+        eor v23.16b, v23.16b, v19.16b
+        st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
+        sub x6, x6, #4
+        cmp x6, #4
+        bge L_AES_GCMSIV_ctr_aarch64_256_loop4
+L_AES_GCMSIV_ctr_aarch64_256_done4:
+L_AES_GCMSIV_ctr_aarch64_256_loop1:
+        cbz x6, L_AES_GCMSIV_ctr_aarch64_256_done1
+        mov v16.16b, v15.16b
+        mov v16.s[0], w7
+        add w7, w7, #1
+        ld1 {v20.16b}, [x0], #16
+        aese v16.16b, v0.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v1.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v2.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v3.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v4.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v5.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v6.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v7.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v8.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v9.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v10.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v11.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v12.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v13.16b
+        eor v16.16b, v16.16b, v14.16b
+        eor v20.16b, v20.16b, v16.16b
+        st1 {v20.16b}, [x1], #16
+        sub x6, x6, #1
+        b L_AES_GCMSIV_ctr_aarch64_256_loop1
+L_AES_GCMSIV_ctr_aarch64_256_done1:
+#endif /* !NO_AES_256 */
+        b L_AES_GCMSIV_ctr_aarch64_done
+        # AES_GCMSIV_CTR_128
+L_AES_GCMSIV_ctr_aarch64_start_128:
+#ifndef NO_AES_128
+        ld1 {v8.2d, v9.2d}, [x3], #32 /* round keys 8-10 (11 round keys in total) */
+        ld1 {v10.2d}, [x3]
+L_AES_GCMSIV_ctr_aarch64_128_loop4:
+        cmp x6, #4
+        blt L_AES_GCMSIV_ctr_aarch64_128_done4
+        mov v16.16b, v15.16b
+        mov v16.s[0], w7
+        mov v17.16b, v15.16b
+        add w8, w7, #1
+        mov v17.s[0], w8
+        mov v18.16b, v15.16b
+        add w8, w7, #2
+        mov v18.s[0], w8
+        mov v19.16b, v15.16b
+        add w8, w7, #3
+        mov v19.s[0], w8
+        add w7, w7, #4
+        ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x0], #0x40
+        aese v16.16b, v0.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v0.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v0.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v0.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v1.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v1.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v1.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v1.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v2.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v2.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v2.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v2.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v3.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v3.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v3.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v3.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v4.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v4.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v4.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v4.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v5.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v5.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v5.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v5.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v6.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v6.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v6.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v6.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v7.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v7.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v7.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v7.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v8.16b
+        aesmc v16.16b, v16.16b
+        aese v17.16b, v8.16b
+        aesmc v17.16b, v17.16b
+        aese v18.16b, v8.16b
+        aesmc v18.16b, v18.16b
+        aese v19.16b, v8.16b
+        aesmc v19.16b, v19.16b
+        aese v16.16b, v9.16b /* last round: no aesmc, then XOR with the final round key v10 */
+        eor v16.16b, v16.16b, v10.16b
+        aese v17.16b, v9.16b
+        eor v17.16b, v17.16b, v10.16b
+        aese v18.16b, v9.16b
+        eor v18.16b, v18.16b, v10.16b
+        aese v19.16b, v9.16b
+        eor v19.16b, v19.16b, v10.16b
+        eor v20.16b, v20.16b, v16.16b
+        eor v21.16b, v21.16b, v17.16b
+        eor v22.16b, v22.16b, v18.16b
+        eor v23.16b, v23.16b, v19.16b
+        st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
+        sub x6, x6, #4
+        cmp x6, #4
+        bge L_AES_GCMSIV_ctr_aarch64_128_loop4
+L_AES_GCMSIV_ctr_aarch64_128_done4:
+L_AES_GCMSIV_ctr_aarch64_128_loop1:
+        cbz x6, L_AES_GCMSIV_ctr_aarch64_128_done1
+        mov v16.16b, v15.16b
+        mov v16.s[0], w7
+        add w7, w7, #1
+        ld1 {v20.16b}, [x0], #16
+        aese v16.16b, v0.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v1.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v2.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v3.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v4.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v5.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v6.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v7.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v8.16b
+        aesmc v16.16b, v16.16b
+        aese v16.16b, v9.16b
+        eor v16.16b, v16.16b, v10.16b
+        eor v20.16b, v20.16b, v16.16b
+        st1 {v20.16b}, [x1], #16
+        sub x6, x6, #1
+        b L_AES_GCMSIV_ctr_aarch64_128_loop1
+L_AES_GCMSIV_ctr_aarch64_128_done1:
+#endif /* !NO_AES_128 */
+L_AES_GCMSIV_ctr_aarch64_done:
+        mov v15.s[0], w7 /* store the advanced counter word back into the caller's counter block */
+        st1 {v15.2d}, [x5]
+        ldp d8, d9, [x29, #16]
+        ldp d10, d11, [x29, #32]
+        ldp d12, d13, [x29, #48]
+        ldp d14, d15, [x29, #64]
+        ldp x29, x30, [sp], #0x50
+        ret
+#ifndef __APPLE__
+    .size AES_GCMSIV_ctr_aarch64,.-AES_GCMSIV_ctr_aarch64
+#endif /* __APPLE__ */
+#endif /* WOLFSSL_AESGCM_SIV */
 #endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */
 #ifndef WOLFSSL_ARMASM_NO_NEON
 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \
@@ -51290,6 +52349,1226 @@
L_AES_XTS_decrypt_NEON_data_done: #endif /* __APPLE__ */ #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AES_XTS */ +#ifdef WOLFSSL_AESGCM_SIV +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_polyval_neon +.type AES_GCMSIV_polyval_neon,@function +.align 2 +AES_GCMSIV_polyval_neon: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_polyval_neon +.p2align 2 +_AES_GCMSIV_polyval_neon: +#endif /* __APPLE__ */ /* AES_GCMSIV_polyval_neon: x0 = 16-byte accumulator (loaded, updated, stored back), x1 = 16-byte multiplier, x2 = input data consumed 16 bytes per iteration, w3 = number of 16-byte iterations. Each iteration computes acc = (acc ^ block) * multiplier as a carry-less product built from 8-bit pmul on 4-bit nibbles, then folds the 256-bit result back to 128 bits. Register roles are read off the loads and stores below; the C prototype is not visible in this chunk. */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] /* 80-byte frame holds x29/x30 and d8-d15; v8-v15 are overwritten below */ + ld1 {v18.2d}, [x0] + ld1 {v10.2d}, [x1] /* v18 = accumulator from [x0], v10 = multiplier from [x1] */ + movi v19.16b, #15 + eor v20.16b, v20.16b, v20.16b /* v19 = 0x0f nibble mask, v20 = all zero (shift-in source for ext) */ + rev64 v10.16b, v10.16b + ext v10.16b, v10.16b, v10.16b, #8 + rbit v10.16b, v10.16b + rbit v18.16b, v18.16b /* multiplier is byte-reversed (rev64 + ext #8) and bit-reversed per byte, i.e. fully bit-reflected; the accumulator only gets the per-byte rbit */ + and v12.16b, v10.16b, v19.16b + ushr v13.16b, v10.16b, #4 + eor v14.16b, v12.16b, v13.16b /* multiplier split into low nibbles (v12), high nibbles (v13) and their XOR (v14): every pmul operand is 4 bits wide, so each product fits in 8 bits */ + cbz w3, L_AES_GCMSIV_polyval_neon_done /* zero iterations: skip straight to the store */ +L_AES_GCMSIV_polyval_neon_loop: + ld1 {v0.16b}, [x2], #16 + rev64 v0.16b, v0.16b + ext v0.16b, v0.16b, v0.16b, #8 + rbit v0.16b, v0.16b + eor v18.16b, v18.16b, v0.16b /* next 16 bytes from [x2] (post-incremented), bit-reflected like the multiplier, XORed into the accumulator */ + and v15.16b, v18.16b, v19.16b + ushr v16.16b, v18.16b, #4 + eor v17.16b, v15.16b, v16.16b /* same nibble split for the accumulator: v15 = low, v16 = high, v17 = low ^ high */ + dup v0.16b, v12.b[0] + dup v2.16b, v14.b[0] + dup v1.16b, v13.b[0] + pmul v8.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v8.16b + eor v5.16b, v5.16b, v4.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v8.16b, v8.16b, v6.16b + eor v11.16b, v4.16b, v7.16b /* multiplier byte 0, Karatsuba on nibbles: v8 = lo*lo, v4 = hi*hi, v5 = (lo^hi)*(lo^hi) ^ lo*lo ^ hi*hi is the middle term; middle << 4 joins the low byte, hi*hi ^ (middle >> 4) is carried in v11 to the next byte position */ + dup v0.16b, v12.b[1] + dup v2.16b, v14.b[1] + dup v1.16b, v13.b[1] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v6.16b, v20.16b, v3.16b, #15 + ext v9.16b, v3.16b, v20.16b, #15 + eor v8.16b, v8.16b, v6.16b /* multiplier bytes 1..15 repeat this step: the partial product for byte i is moved up i bytes with ext against the zero vector, the in-range part is XORed into v8 (low 128 bits) and the bytes shifted out go to v9 (high 128 bits, first written here) */ + dup v0.16b, v12.b[2] + dup v2.16b, v14.b[2] + dup
v1.16b, v13.b[2] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #14 + ext v6.16b, v20.16b, v3.16b, #14 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[3] + dup v2.16b, v14.b[3] + dup v1.16b, v13.b[3] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #13 + ext v6.16b, v20.16b, v3.16b, #13 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[4] + dup v2.16b, v14.b[4] + dup v1.16b, v13.b[4] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #12 + ext v6.16b, v20.16b, v3.16b, #12 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[5] + dup v2.16b, v14.b[5] + dup v1.16b, v13.b[5] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #11 + ext v6.16b, v20.16b, v3.16b, #11 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[6] + dup v2.16b, v14.b[6] + dup v1.16b, v13.b[6] + pmul v3.16b, v15.16b, v0.16b
+ pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #10 + ext v6.16b, v20.16b, v3.16b, #10 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[7] + dup v2.16b, v14.b[7] + dup v1.16b, v13.b[7] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #9 + ext v6.16b, v20.16b, v3.16b, #9 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[8] + dup v2.16b, v14.b[8] + dup v1.16b, v13.b[8] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #8 + ext v6.16b, v20.16b, v3.16b, #8 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[9] + dup v2.16b, v14.b[9] + dup v1.16b, v13.b[9] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #7 + ext v6.16b, v20.16b, v3.16b, #7 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[10] + dup v2.16b, v14.b[10] + dup v1.16b, v13.b[10] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b,
v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #6 + ext v6.16b, v20.16b, v3.16b, #6 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[11] + dup v2.16b, v14.b[11] + dup v1.16b, v13.b[11] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #5 + ext v6.16b, v20.16b, v3.16b, #5 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[12] + dup v2.16b, v14.b[12] + dup v1.16b, v13.b[12] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #4 + ext v6.16b, v20.16b, v3.16b, #4 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[13] + dup v2.16b, v14.b[13] + dup v1.16b, v13.b[13] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #3 + ext v6.16b, v20.16b, v3.16b, #3 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[14] + dup v2.16b, v14.b[14] + dup v1.16b, v13.b[14] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b,
v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #2 + ext v6.16b, v20.16b, v3.16b, #2 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[15] + dup v2.16b, v14.b[15] + dup v1.16b, v13.b[15] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #1 + ext v6.16b, v20.16b, v3.16b, #1 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + eor v9.16b, v9.16b, v11.16b /* carry left over from multiplier byte 15 belongs entirely to the high half */ + shl v0.16b, v9.16b, #1 + shl v1.16b, v9.16b, #2 + shl v2.16b, v9.16b, #7 + ushr v3.16b, v9.16b, #7 + ushr v4.16b, v9.16b, #6 + ushr v5.16b, v9.16b, #1 + eor v0.16b, v0.16b, v9.16b + eor v1.16b, v1.16b, v2.16b + eor v0.16b, v0.16b, v1.16b + eor v8.16b, v8.16b, v0.16b + ext v0.16b, v20.16b, v3.16b, #15 + ext v1.16b, v20.16b, v4.16b, #15 + ext v2.16b, v20.16b, v5.16b, #15 + ext v4.16b, v4.16b, v20.16b, #15 + ext v5.16b, v5.16b, v20.16b, #15 + eor v0.16b, v0.16b, v1.16b + eor v8.16b, v8.16b, v2.16b + eor v8.16b, v8.16b, v0.16b + eor v3.16b, v4.16b, v5.16b + shl v0.2d, v3.2d, #1 + shl v1.2d, v3.2d, #2 + shl v2.2d, v3.2d, #7 + eor v3.16b, v3.16b, v0.16b + eor v1.16b, v1.16b, v2.16b + eor v8.16b, v8.16b, v3.16b + eor v18.16b, v8.16b, v1.16b /* fold the high 128 bits (v9) into the low 128 bits (v8): v9 and v9 shifted left by 1, 2 and 7 (per-byte shl plus the ushr/ext byte carries) are XORed in, then the bits pushed out of the top byte get the same treatment once more; the result is the new accumulator v18. NOTE(review): presumably the POLYVAL field reduction of RFC 8452 on the bit-reflected value - confirm the polynomial against the spec */ + subs w3, w3, #1 + bne L_AES_GCMSIV_polyval_neon_loop /* one 16-byte block per iteration, w3 counts down to zero */ +L_AES_GCMSIV_polyval_neon_done: + rbit v18.16b, v18.16b + st1 {v18.2d}, [x0] /* undo the per-byte bit reversal applied on entry and write the accumulator back to [x0] */ + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 /* restore d8-d15 and x29/x30, release the 80-byte frame */ + ret +#ifndef __APPLE__ + .size AES_GCMSIV_polyval_neon,.-AES_GCMSIV_polyval_neon +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .section .rodata + .type
L_AES_GCMSIV_ctr_neon_te, %object + .size L_AES_GCMSIV_ctr_neon_te, 256 +#else + .section __DATA,__data +#endif /* __APPLE__ */ + # 8-byte aligned, 64-bit aligned +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_AES_GCMSIV_ctr_neon_te: /* 256-byte lookup table whose values are those of the AES forward S-box (0x63, 0x7c, ... 0xbb, 0x16). AES_GCMSIV_ctr_neon loads all 256 bytes into v16-v31 through x6 and indexes them with tbl, 64 bytes per lookup */ + .byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5 + .byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 + .byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0 + .byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 + .byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc + .byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 + .byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a + .byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 + .byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0 + .byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 + .byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b + .byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf + .byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85 + .byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 + .byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5 + .byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 + .byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17 + .byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 + .byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88 + .byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb + .byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c + .byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 + .byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9 + .byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 + .byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6 + .byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a + .byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e + .byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e + .byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94 + .byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf + .byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68 + .byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +#ifndef __APPLE__ + .text + .section .rodata + .type L_AES_GCMSIV_ctr_neon_shuffle, %object + .size L_AES_GCMSIV_ctr_neon_shuffle, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ + #
8-byte aligned, 64-bit aligned +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_AES_GCMSIV_ctr_neon_shuffle: /* 16-byte tbl index vector: AES_GCMSIV_ctr_neon reads it through x7 and applies it to each state register right after the S-box lookups. NOTE(review): presumably the AES ShiftRows permutation expressed for the rev32 byte-swapped state - confirm against the generator script */ + .byte 0x0c,0x09,0x06,0x03,0x00,0x0d,0x0a,0x07 + .byte 0x04,0x01,0x0e,0x0b,0x08,0x05,0x02,0x0f +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_ctr_neon +.type AES_GCMSIV_ctr_neon,@function +.align 2 +AES_GCMSIV_ctr_neon: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_ctr_neon +.p2align 2 +_AES_GCMSIV_ctr_neon: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x6, L_AES_GCMSIV_ctr_neon_te + add x6, x6, :lo12:L_AES_GCMSIV_ctr_neon_te +#else + adrp x6, L_AES_GCMSIV_ctr_neon_te@PAGE + add x6, x6, L_AES_GCMSIV_ctr_neon_te@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x7, L_AES_GCMSIV_ctr_neon_shuffle + add x7, x7, :lo12:L_AES_GCMSIV_ctr_neon_shuffle +#else + adrp x7, L_AES_GCMSIV_ctr_neon_shuffle@PAGE + add x7, x7, L_AES_GCMSIV_ctr_neon_shuffle@PAGEOFF +#endif /* __APPLE__ */ + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x6], #0x40 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x6], #0x40 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x6], #0x40 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x6] + ldr w10, [x5] + cmp x2, #0x40 + blt L_AES_GCMSIV_ctr_neon_start_2 +L_AES_GCMSIV_ctr_neon_loop_4: + mov x9, x3 + ld1 {v4.2d}, [x9], #16 + # Round: 0 - build counters and XOR in key schedule + ld1 {v0.2d}, [x5] + mov v0.s[0], w10 + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v4.16b + ld1 {v1.2d}, [x5] + add w8, w10, #1 + mov v1.s[0], w8 + rev32 v1.16b, v1.16b + eor v1.16b, v1.16b, v4.16b + ld1 {v2.2d}, [x5] + add w8, w10, #2 + mov v2.s[0], w8 + rev32 v2.16b, v2.16b + eor v2.16b, v2.16b, v4.16b + ld1 {v3.2d}, [x5] + add w8, w10, #3 + mov v3.s[0], w8 + rev32 v3.16b, v3.16b + eor v3.16b, v3.16b, v4.16b + add w10, w10, #4 + sub w8, w4, #2 +L_AES_GCMSIV_ctr_neon_loop_nr_4: + tbl v4.16b,
{v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x7] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, 
v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + movi v0.16b, #27 + and v8.16b, v8.16b, v0.16b + and v9.16b, v9.16b, v0.16b + and v10.16b, v10.16b, v0.16b + and v11.16b, v11.16b, v0.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + eor v2.16b, v10.16b, v6.16b + eor v3.16b, v11.16b, v7.16b + shl v12.4s, v0.4s, #8 + shl v13.4s, v1.4s, #8 + shl v14.4s, v2.4s, #8 + shl v15.4s, v3.4s, #8 + sri v12.4s, v0.4s, #24 + sri v13.4s, v1.4s, #24 + sri v14.4s, v2.4s, #24 + sri v15.4s, v3.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + shl v2.4s, v6.4s, #24 + shl v3.4s, v7.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + sri v2.4s, v6.4s, #8 + sri v3.4s, v7.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + rev32 v6.8h, v6.8h + rev32 v7.8h, v7.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x9], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + # Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, 
v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor v10.16b, v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x7] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b + sshr v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + sshr v10.16b, v2.16b, #7 + sshr v11.16b, v3.16b, #7 + shl v12.16b, v0.16b, #1 + shl v13.16b, v1.16b, #1 + shl v14.16b, v2.16b, #1 + shl v15.16b, v3.16b, #1 + movi v4.16b, #27 + and v8.16b, v8.16b, v4.16b + and v9.16b, v9.16b, v4.16b + and v10.16b, v10.16b, v4.16b + and v11.16b, v11.16b, v4.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + eor v4.16b, v8.16b, v0.16b + eor v5.16b, v9.16b, v1.16b + eor v6.16b, v10.16b, v2.16b + eor v7.16b, v11.16b, v3.16b + shl v12.4s, v4.4s, #8 + shl v13.4s, v5.4s, 
#8 + shl v14.4s, v6.4s, #8 + shl v15.4s, v7.4s, #8 + sri v12.4s, v4.4s, #24 + sri v13.4s, v5.4s, #24 + sri v14.4s, v6.4s, #24 + sri v15.4s, v7.4s, #24 + shl v4.4s, v0.4s, #24 + shl v5.4s, v1.4s, #24 + shl v6.4s, v2.4s, #24 + shl v7.4s, v3.4s, #24 + sri v4.4s, v0.4s, #8 + sri v5.4s, v1.4s, #8 + sri v6.4s, v2.4s, #8 + sri v7.4s, v3.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + rev32 v2.8h, v2.8h + rev32 v3.8h, v3.8h + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + eor v2.16b, v2.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + # Round Done + subs w8, w8, #2 + bne L_AES_GCMSIV_ctr_neon_loop_nr_4 + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, 
v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x7] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + movi v0.16b, #27 + and v8.16b, v8.16b, v0.16b + and v9.16b, v9.16b, v0.16b + and v10.16b, v10.16b, v0.16b + and v11.16b, v11.16b, v0.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + eor v2.16b, v10.16b, v6.16b + eor v3.16b, v11.16b, v7.16b + shl v12.4s, v0.4s, #8 + shl v13.4s, v1.4s, #8 + shl v14.4s, v2.4s, #8 + shl v15.4s, v3.4s, #8 + sri v12.4s, v0.4s, #24 + sri v13.4s, v1.4s, #24 + sri v14.4s, v2.4s, #24 + sri v15.4s, v3.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + shl v2.4s, v6.4s, #24 + shl v3.4s, v7.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + sri v2.4s, v6.4s, #8 + sri v3.4s, v7.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + rev32 v6.8h, v6.8h + rev32 v7.8h, v7.8h + eor v4.16b, v4.16b, v0.16b + eor 
v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x9], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + # Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor v10.16b, v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, 
v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x7] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #0x40 + sub x2, x2, #0x40 + cmp x2, #0x40 + bge L_AES_GCMSIV_ctr_neon_loop_4 +L_AES_GCMSIV_ctr_neon_start_2: + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + movi v15.16b, #27 + cmp x2, #16 + beq L_AES_GCMSIV_ctr_neon_start_1 + blt L_AES_GCMSIV_ctr_neon_data_done +L_AES_GCMSIV_ctr_neon_loop_2: + mov x9, x3 + ld1 {v4.2d}, [x9], #16 + # Round: 0 - build counters and XOR in key schedule + ld1 {v0.2d}, [x5] + mov v0.s[0], w10 + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v4.16b + ld1 {v1.2d}, [x5] + add w8, w10, #1 + mov v1.s[0], w8 + rev32 v1.16b, v1.16b + eor v1.16b, v1.16b, v4.16b + add w10, w10, #2 + sub w8, w4, #2 +L_AES_GCMSIV_ctr_neon_loop_nr_2: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, 
v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x7] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + shl v10.16b, v4.16b, #1 + shl v11.16b, v5.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b, v9.16b, v11.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + shl v10.4s, v0.4s, #8 + shl v11.4s, v1.4s, #8 + sri v10.4s, v0.4s, #24 + sri v11.4s, v1.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x9], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # Round Done + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, 
v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x7] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + sshr v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + shl v10.16b, v0.16b, #1 + shl v11.16b, v1.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b, v9.16b, v11.16b + eor v4.16b, v8.16b, v0.16b + eor v5.16b, v9.16b, v1.16b + shl v10.4s, v4.4s, #8 + shl v11.4s, v5.4s, #8 + sri v10.4s, v4.4s, #24 + sri v11.4s, v5.4s, #24 + shl v4.4s, v0.4s, #24 + shl v5.4s, v1.4s, #24 + sri v4.4s, v0.4s, #8 + sri v5.4s, v1.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v0.16b, v0.16b, v10.16b + eor v1.16b, v1.16b, v11.16b + # Round Done + subs w8, w8, #2 + bne L_AES_GCMSIV_ctr_neon_loop_nr_2 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x7] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + 
sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + shl v10.16b, v4.16b, #1 + shl v11.16b, v5.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b, v9.16b, v11.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + shl v10.4s, v0.4s, #8 + shl v11.4s, v1.4s, #8 + sri v10.4s, v0.4s, #24 + sri v11.4s, v1.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x9], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # Round Done + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x7] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + ld1 {v4.16b, v5.16b}, [x0], #32 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + st1 
{v0.16b, v1.16b}, [x1], #32 + sub x2, x2, #32 + cmp x2, #0 + beq L_AES_GCMSIV_ctr_neon_data_done +L_AES_GCMSIV_ctr_neon_start_1: + ld1 {v3.2d}, [x7] + mov x9, x3 + ld1 {v4.2d}, [x9], #16 + # Round: 0 - build counter and XOR in key schedule + ld1 {v0.2d}, [x5] + mov v0.s[0], w10 + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v4.16b + add w10, w10, #1 + sub w8, w4, #2 +L_AES_GCMSIV_ctr_neon_loop_nr_1: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v0.2d}, [x9], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v0.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x9], #16 + sshr v10.16b, v0.16b, #7 + shl v9.16b, v0.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v0.8h + eor v11.16b, v10.16b, v0.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v0.4s, #24 + shl v8.4s, v11.4s, 
#8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v4.16b + sri v9.4s, v0.4s, #8 + sri v8.4s, v11.4s, #24 + eor v0.16b, v10.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + subs w8, w8, #2 + bne L_AES_GCMSIV_ctr_neon_loop_nr_1 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v0.2d}, [x9], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v0.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x9], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + rev32 v0.16b, v0.16b + ld1 {v4.16b}, [x0], #16 + eor v0.16b, v0.16b, v4.16b + st1 {v0.16b}, [x1], #16 +L_AES_GCMSIV_ctr_neon_data_done: + str w10, [x5] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size AES_GCMSIV_ctr_neon,.-AES_GCMSIV_ctr_neon +#endif 
/* __APPLE__ */ +#endif /* WOLFSSL_AESGCM_SIV */ #endif /* !WOLFSSL_ARMASM_NO_NEON */ #ifndef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP #ifdef HAVE_AES_DECRYPT @@ -56786,6 +59065,840 @@ L_AES_XTS_decrypt_done_data: #endif /* __APPLE__ */ #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AES_XTS */ +#ifdef WOLFSSL_AESGCM_SIV +#ifndef __APPLE__ + .text + .section .rodata + .type L_AES_GCMSIV_polyval_base_r, %object + .size L_AES_GCMSIV_polyval_base_r, 128 +#else + .section __DATA,__data +#endif /* __APPLE__ */ + # 16-byte aligned, 128-bit aligned +#ifndef __APPLE__ + .align 4 +#else + .p2align 4 +#endif /* __APPLE__ */ +L_AES_GCMSIV_polyval_base_r: + .quad 0x0000000000000000,0x1c20000000000000 + .quad 0x3840000000000000,0x2460000000000000 + .quad 0x7080000000000000,0x6ca0000000000000 + .quad 0x48c0000000000000,0x54e0000000000000 + .quad 0xe100000000000000,0xfd20000000000000 + .quad 0xd940000000000000,0xc560000000000000 + .quad 0x9180000000000000,0x8da0000000000000 + .quad 0xa9c0000000000000,0xb5e0000000000000 +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_polyval_base +.type AES_GCMSIV_polyval_base,@function +.align 2 +AES_GCMSIV_polyval_base: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_polyval_base +.p2align 2 +_AES_GCMSIV_polyval_base: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x15, L_AES_GCMSIV_polyval_base_r + add x15, x15, :lo12:L_AES_GCMSIV_polyval_base_r +#else + adrp x15, L_AES_GCMSIV_polyval_base_r@PAGE + add x15, x15, L_AES_GCMSIV_polyval_base_r@PAGEOFF +#endif /* __APPLE__ */ + cbz w3, L_AES_GCMSIV_polyval_base_done +L_AES_GCMSIV_polyval_base_loop: + ldp x6, x7, [x2] + ldp x4, x5, [x0] + rev x6, x6 + rev x7, x7 + eor x4, x4, x7 + eor x5, x5, x6 + eor x8, x8, x8 + eor x9, x9, x9 + ubfx x12, x5, #56, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #60, #4 + add x14, x1, x12, 
lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #48, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #52, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #40, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #44, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #32, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #36, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #24, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #28, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, 
x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #16, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #20, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #8, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #12, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #0, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x5, #4, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #56, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #60, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #48, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, 
x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #52, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #40, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #44, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #32, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #36, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #24, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #28, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #16, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + 
eor x8, x8, x10 + ubfx x12, x4, #20, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #8, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #12, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #0, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + and x13, x9, #15 + lsr x9, x9, #4 + orr x9, x9, x8, lsl 60 + lsr x8, x8, #4 + ldr x10, [x15, x13, LSL 3] + eor x8, x8, x10 + ubfx x12, x4, #4, #4 + add x14, x1, x12, lsl 4 + ldp x10, x11, [x14] + eor x8, x8, x10 + eor x9, x9, x11 + rev x8, x8 + rev x9, x9 + stp x8, x9, [x0] + subs w3, w3, #1 + add x2, x2, #16 + bne L_AES_GCMSIV_polyval_base_loop +L_AES_GCMSIV_polyval_base_done: + ret +#ifndef __APPLE__ + .size AES_GCMSIV_polyval_base,.-AES_GCMSIV_polyval_base +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .text + .section .rodata + .type L_AES_GCMSIV_ctr_base_te, %object + .size L_AES_GCMSIV_ctr_base_te, 1024 +#else + .section __DATA,__data +#endif /* __APPLE__ */ + # 8-byte aligned, 64-bit aligned +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_AES_GCMSIV_ctr_base_te: + .long 0xa5c66363,0x84f87c7c,0x99ee7777,0x8df67b7b + .long 0x0dfff2f2,0xbdd66b6b,0xb1de6f6f,0x5491c5c5 + .long 0x50603030,0x03020101,0xa9ce6767,0x7d562b2b + .long 0x19e7fefe,0x62b5d7d7,0xe64dabab,0x9aec7676 + .long 0x458fcaca,0x9d1f8282,0x4089c9c9,0x87fa7d7d + .long 0x15effafa,0xebb25959,0xc98e4747,0x0bfbf0f0 + .long 0xec41adad,0x67b3d4d4,0xfd5fa2a2,0xea45afaf + 
.long 0xbf239c9c,0xf753a4a4,0x96e47272,0x5b9bc0c0 + .long 0xc275b7b7,0x1ce1fdfd,0xae3d9393,0x6a4c2626 + .long 0x5a6c3636,0x417e3f3f,0x02f5f7f7,0x4f83cccc + .long 0x5c683434,0xf451a5a5,0x34d1e5e5,0x08f9f1f1 + .long 0x93e27171,0x73abd8d8,0x53623131,0x3f2a1515 + .long 0x0c080404,0x5295c7c7,0x65462323,0x5e9dc3c3 + .long 0x28301818,0xa1379696,0x0f0a0505,0xb52f9a9a + .long 0x090e0707,0x36241212,0x9b1b8080,0x3ddfe2e2 + .long 0x26cdebeb,0x694e2727,0xcd7fb2b2,0x9fea7575 + .long 0x1b120909,0x9e1d8383,0x74582c2c,0x2e341a1a + .long 0x2d361b1b,0xb2dc6e6e,0xeeb45a5a,0xfb5ba0a0 + .long 0xf6a45252,0x4d763b3b,0x61b7d6d6,0xce7db3b3 + .long 0x7b522929,0x3edde3e3,0x715e2f2f,0x97138484 + .long 0xf5a65353,0x68b9d1d1,0x00000000,0x2cc1eded + .long 0x60402020,0x1fe3fcfc,0xc879b1b1,0xedb65b5b + .long 0xbed46a6a,0x468dcbcb,0xd967bebe,0x4b723939 + .long 0xde944a4a,0xd4984c4c,0xe8b05858,0x4a85cfcf + .long 0x6bbbd0d0,0x2ac5efef,0xe54faaaa,0x16edfbfb + .long 0xc5864343,0xd79a4d4d,0x55663333,0x94118585 + .long 0xcf8a4545,0x10e9f9f9,0x06040202,0x81fe7f7f + .long 0xf0a05050,0x44783c3c,0xba259f9f,0xe34ba8a8 + .long 0xf3a25151,0xfe5da3a3,0xc0804040,0x8a058f8f + .long 0xad3f9292,0xbc219d9d,0x48703838,0x04f1f5f5 + .long 0xdf63bcbc,0xc177b6b6,0x75afdada,0x63422121 + .long 0x30201010,0x1ae5ffff,0x0efdf3f3,0x6dbfd2d2 + .long 0x4c81cdcd,0x14180c0c,0x35261313,0x2fc3ecec + .long 0xe1be5f5f,0xa2359797,0xcc884444,0x392e1717 + .long 0x5793c4c4,0xf255a7a7,0x82fc7e7e,0x477a3d3d + .long 0xacc86464,0xe7ba5d5d,0x2b321919,0x95e67373 + .long 0xa0c06060,0x98198181,0xd19e4f4f,0x7fa3dcdc + .long 0x66442222,0x7e542a2a,0xab3b9090,0x830b8888 + .long 0xca8c4646,0x29c7eeee,0xd36bb8b8,0x3c281414 + .long 0x79a7dede,0xe2bc5e5e,0x1d160b0b,0x76addbdb + .long 0x3bdbe0e0,0x56643232,0x4e743a3a,0x1e140a0a + .long 0xdb924949,0x0a0c0606,0x6c482424,0xe4b85c5c + .long 0x5d9fc2c2,0x6ebdd3d3,0xef43acac,0xa6c46262 + .long 0xa8399191,0xa4319595,0x37d3e4e4,0x8bf27979 + .long 0x32d5e7e7,0x438bc8c8,0x596e3737,0xb7da6d6d + .long 
0x8c018d8d,0x64b1d5d5,0xd29c4e4e,0xe049a9a9 + .long 0xb4d86c6c,0xfaac5656,0x07f3f4f4,0x25cfeaea + .long 0xafca6565,0x8ef47a7a,0xe947aeae,0x18100808 + .long 0xd56fbaba,0x88f07878,0x6f4a2525,0x725c2e2e + .long 0x24381c1c,0xf157a6a6,0xc773b4b4,0x5197c6c6 + .long 0x23cbe8e8,0x7ca1dddd,0x9ce87474,0x213e1f1f + .long 0xdd964b4b,0xdc61bdbd,0x860d8b8b,0x850f8a8a + .long 0x90e07070,0x427c3e3e,0xc471b5b5,0xaacc6666 + .long 0xd8904848,0x05060303,0x01f7f6f6,0x121c0e0e + .long 0xa3c26161,0x5f6a3535,0xf9ae5757,0xd069b9b9 + .long 0x91178686,0x5899c1c1,0x273a1d1d,0xb9279e9e + .long 0x38d9e1e1,0x13ebf8f8,0xb32b9898,0x33221111 + .long 0xbbd26969,0x70a9d9d9,0x89078e8e,0xa7339494 + .long 0xb62d9b9b,0x223c1e1e,0x92158787,0x20c9e9e9 + .long 0x4987cece,0xffaa5555,0x78502828,0x7aa5dfdf + .long 0x8f038c8c,0xf859a1a1,0x80098989,0x171a0d0d + .long 0xda65bfbf,0x31d7e6e6,0xc6844242,0xb8d06868 + .long 0xc3824141,0xb0299999,0x775a2d2d,0x111e0f0f + .long 0xcb7bb0b0,0xfca85454,0xd66dbbbb,0x3a2c1616 +#ifndef __APPLE__ +.text +.globl AES_GCMSIV_ctr_base +.type AES_GCMSIV_ctr_base,@function +.align 2 +AES_GCMSIV_ctr_base: +#else +.section __TEXT,__text +.globl _AES_GCMSIV_ctr_base +.p2align 2 +_AES_GCMSIV_ctr_base: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-64]! 
+ add x29, sp, #0 + stp x17, x19, [x29, #24] + stp x20, x21, [x29, #40] + str x22, [x29, #56] +#ifndef __APPLE__ + adrp x6, L_AES_GCMSIV_ctr_base_te + add x6, x6, :lo12:L_AES_GCMSIV_ctr_base_te +#else + adrp x6, L_AES_GCMSIV_ctr_base_te@PAGE + add x6, x6, L_AES_GCMSIV_ctr_base_te@PAGEOFF +#endif /* __APPLE__ */ + ldp x15, x16, [x5] + mov w17, w15 + cbz x2, L_AES_GCMSIV_ctr_base_done +L_AES_GCMSIV_ctr_base_loop_block: + mov x22, x3 + ldp x11, x12, [x22], #16 + # Round: 0 - set counter, XOR in key schedule + bfi x15, x17, #0, #32 + rev32 x7, x15 + rev32 x8, x16 + eor x7, x7, x11 + eor x8, x8, x12 + sub w21, w4, #2 +L_AES_GCMSIV_ctr_base_loop_nr: + ubfx x11, x7, #48, #8 + ubfx x14, x7, #24, #8 + ubfx x19, x8, #8, #8 + ubfx x20, x8, #32, #8 + ldr x9, [x6] + ldr x9, [x6, #64] + ldr x9, [x6, #128] + ldr x9, [x6, #192] + ldr x9, [x6, #256] + ldr x9, [x6, #320] + ldr x9, [x6, #384] + ldr x9, [x6, #448] + ldr x9, [x6, #512] + ldr x9, [x6, #576] + ldr x9, [x6, #640] + ldr x9, [x6, #704] + ldr x9, [x6, #768] + ldr x9, [x6, #832] + ldr x9, [x6, #896] + ldr x9, [x6, #960] + ldr w11, [x6, x11, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w19, [x6, x19, LSL 2] + ldr w20, [x6, x20, LSL 2] + ubfx x12, x8, #16, #8 + eor w11, w11, w14, ror 24 + ubfx x14, x7, #56, #8 + eor w11, w11, w19, ror 8 + ubfx x19, x8, #40, #8 + eor w11, w11, w20, ror 16 + ubfx x20, x7, #0, #8 + ldr w12, [x6, x12, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w19, [x6, x19, LSL 2] + ldr w20, [x6, x20, LSL 2] + ubfx x13, x8, #48, #8 + eor w12, w12, w14, ror 24 + ubfx x14, x8, #24, #8 + eor w12, w12, w19, ror 8 + ubfx x19, x7, #8, #8 + eor w12, w12, w20, ror 16 + ubfx x20, x7, #32, #8 + bfi x11, x12, #32, #32 + ldr w13, [x6, x13, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w19, [x6, x19, LSL 2] + ldr w20, [x6, x20, LSL 2] + ubfx x9, x8, #0, #8 + eor w13, w13, w14, ror 24 + ubfx x14, x7, #16, #8 + eor w13, w13, w19, ror 8 + ubfx x19, x8, #56, #8 + eor w12, w13, w20, ror 16 + ubfx x20, x7, #40, #8 + ldr w9, [x6, x9, LSL 2] + 
ldr w19, [x6, x19, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w20, [x6, x20, LSL 2] + eor w19, w19, w9, ror 24 + ldp x7, x8, [x22], #16 + eor w14, w14, w19, ror 24 + eor w14, w14, w20, ror 8 + bfi x12, x14, #32, #32 + # XOR in Key Schedule + eor x11, x11, x7 + eor x12, x12, x8 + ubfx x7, x11, #48, #8 + ubfx x10, x11, #24, #8 + ubfx x19, x12, #8, #8 + ubfx x20, x12, #32, #8 + ldr x13, [x6] + ldr x13, [x6, #64] + ldr x13, [x6, #128] + ldr x13, [x6, #192] + ldr x13, [x6, #256] + ldr x13, [x6, #320] + ldr x13, [x6, #384] + ldr x13, [x6, #448] + ldr x13, [x6, #512] + ldr x13, [x6, #576] + ldr x13, [x6, #640] + ldr x13, [x6, #704] + ldr x13, [x6, #768] + ldr x13, [x6, #832] + ldr x13, [x6, #896] + ldr x13, [x6, #960] + ldr w7, [x6, x7, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w19, [x6, x19, LSL 2] + ldr w20, [x6, x20, LSL 2] + ubfx x8, x12, #16, #8 + eor w7, w7, w10, ror 24 + ubfx x10, x11, #56, #8 + eor w7, w7, w19, ror 8 + ubfx x19, x12, #40, #8 + eor w7, w7, w20, ror 16 + ubfx x20, x11, #0, #8 + ldr w8, [x6, x8, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w19, [x6, x19, LSL 2] + ldr w20, [x6, x20, LSL 2] + ubfx x9, x12, #48, #8 + eor w8, w8, w10, ror 24 + ubfx x10, x12, #24, #8 + eor w8, w8, w19, ror 8 + ubfx x19, x11, #8, #8 + eor w8, w8, w20, ror 16 + ubfx x20, x11, #32, #8 + bfi x7, x8, #32, #32 + ldr w9, [x6, x9, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w19, [x6, x19, LSL 2] + ldr w20, [x6, x20, LSL 2] + ubfx x13, x12, #0, #8 + eor w9, w9, w10, ror 24 + ubfx x10, x11, #16, #8 + eor w9, w9, w19, ror 8 + ubfx x19, x12, #56, #8 + eor w8, w9, w20, ror 16 + ubfx x20, x11, #40, #8 + ldr w13, [x6, x13, LSL 2] + ldr w19, [x6, x19, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w20, [x6, x20, LSL 2] + eor w19, w19, w13, ror 24 + ldp x11, x12, [x22], #16 + eor w10, w10, w19, ror 24 + eor w10, w10, w20, ror 8 + bfi x8, x10, #32, #32 + # XOR in Key Schedule + eor x7, x7, x11 + eor x8, x8, x12 + subs w21, w21, #2 + bne L_AES_GCMSIV_ctr_base_loop_nr + ubfx x11, x7, #48, #8 + ubfx x14, x7, 
#24, #8 + ubfx x19, x8, #8, #8 + ubfx x20, x8, #32, #8 + ldr x9, [x6] + ldr x9, [x6, #64] + ldr x9, [x6, #128] + ldr x9, [x6, #192] + ldr x9, [x6, #256] + ldr x9, [x6, #320] + ldr x9, [x6, #384] + ldr x9, [x6, #448] + ldr x9, [x6, #512] + ldr x9, [x6, #576] + ldr x9, [x6, #640] + ldr x9, [x6, #704] + ldr x9, [x6, #768] + ldr x9, [x6, #832] + ldr x9, [x6, #896] + ldr x9, [x6, #960] + ldr w11, [x6, x11, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w19, [x6, x19, LSL 2] + ldr w20, [x6, x20, LSL 2] + ubfx x12, x8, #16, #8 + eor w11, w11, w14, ror 24 + ubfx x14, x7, #56, #8 + eor w11, w11, w19, ror 8 + ubfx x19, x8, #40, #8 + eor w11, w11, w20, ror 16 + ubfx x20, x7, #0, #8 + ldr w12, [x6, x12, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w19, [x6, x19, LSL 2] + ldr w20, [x6, x20, LSL 2] + ubfx x13, x8, #48, #8 + eor w12, w12, w14, ror 24 + ubfx x14, x8, #24, #8 + eor w12, w12, w19, ror 8 + ubfx x19, x7, #8, #8 + eor w12, w12, w20, ror 16 + ubfx x20, x7, #32, #8 + bfi x11, x12, #32, #32 + ldr w13, [x6, x13, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w19, [x6, x19, LSL 2] + ldr w20, [x6, x20, LSL 2] + ubfx x9, x8, #0, #8 + eor w13, w13, w14, ror 24 + ubfx x14, x7, #16, #8 + eor w13, w13, w19, ror 8 + ubfx x19, x8, #56, #8 + eor w12, w13, w20, ror 16 + ubfx x20, x7, #40, #8 + ldr w9, [x6, x9, LSL 2] + ldr w19, [x6, x19, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w20, [x6, x20, LSL 2] + eor w19, w19, w9, ror 24 + ldp x7, x8, [x22], #16 + eor w14, w14, w19, ror 24 + eor w14, w14, w20, ror 8 + bfi x12, x14, #32, #32 + # XOR in Key Schedule + eor x11, x11, x7 + eor x12, x12, x8 + ubfx x7, x12, #32, #8 + ubfx x10, x12, #8, #8 + ubfx x19, x11, #48, #8 + ubfx x20, x11, #24, #8 + lsl w7, w7, #2 + lsl w10, w10, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldr x14, [x6] + ldr x14, [x6, #64] + ldr x14, [x6, #128] + ldr x14, [x6, #192] + ldr x14, [x6, #256] + ldr x14, [x6, #320] + ldr x14, [x6, #384] + ldr x14, [x6, #448] + ldr x14, [x6, #512] + ldr x14, [x6, #576] + ldr x14, [x6, #640] + ldr x14, 
[x6, #704] + ldr x14, [x6, #768] + ldr x14, [x6, #832] + ldr x14, [x6, #896] + ldr x14, [x6, #960] + ldrb w7, [x6, x7, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w19, [x6, x19, LSL 0] + ldrb w20, [x6, x20, LSL 0] + ubfx x8, x11, #0, #8 + eor w7, w7, w10, lsl 8 + ubfx x10, x12, #40, #8 + eor w7, w7, w19, lsl 16 + ubfx x19, x12, #16, #8 + eor w7, w7, w20, lsl 24 + ubfx x20, x11, #56, #8 + lsl w8, w8, #2 + lsl w10, w10, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w8, [x6, x8, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w19, [x6, x19, LSL 0] + ldrb w20, [x6, x20, LSL 0] + ubfx x9, x11, #32, #8 + eor w8, w8, w10, lsl 8 + ubfx x10, x11, #8, #8 + eor w8, w8, w19, lsl 16 + ubfx x19, x12, #48, #8 + eor w8, w8, w20, lsl 24 + ubfx x20, x12, #24, #8 + bfi x7, x8, #32, #32 + lsl w9, w9, #2 + lsl w10, w10, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w9, [x6, x9, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w19, [x6, x19, LSL 0] + ldrb w20, [x6, x20, LSL 0] + ubfx x14, x12, #56, #8 + eor w9, w9, w10, lsl 8 + ubfx x10, x12, #0, #8 + eor w9, w9, w19, lsl 16 + ubfx x19, x11, #40, #8 + eor w8, w9, w20, lsl 24 + ubfx x20, x11, #16, #8 + lsl w14, w14, #2 + lsl w10, w10, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w14, [x6, x14, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w19, [x6, x19, LSL 0] + ldrb w20, [x6, x20, LSL 0] + eor w19, w19, w14, lsl 16 + ldp x11, x12, [x22] + eor w10, w10, w19, lsl 8 + eor w10, w10, w20, lsl 16 + bfi x8, x10, #32, #32 + # XOR in Key Schedule + eor x7, x7, x11 + eor x8, x8, x12 + rev32 x7, x7 + rev32 x8, x8 + ldr x11, [x0] + ldr x12, [x0, #8] + eor x7, x7, x11 + eor x8, x8, x12 + str x7, [x1] + str x8, [x1, #8] + add w17, w17, #1 + subs x2, x2, #16 + add x0, x0, #16 + add x1, x1, #16 + bne L_AES_GCMSIV_ctr_base_loop_block +L_AES_GCMSIV_ctr_base_done: + bfi x15, x17, #0, #32 + stp x15, x16, [x5] + ldp x17, x19, [x29, #24] + ldp x20, x21, [x29, #40] + ldr x22, [x29, #56] + ldp x29, x30, [sp], #0x40 + ret +#ifndef __APPLE__ + .size 
AES_GCMSIV_ctr_base,.-AES_GCMSIV_ctr_base +#endif /* __APPLE__ */ +#endif /* WOLFSSL_AESGCM_SIV */ #endif /* !WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP */ #endif /* !defined(NO_AES) && defined(WOLFSSL_ARMASM) */ #endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c index 291d3d1214..2e1394a4b9 100644 --- a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c @@ -3540,6 +3540,8 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg, byte* key, byte* tmp, word32* left, word32 nr) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" "ld1 {v15.2d}, [%x[reg]]\n\t" @@ -5189,6 +5191,7 @@ void AES_CTR_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg, "rev x11, x10\n\t" "rev x12, x9\n\t" "stp x11, x12, [%x[reg]]\n\t" + "ldp x29, x30, [sp], #32\n\t" : [out] "+r" (out), [sz] "+r" (sz), [reg] "+r" (reg), [key] "+r" (key), [tmp] "+r" (tmp), [left] "+r" (left), [nr] "+r" (nr) : [in] "r" (in) @@ -5261,11 +5264,19 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, word32 aadSz, byte* key, byte* gcm_h, byte* tmp, byte* reg, int nr) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %w[nr], [sp, #72]\n\t" + "str %x[reg], [sp, #64]\n\t" + "str %x[tmp], [sp, #56]\n\t" + "str %x[gcm_h], [sp, #48]\n\t" + "str %x[key], [sp, #40]\n\t" + "str %w[aadSz], [sp, #32]\n\t" "movi v27.16b, #0x87\n\t" "eor v26.16b, v26.16b, v26.16b\n\t" "ushr v27.2d, v27.2d, #56\n\t" - "ld1 {v22.2d}, [%x[gcm_h]]\n\t" - "cmp %w[aadSz], #0x40\n\t" + "ld1 {v22.2d}, [x10]\n\t" + "cmp w8, #0x40\n\t" "csetm x16, lt\n\t" "cmp %w[sz], #32\n\t" "csetm x17, lt\n\t" @@ -5280,7 +5291,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v31.1q, v29.2d, v27.2d\n\t" 
"mov v30.d[1], v29.d[0]\n\t" "eor v23.16b, v30.16b, v31.16b\n\t" - "cmp %w[aadSz], #0x100\n\t" + "cmp w8, #0x100\n\t" "csetm x16, lt\n\t" "cmp %w[sz], #0x40\n\t" "csetm x17, lt\n\t" @@ -5311,7 +5322,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v30.d[1], v29.d[0]\n\t" "eor v25.16b, v30.16b, v31.16b\n\t" /* Done */ - "cmp %w[aadSz], #0x400\n\t" + "cmp w8, #0x400\n\t" "csetm x16, lt\n\t" "cmp %w[sz], #0x200\n\t" "csetm x17, lt\n\t" @@ -5368,7 +5379,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done */ "\n" "L_aes_gcm_encrypt_arm64_crypto_h_done_%=:\n\t" - "lsr w14, %w[aadSz], #4\n\t" + "lsr w14, w8, #4\n\t" "cmp w14, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_aad_start_1_%=\n\t" "cmp w14, #16\n\t" @@ -5606,41 +5617,41 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.ne L_aes_gcm_encrypt_arm64_crypto_aad_both_1_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_aad_done_%=:\n\t" - "and w14, %w[aadSz], #15\n\t" + "and w14, w8, #15\n\t" "cbz w14, L_aes_gcm_encrypt_arm64_crypto_aad_partial_done_%=\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov w20, w14\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp w20, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_aad_start_dw_%=\n\t" "ldr x19, [%x[aad]], #8\n\t" "sub w20, w20, #8\n\t" - "str x19, [%x[tmp]], #8\n\t" + "str x19, [x11], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_aad_start_dw_%=:\n\t" "cmp w20, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_aad_start_sw_%=\n\t" "ldr w19, [%x[aad]], #4\n\t" "sub w20, w20, #4\n\t" - "str w19, [%x[tmp]], #4\n\t" + "str w19, [x11], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_aad_start_sw_%=:\n\t" "cmp w20, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_aad_start_byte_%=\n\t" "ldrh w19, [%x[aad]], #2\n\t" "sub w20, w20, #2\n\t" - "strh w19, [%x[tmp]], #2\n\t" + "strh w19, [x11], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_aad_start_byte_%=:\n\t" "cbz w20, 
L_aes_gcm_encrypt_arm64_crypto_aad_end_bytes_%=\n\t" "ldrb w19, [%x[aad]], #1\n\t" "subs w20, w20, #1\n\t" - "strb w19, [%x[tmp]], #1\n\t" + "strb w19, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_aad_start_byte_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_aad_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v18.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v18.2d}, [x11]\n\t" "rbit v18.16b, v18.16b\n\t" "eor v21.16b, v26.16b, v18.16b\n\t" /* X = C * H^1 */ @@ -5705,37 +5716,37 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "cbz x24, L_aes_gcm_encrypt_arm64_crypto_nonce_partial_done_%=\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov w20, w24\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp w20, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_nonce_start_dw_%=\n\t" "ldr x19, [%x[nonce]], #8\n\t" "sub w20, w20, #8\n\t" - "str x19, [%x[tmp]], #8\n\t" + "str x19, [x11], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_nonce_start_dw_%=:\n\t" "cmp w20, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_nonce_start_sw_%=\n\t" "ldr w19, [%x[nonce]], #4\n\t" "sub w20, w20, #4\n\t" - "str w19, [%x[tmp]], #4\n\t" + "str w19, [x11], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_nonce_start_sw_%=:\n\t" "cmp w20, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_nonce_start_byte_%=\n\t" "ldrh w19, [%x[nonce]], #2\n\t" "sub w20, w20, #2\n\t" - "strh w19, [%x[tmp]], #2\n\t" + "strh w19, [x11], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_nonce_start_byte_%=:\n\t" "cbz w20, L_aes_gcm_encrypt_arm64_crypto_nonce_end_bytes_%=\n\t" "ldrb w19, [%x[nonce]], #1\n\t" "subs w20, w20, #1\n\t" - "strb w19, [%x[tmp]], #1\n\t" + "strb w19, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_nonce_start_byte_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_nonce_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x24\n\t" - "ld1 {v18.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x24\n\t" + "ld1 {v18.2d}, [x11]\n\t" "rbit v18.16b, v18.16b\n\t" "eor v21.16b, v13.16b, 
v18.16b\n\t" /* X = C * H^1 */ @@ -5757,7 +5768,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "\n" "L_aes_gcm_encrypt_arm64_crypto_nonce_partial_done_%=:\n\t" "eor x14, x14, x14\n\t" - "ubfiz x24, %x[nonceSz], #3, #32\n\t" + "lsl x24, %x[nonceSz], #3\n\t" "mov v28.d[0], x14\n\t" "mov v28.d[1], x24\n\t" "rev64 v28.16b, v28.16b\n\t" @@ -5781,9 +5792,9 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "rev w15, w15\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_done_nonce_%=:\n\t" - "st1 {v13.2d}, [%x[reg]]\n\t" + "st1 {v13.2d}, [x12]\n\t" "lsr w14, %w[sz], #4\n\t" - "cmp %w[nr], #12\n\t" + "cmp w13, #12\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_start_128_%=\n\t" "b.gt L_aes_gcm_encrypt_arm64_crypto_start_256_%=\n\t" /* AES_GCM_192 */ @@ -5792,7 +5803,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.lt L_aes_gcm_encrypt_arm64_crypto_192_start_4_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -5825,7 +5836,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -5842,7 +5853,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -5859,7 +5870,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" 
"aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -5876,7 +5887,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -5893,7 +5904,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -5910,7 +5921,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -5927,7 +5938,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -5945,7 +5956,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -5962,7 +5973,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -5983,7 +5994,7 @@ void 
AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -6004,7 +6015,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "ld1 {v3.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -6021,7 +6032,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -6046,14 +6057,14 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v1.16b, v1.16b, v9.16b\n\t" "eor v2.16b, v2.16b, v10.16b\n\t" "eor v3.16b, v3.16b, v11.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t" "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_192_end_8_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -6094,7 +6105,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" @@ -6124,7 +6135,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v29.16b, 
v29.16b, v26.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v2.16b, v2.16b, #8\n\t" @@ -6152,7 +6163,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "ext v26.16b, v1.16b, v1.16b, #8\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v24.1d\n\t" @@ -6181,7 +6192,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v25.2d\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v26.16b, v26.16b, v31.16b\n\t" @@ -6209,7 +6220,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v26.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v30.16b, v30.16b, v26.16b\n\t" @@ -6237,7 +6248,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v30.16b, v30.16b, v26.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^7 */ @@ -6267,7 +6278,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v7.2d, v18.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -6295,7 +6306,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, 
v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v31.16b, v31.16b, v29.16b\n\t" @@ -6320,7 +6331,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "ld1 {v19.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ld1 {v20.16b}, [%x[in]], #16\n\t" @@ -6343,7 +6354,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -6360,7 +6371,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -6377,7 +6388,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -6394,7 +6405,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v18.16b, v18.16b, v14.16b\n\t" "eor v19.16b, v19.16b, v15.16b\n\t" "eor v20.16b, v20.16b, v16.16b\n\t" @@ -6506,10 +6517,10 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" 
"L_aes_gcm_encrypt_arm64_crypto_192_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t" - "ld1 {v12.2d}, [%x[key]]\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t" + "ld1 {v12.2d}, [x9]\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_192_done_%=\n\t" "b.eq L_aes_gcm_encrypt_arm64_crypto_192_start_1_%=\n\t" @@ -7020,37 +7031,37 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_encrypt_arm64_crypto_192_partial_done_%=\n\t" "eor v16.16b, v16.16b, v16.16b\n\t" "mov w19, w14\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" + "st1 {v16.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_192_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_192_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - "str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_192_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, [%x[tmp]], #2\n\t" + "strh w17, [x11], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_192_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_192_start_byte_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v16.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v16.2d}, 
[x11]\n\t" "add w15, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "rev w16, w15\n\t" @@ -7080,31 +7091,31 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v14.16b, v11.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "eor v16.16b, v16.16b, v14.16b\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" + "st1 {v16.2d}, [x11]\n\t" "mov w19, w14\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_192_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub x19, x19, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_out_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_192_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub x19, x19, #4\n\t" "str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_out_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_192_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub x19, x19, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_out_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_192_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs x19, x19, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_192_out_start_byte_%=\n\t" @@ -7115,10 +7126,10 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "\n" "L_aes_gcm_encrypt_arm64_crypto_192_start_zero_%=:\n\t" "subs x17, x17, #1\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_192_start_zero_%=\n\t" - "sub %x[tmp], %x[tmp], #16\n\t" - "ld1 {v14.2d}, [%x[tmp]]\n\t" + "sub x11, x11, #16\n\t" + "ld1 {v14.2d}, [x11]\n\t" "rbit v14.16b, v14.16b\n\t" "eor v15.16b, v26.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -7139,11 +7150,11 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" 
"L_aes_gcm_encrypt_arm64_crypto_192_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, [x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" "mov v28.d[1], %x[sz]\n\t" "eor v26.16b, v26.16b, v28.16b\n\t" @@ -7193,30 +7204,30 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "b L_aes_gcm_encrypt_arm64_crypto_done_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_tag_partial_%=:\n\t" - "st1 {v26.16b}, [%x[tmp]]\n\t" + "st1 {v26.16b}, [x11]\n\t" "cmp %w[tagSz], #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_192_tag_start_dw_%=\n\t" - "ldr x16, [%x[tmp]], #8\n\t" + "ldr x16, [x11], #8\n\t" "sub %w[tagSz], %w[tagSz], #8\n\t" "str x16, [%x[tag]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_tag_start_dw_%=:\n\t" "cmp %w[tagSz], #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_192_tag_start_sw_%=\n\t" - "ldr w16, [%x[tmp]], #4\n\t" + "ldr w16, [x11], #4\n\t" "sub %w[tagSz], %w[tagSz], #4\n\t" "str w16, [%x[tag]], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_tag_start_sw_%=:\n\t" "cmp %w[tagSz], #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_192_tag_start_byte_%=\n\t" - "ldrh w16, [%x[tmp]], #2\n\t" + "ldrh w16, [x11], #2\n\t" "sub %w[tagSz], %w[tagSz], #2\n\t" "strh w16, [%x[tag]], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_192_tag_start_byte_%=:\n\t" "cbz %w[tagSz], L_aes_gcm_encrypt_arm64_crypto_192_tag_end_bytes_%=\n\t" - "ldrb w16, [%x[tmp]], #1\n\t" + "ldrb w16, [x11], #1\n\t" "subs %w[tagSz], %w[tagSz], #1\n\t" "strb w16, [%x[tag]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_192_tag_start_byte_%=\n\t" @@ -7232,7 +7243,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.lt L_aes_gcm_encrypt_arm64_crypto_256_start_4_%=\n\t" "\n" 
"L_aes_gcm_encrypt_arm64_crypto_256_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -7265,7 +7276,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -7282,7 +7293,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -7299,7 +7310,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -7316,7 +7327,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -7333,7 +7344,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -7350,7 +7361,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese 
v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -7367,7 +7378,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -7385,7 +7396,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -7402,7 +7413,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -7423,7 +7434,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -7444,7 +7455,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "ld1 {v3.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -7461,7 +7472,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, 
v13.16b\n\t" @@ -7478,7 +7489,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #208]\n\t" + "ldr q13, [x9, #208]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -7495,7 +7506,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #224]\n\t" + "ldr q12, [x9, #224]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -7520,14 +7531,14 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v1.16b, v1.16b, v9.16b\n\t" "eor v2.16b, v2.16b, v10.16b\n\t" "eor v3.16b, v3.16b, v11.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t" "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_256_end_8_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_256_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -7568,7 +7579,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" @@ -7598,7 +7609,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v29.16b, v29.16b, v26.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v2.16b, v2.16b, #8\n\t" @@ -7626,7 +7637,7 @@ void AES_GCM_encrypt_AARCH64(const 
byte* in, byte* out, word32 sz, "ext v26.16b, v1.16b, v1.16b, #8\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v24.1d\n\t" @@ -7655,7 +7666,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v25.2d\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v26.16b, v26.16b, v31.16b\n\t" @@ -7683,7 +7694,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v26.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v30.16b, v30.16b, v26.16b\n\t" @@ -7711,7 +7722,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v30.16b, v30.16b, v26.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^7 */ @@ -7741,7 +7752,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v7.2d, v18.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -7769,7 +7780,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v31.16b, v31.16b, v29.16b\n\t" @@ -7794,7 +7805,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, 
word32 sz, "ld1 {v19.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ld1 {v20.16b}, [%x[in]], #16\n\t" @@ -7817,7 +7828,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -7834,7 +7845,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -7851,7 +7862,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -7868,7 +7879,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #208]\n\t" + "ldr q13, [x9, #208]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -7885,7 +7896,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #224]\n\t" + "ldr q12, [x9, #224]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -7902,7 +7913,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese 
v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v18.16b, v18.16b, v14.16b\n\t" "eor v19.16b, v19.16b, v15.16b\n\t" "eor v20.16b, v20.16b, v16.16b\n\t" @@ -8014,10 +8025,10 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_encrypt_arm64_crypto_256_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t" - "ld1 {v12.2d}, [%x[key]], #16\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t" + "ld1 {v12.2d}, [x9], #16\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_256_done_%=\n\t" "b.eq L_aes_gcm_encrypt_arm64_crypto_256_start_1_%=\n\t" @@ -8140,7 +8151,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v16.16b, v16.16b\n\t" "aese v17.16b, v11.16b\n\t" "aesmc v17.16b, v17.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -8331,7 +8342,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v16.16b, v16.16b\n\t" "aese v17.16b, v11.16b\n\t" "aesmc v17.16b, v17.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -8472,7 +8483,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v11.16b\n\t" "aesmc v15.16b, v15.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -8545,10 +8556,10 @@ void AES_GCM_encrypt_AARCH64(const 
byte* in, byte* out, word32 sz, "aesmc v14.16b, v14.16b\n\t" "aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "eor v18.16b, v18.16b, v14.16b\n\t" @@ -8577,37 +8588,37 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_encrypt_arm64_crypto_256_partial_done_%=\n\t" "eor v16.16b, v16.16b, v16.16b\n\t" "mov w19, w14\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" + "st1 {v16.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_256_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_256_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_256_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - "str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_256_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_256_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, [%x[tmp]], #2\n\t" + "strh w17, [x11], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_256_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_256_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_256_start_byte_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_256_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v16.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v16.2d}, [x11]\n\t" "add w15, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "rev w16, w15\n\t" @@ -8636,38 +8647,38 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v14.16b, v14.16b\n\t" 
"aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "eor v16.16b, v16.16b, v14.16b\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" + "st1 {v16.2d}, [x11]\n\t" "mov w19, w14\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_256_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub x19, x19, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_256_out_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_256_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub x19, x19, #4\n\t" "str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_256_out_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_256_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub x19, x19, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_256_out_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_256_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs x19, x19, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_256_out_start_byte_%=\n\t" @@ -8678,10 +8689,10 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "\n" "L_aes_gcm_encrypt_arm64_crypto_256_start_zero_%=:\n\t" "subs x17, x17, #1\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_256_start_zero_%=\n\t" - "sub %x[tmp], %x[tmp], #16\n\t" - "ld1 {v14.2d}, [%x[tmp]]\n\t" + "sub x11, x11, #16\n\t" + "ld1 {v14.2d}, [x11]\n\t" "rbit v14.16b, v14.16b\n\t" "eor v15.16b, v26.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -8702,11 +8713,11 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 
sz, /* Done GHASH */ "\n" "L_aes_gcm_encrypt_arm64_crypto_256_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, [x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" "mov v28.d[1], %x[sz]\n\t" "aese v14.16b, v0.16b\n\t" @@ -8742,19 +8753,19 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v14.16b, v9.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull2 v30.1q, v31.2d, v27.2d\n\t" - "ldr q11, [%x[key], #-32]\n\t" + "ldr q11, [x9, #-32]\n\t" "aese v14.16b, v10.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "mov v28.d[1], v31.d[0]\n\t" - "ldr q12, [%x[key], #-16]\n\t" + "ldr q12, [x9, #-16]\n\t" "aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v26.16b, v28.16b, v30.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "rbit v26.16b, v26.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "eor v26.16b, v26.16b, v14.16b\n\t" @@ -8764,30 +8775,30 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "b L_aes_gcm_encrypt_arm64_crypto_done_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_256_tag_partial_%=:\n\t" - "st1 {v26.16b}, [%x[tmp]]\n\t" + "st1 {v26.16b}, [x11]\n\t" "cmp %w[tagSz], #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_256_tag_start_dw_%=\n\t" - "ldr x16, [%x[tmp]], #8\n\t" + "ldr x16, [x11], #8\n\t" "sub %w[tagSz], %w[tagSz], #8\n\t" "str x16, [%x[tag]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_256_tag_start_dw_%=:\n\t" "cmp %w[tagSz], #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_256_tag_start_sw_%=\n\t" - "ldr w16, [%x[tmp]], #4\n\t" + "ldr w16, [x11], #4\n\t" "sub %w[tagSz], %w[tagSz], #4\n\t" "str w16, [%x[tag]], #4\n\t" "\n" 
"L_aes_gcm_encrypt_arm64_crypto_256_tag_start_sw_%=:\n\t" "cmp %w[tagSz], #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_256_tag_start_byte_%=\n\t" - "ldrh w16, [%x[tmp]], #2\n\t" + "ldrh w16, [x11], #2\n\t" "sub %w[tagSz], %w[tagSz], #2\n\t" "strh w16, [%x[tag]], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_256_tag_start_byte_%=:\n\t" "cbz %w[tagSz], L_aes_gcm_encrypt_arm64_crypto_256_tag_end_bytes_%=\n\t" - "ldrb w16, [%x[tmp]], #1\n\t" + "ldrb w16, [x11], #1\n\t" "subs %w[tagSz], %w[tagSz], #1\n\t" "strb w16, [%x[tag]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_256_tag_start_byte_%=\n\t" @@ -8803,7 +8814,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.lt L_aes_gcm_encrypt_arm64_crypto_128_start_4_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -8836,7 +8847,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -8853,7 +8864,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -8870,7 +8881,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -8887,7 +8898,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, 
v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -8904,7 +8915,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -8921,7 +8932,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -8938,7 +8949,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -8956,7 +8967,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -8973,7 +8984,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -8994,7 +9005,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr 
q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -9023,14 +9034,14 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v1.16b, v1.16b, v9.16b\n\t" "eor v2.16b, v2.16b, v10.16b\n\t" "eor v3.16b, v3.16b, v11.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t" "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_128_end_8_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -9071,7 +9082,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" @@ -9101,7 +9112,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v29.16b, v29.16b, v26.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v2.16b, v2.16b, #8\n\t" @@ -9129,7 +9140,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "ext v26.16b, v1.16b, v1.16b, #8\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v24.1d\n\t" @@ -9158,7 +9169,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v25.2d\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" 
"aesmc v14.16b, v14.16b\n\t" "eor v26.16b, v26.16b, v31.16b\n\t" @@ -9186,7 +9197,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v26.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v30.16b, v30.16b, v26.16b\n\t" @@ -9214,7 +9225,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v30.16b, v30.16b, v26.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^7 */ @@ -9244,7 +9255,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v7.2d, v18.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -9272,7 +9283,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v31.16b, v31.16b, v29.16b\n\t" @@ -9297,7 +9308,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "ld1 {v19.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ld1 {v20.16b}, [%x[in]], #16\n\t" @@ -9320,7 +9331,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, 
v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -9337,7 +9348,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v18.16b, v18.16b, v14.16b\n\t" "eor v19.16b, v19.16b, v15.16b\n\t" "eor v20.16b, v20.16b, v16.16b\n\t" @@ -9449,10 +9460,10 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_encrypt_arm64_crypto_128_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d}, [%x[key]], #32\n\t" - "ld1 {v10.2d}, [%x[key]]\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d}, [x9], #32\n\t" + "ld1 {v10.2d}, [x9]\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_128_done_%=\n\t" "b.eq L_aes_gcm_encrypt_arm64_crypto_128_start_1_%=\n\t" @@ -9919,37 +9930,37 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_encrypt_arm64_crypto_128_partial_done_%=\n\t" "eor v16.16b, v16.16b, v16.16b\n\t" "mov w19, w14\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" + "st1 {v16.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_128_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_128_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - "str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_128_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, [%x[tmp]], #2\n\t" + "strh w17, [x11], 
#2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_128_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_128_start_byte_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v16.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v16.2d}, [x11]\n\t" "add w15, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "rev w16, w15\n\t" @@ -9975,31 +9986,31 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v14.16b, v9.16b\n\t" "eor v14.16b, v14.16b, v10.16b\n\t" "eor v16.16b, v16.16b, v14.16b\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" + "st1 {v16.2d}, [x11]\n\t" "mov w19, w14\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_128_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub x19, x19, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_out_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_128_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub x19, x19, #4\n\t" "str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_out_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_128_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub x19, x19, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_out_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_128_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs x19, x19, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_128_out_start_byte_%=\n\t" @@ -10010,10 +10021,10 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "\n" "L_aes_gcm_encrypt_arm64_crypto_128_start_zero_%=:\n\t" "subs x17, 
x17, #1\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_128_start_zero_%=\n\t" - "sub %x[tmp], %x[tmp], #16\n\t" - "ld1 {v14.2d}, [%x[tmp]]\n\t" + "sub x11, x11, #16\n\t" + "ld1 {v14.2d}, [x11]\n\t" "rbit v14.16b, v14.16b\n\t" "eor v15.16b, v26.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -10034,11 +10045,11 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_encrypt_arm64_crypto_128_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, [x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" "mov v28.d[1], %x[sz]\n\t" "eor v26.16b, v26.16b, v28.16b\n\t" @@ -10084,30 +10095,30 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, "b L_aes_gcm_encrypt_arm64_crypto_done_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_tag_partial_%=:\n\t" - "st1 {v26.16b}, [%x[tmp]]\n\t" + "st1 {v26.16b}, [x11]\n\t" "cmp %w[tagSz], #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_128_tag_start_dw_%=\n\t" - "ldr x16, [%x[tmp]], #8\n\t" + "ldr x16, [x11], #8\n\t" "sub %w[tagSz], %w[tagSz], #8\n\t" "str x16, [%x[tag]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_tag_start_dw_%=:\n\t" "cmp %w[tagSz], #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_128_tag_start_sw_%=\n\t" - "ldr w16, [%x[tmp]], #4\n\t" + "ldr w16, [x11], #4\n\t" "sub %w[tagSz], %w[tagSz], #4\n\t" "str w16, [%x[tag]], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_tag_start_sw_%=:\n\t" "cmp %w[tagSz], #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_128_tag_start_byte_%=\n\t" - "ldrh w16, [%x[tmp]], #2\n\t" + "ldrh w16, [x11], #2\n\t" "sub %w[tagSz], %w[tagSz], #2\n\t" "strh w16, [%x[tag]], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_128_tag_start_byte_%=:\n\t" "cbz 
%w[tagSz], L_aes_gcm_encrypt_arm64_crypto_128_tag_end_bytes_%=\n\t" - "ldrb w16, [%x[tmp]], #1\n\t" + "ldrb w16, [x11], #1\n\t" "subs %w[tagSz], %w[tagSz], #1\n\t" "strb w16, [%x[tag]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_128_tag_start_byte_%=\n\t" @@ -10116,6 +10127,7 @@ void AES_GCM_encrypt_AARCH64(const byte* in, byte* out, word32 sz, #endif /* !NO_AES_128 */ "\n" "L_aes_gcm_encrypt_arm64_crypto_done_%=:\n\t" + "ldp x29, x30, [sp], #0x50\n\t" : [out] "+r" (out), [sz] "+r" (sz), [nonceSz] "+r" (nonceSz), [tag] "+r" (tag), [tagSz] "+r" (tagSz), [aadSz] "+r" (aadSz), [key] "+r" (key), [gcm_h] "+r" (gcm_h), [tmp] "+r" (tmp), @@ -10136,11 +10148,19 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, int nr) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %w[nr], [sp, #72]\n\t" + "str %x[reg], [sp, #64]\n\t" + "str %x[tmp], [sp, #56]\n\t" + "str %x[gcm_h], [sp, #48]\n\t" + "str %x[key], [sp, #40]\n\t" + "str %w[aadSz], [sp, #32]\n\t" "movi v27.16b, #0x87\n\t" "eor v26.16b, v26.16b, v26.16b\n\t" "ushr v27.2d, v27.2d, #56\n\t" - "ld1 {v22.2d}, [%x[gcm_h]]\n\t" - "cmp %w[aadSz], #0x40\n\t" + "ld1 {v22.2d}, [x10]\n\t" + "cmp w8, #0x40\n\t" "csetm x16, lt\n\t" "cmp %w[sz], #32\n\t" "csetm x17, lt\n\t" @@ -10155,7 +10175,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v31.1q, v29.2d, v27.2d\n\t" "mov v30.d[1], v29.d[0]\n\t" "eor v23.16b, v30.16b, v31.16b\n\t" - "cmp %w[aadSz], #0x100\n\t" + "cmp w8, #0x100\n\t" "csetm x16, lt\n\t" "cmp %w[sz], #0x40\n\t" "csetm x17, lt\n\t" @@ -10186,7 +10206,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v30.d[1], v29.d[0]\n\t" "eor v25.16b, v30.16b, v31.16b\n\t" /* Done */ - "cmp %w[aadSz], #0x400\n\t" + "cmp w8, #0x400\n\t" "csetm x16, lt\n\t" "cmp %w[sz], #0x200\n\t" "csetm x17, lt\n\t" @@ -10243,7 +10263,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done */ "\n" 
"L_aes_gcm_decrypt_arm64_crypto_h_done_%=:\n\t" - "lsr w14, %w[aadSz], #4\n\t" + "lsr w14, w8, #4\n\t" "cmp w14, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_aad_start_1_%=\n\t" "cmp w14, #16\n\t" @@ -10481,41 +10501,41 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.ne L_aes_gcm_decrypt_arm64_crypto_aad_both_1_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_aad_done_%=:\n\t" - "and w14, %w[aadSz], #15\n\t" + "and w14, w8, #15\n\t" "cbz w14, L_aes_gcm_decrypt_arm64_crypto_aad_partial_done_%=\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov w20, w14\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp w20, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_aad_start_dw_%=\n\t" "ldr x19, [%x[aad]], #8\n\t" "sub w20, w20, #8\n\t" - "str x19, [%x[tmp]], #8\n\t" + "str x19, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_aad_start_dw_%=:\n\t" "cmp w20, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_aad_start_sw_%=\n\t" "ldr w19, [%x[aad]], #4\n\t" "sub w20, w20, #4\n\t" - "str w19, [%x[tmp]], #4\n\t" + "str w19, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_aad_start_sw_%=:\n\t" "cmp w20, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_aad_start_byte_%=\n\t" "ldrh w19, [%x[aad]], #2\n\t" "sub w20, w20, #2\n\t" - "strh w19, [%x[tmp]], #2\n\t" + "strh w19, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_aad_start_byte_%=:\n\t" "cbz w20, L_aes_gcm_decrypt_arm64_crypto_aad_end_bytes_%=\n\t" "ldrb w19, [%x[aad]], #1\n\t" "subs w20, w20, #1\n\t" - "strb w19, [%x[tmp]], #1\n\t" + "strb w19, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_aad_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_aad_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v18.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v18.2d}, [x11]\n\t" "rbit v18.16b, v18.16b\n\t" "eor v21.16b, v26.16b, v18.16b\n\t" /* X = C * H^1 */ @@ -10580,37 +10600,37 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "cbz x24, 
L_aes_gcm_decrypt_arm64_crypto_nonce_partial_done_%=\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov w20, w24\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp w20, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_nonce_start_dw_%=\n\t" "ldr x19, [%x[nonce]], #8\n\t" "sub w20, w20, #8\n\t" - "str x19, [%x[tmp]], #8\n\t" + "str x19, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_nonce_start_dw_%=:\n\t" "cmp w20, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_nonce_start_sw_%=\n\t" "ldr w19, [%x[nonce]], #4\n\t" "sub w20, w20, #4\n\t" - "str w19, [%x[tmp]], #4\n\t" + "str w19, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_nonce_start_sw_%=:\n\t" "cmp w20, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_nonce_start_byte_%=\n\t" "ldrh w19, [%x[nonce]], #2\n\t" "sub w20, w20, #2\n\t" - "strh w19, [%x[tmp]], #2\n\t" + "strh w19, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_nonce_start_byte_%=:\n\t" "cbz w20, L_aes_gcm_decrypt_arm64_crypto_nonce_end_bytes_%=\n\t" "ldrb w19, [%x[nonce]], #1\n\t" "subs w20, w20, #1\n\t" - "strb w19, [%x[tmp]], #1\n\t" + "strb w19, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_nonce_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_nonce_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x24\n\t" - "ld1 {v18.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x24\n\t" + "ld1 {v18.2d}, [x11]\n\t" "rbit v18.16b, v18.16b\n\t" "eor v21.16b, v13.16b, v18.16b\n\t" /* X = C * H^1 */ @@ -10632,7 +10652,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "\n" "L_aes_gcm_decrypt_arm64_crypto_nonce_partial_done_%=:\n\t" "eor x14, x14, x14\n\t" - "ubfiz x24, %x[nonceSz], #3, #32\n\t" + "lsl x24, %x[nonceSz], #3\n\t" "mov v28.d[0], x14\n\t" "mov v28.d[1], x24\n\t" "rev64 v28.16b, v28.16b\n\t" @@ -10656,9 +10676,9 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "rev w15, w15\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_done_nonce_%=:\n\t" - "st1 {v13.2d}, [%x[reg]]\n\t" + "st1 {v13.2d}, [x12]\n\t" "lsr 
w14, %w[sz], #4\n\t" - "cmp %w[nr], #12\n\t" + "cmp w13, #12\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_start_128_%=\n\t" "b.gt L_aes_gcm_decrypt_arm64_crypto_start_256_%=\n\t" /* AES_GCM_192 */ @@ -10667,7 +10687,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.lt L_aes_gcm_decrypt_arm64_crypto_192_start_4_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -10700,7 +10720,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v10.s[3], w17\n\t" "rev w16, w15\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -10717,7 +10737,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -10734,7 +10754,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -10751,7 +10771,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -10768,7 +10788,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr 
q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -10785,7 +10805,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -10802,7 +10822,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -10820,7 +10840,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -10837,7 +10857,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -10858,7 +10878,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -10879,7 +10899,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, 
v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -10896,7 +10916,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -10921,14 +10941,14 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v9.16b, v9.16b, v1.16b\n\t" "eor v10.16b, v10.16b, v2.16b\n\t" "eor v11.16b, v11.16b, v3.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t" "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_192_end_8_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -10970,7 +10990,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "rev w16, w15\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -11000,7 +11020,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "ext v26.16b, v2.16b, v2.16b, #8\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v23.1d\n\t" @@ -11029,7 +11049,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v24.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v26.16b, v26.16b, v31.16b\n\t" @@ -11057,7 +11077,7 
@@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v26.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v30.16b, v30.16b, v26.16b\n\t" @@ -11085,7 +11105,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v30.16b, v30.16b, v26.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^6 */ @@ -11115,7 +11135,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v6.2d, v19.2d\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -11144,7 +11164,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v29.16b, v29.16b, v26.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v18.16b, v18.16b, #8\n\t" @@ -11171,7 +11191,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v31.16b, v31.16b, v30.16b\n\t" @@ -11196,7 +11216,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "ld1 {v20.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ld1 {v21.16b}, [%x[in]], #16\n\t" @@ -11218,7 +11238,7 @@ int 
AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -11235,7 +11255,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -11252,7 +11272,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -11269,7 +11289,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v14.16b, v14.16b, v18.16b\n\t" "eor v15.16b, v15.16b, v19.16b\n\t" "eor v16.16b, v16.16b, v20.16b\n\t" @@ -11381,10 +11401,10 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_decrypt_arm64_crypto_192_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t" - "ld1 {v12.2d}, [%x[key]]\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t" + "ld1 {v12.2d}, [x9]\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_192_done_%=\n\t" "b.eq 
L_aes_gcm_decrypt_arm64_crypto_192_start_1_%=\n\t" @@ -11896,37 +11916,37 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_decrypt_arm64_crypto_192_partial_done_%=\n\t" "eor v15.16b, v15.16b, v15.16b\n\t" "mov w19, w14\n\t" - "st1 {v15.2d}, [%x[tmp]]\n\t" + "st1 {v15.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_192_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_192_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - "str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_192_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, [%x[tmp]], #2\n\t" + "strh w17, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_decrypt_arm64_crypto_192_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_192_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v15.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v15.2d}, [x11]\n\t" "add w15, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "rbit v15.16b, v15.16b\n\t" @@ -11975,30 +11995,30 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done GHASH */ "rbit v15.16b, v15.16b\n\t" "eor v14.16b, v14.16b, v15.16b\n\t" - "st1 {v14.2d}, [%x[tmp]]\n\t" + "st1 {v14.2d}, [x11]\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_192_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub w14, w14, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" 
"L_aes_gcm_decrypt_arm64_crypto_192_out_start_dw_%=:\n\t" "cmp w14, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_192_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub w14, w14, #4\n\t" "str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_out_start_sw_%=:\n\t" "cmp w14, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_192_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub w14, w14, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_out_start_byte_%=:\n\t" "cbz w14, L_aes_gcm_decrypt_arm64_crypto_192_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs w14, w14, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_192_out_start_byte_%=\n\t" @@ -12006,11 +12026,11 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "L_aes_gcm_decrypt_arm64_crypto_192_out_end_bytes_%=:\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, [x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" "mov v28.d[1], %x[sz]\n\t" "eor v26.16b, v26.16b, v28.16b\n\t" @@ -12060,51 +12080,50 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "b L_aes_gcm_decrypt_arm64_crypto_192_tag_loaded_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_part_tag_%=:\n\t" - "ubfiz %x[tagSz], %x[tagSz], #0, #32\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov x17, %x[tagSz]\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp x17, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_192_tag_start_dw_%=\n\t" "ldr x16, [%x[tag]], #8\n\t" "sub x17, x17, #8\n\t" - "str x16, [%x[tmp]], #8\n\t" + "str x16, [x11], #8\n\t" "\n" 
"L_aes_gcm_decrypt_arm64_crypto_192_tag_start_dw_%=:\n\t" "cmp x17, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_192_tag_start_sw_%=\n\t" "ldr w16, [%x[tag]], #4\n\t" "sub x17, x17, #4\n\t" - "str w16, [%x[tmp]], #4\n\t" + "str w16, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_tag_start_sw_%=:\n\t" "cmp x17, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_192_tag_start_byte_%=\n\t" "ldrh w16, [%x[tag]], #2\n\t" "sub x17, x17, #2\n\t" - "strh w16, [%x[tmp]], #2\n\t" + "strh w16, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_tag_start_byte_%=:\n\t" "cbz x17, L_aes_gcm_decrypt_arm64_crypto_192_tag_end_bytes_%=\n\t" "ldrb w16, [%x[tag]], #1\n\t" "subs x17, x17, #1\n\t" - "strb w16, [%x[tmp]], #1\n\t" + "strb w16, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_192_tag_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_tag_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], %x[tagSz]\n\t" - "ld1 {v28.2d}, [%x[tmp]]\n\t" + "sub x11, x11, %x[tagSz]\n\t" + "ld1 {v28.2d}, [x11]\n\t" "mov x17, #16\n\t" - "st1 {v26.2d}, [%x[tmp]]\n\t" + "st1 {v26.2d}, [x11]\n\t" "sub x17, x17, %x[tagSz]\n\t" - "add %x[tmp], %x[tmp], %x[tagSz]\n\t" + "add x11, x11, %x[tagSz]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_calc_tag_byte_%=:\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, [x11], #1\n\t" "subs x17, x17, #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_192_calc_tag_byte_%=\n\t" - "subs %x[tmp], %x[tmp], #16\n\t" - "ld1 {v26.2d}, [%x[tmp]]\n\t" + "subs x11, x11, #16\n\t" + "ld1 {v26.2d}, [x11]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_192_tag_loaded_%=:\n\t" "eor v28.16b, v28.16b, v26.16b\n\t" @@ -12125,7 +12144,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.lt L_aes_gcm_decrypt_arm64_crypto_256_start_4_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -12158,7 +12177,7 @@ int 
AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v10.s[3], w17\n\t" "rev w16, w15\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -12175,7 +12194,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -12192,7 +12211,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -12209,7 +12228,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -12226,7 +12245,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -12243,7 +12262,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -12260,7 +12279,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese 
v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -12278,7 +12297,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -12295,7 +12314,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -12316,7 +12335,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -12337,7 +12356,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -12354,7 +12373,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -12371,7 +12390,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], 
#208]\n\t" + "ldr q13, [x9, #208]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -12388,7 +12407,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #224]\n\t" + "ldr q12, [x9, #224]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -12413,14 +12432,14 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v9.16b, v9.16b, v1.16b\n\t" "eor v10.16b, v10.16b, v2.16b\n\t" "eor v11.16b, v11.16b, v3.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t" "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_256_end_8_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -12462,7 +12481,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "rev w16, w15\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -12492,7 +12511,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "ext v26.16b, v2.16b, v2.16b, #8\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v23.1d\n\t" @@ -12521,7 +12540,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v24.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" 
"aesmc v14.16b, v14.16b\n\t" "eor v26.16b, v26.16b, v31.16b\n\t" @@ -12549,7 +12568,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v26.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v30.16b, v30.16b, v26.16b\n\t" @@ -12577,7 +12596,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v30.16b, v30.16b, v26.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^6 */ @@ -12607,7 +12626,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v6.2d, v19.2d\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -12636,7 +12655,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v29.16b, v29.16b, v26.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v18.16b, v18.16b, #8\n\t" @@ -12663,7 +12682,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v31.16b, v31.16b, v30.16b\n\t" @@ -12688,7 +12707,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "ld1 {v20.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc 
v14.16b, v14.16b\n\t" "ld1 {v21.16b}, [%x[in]], #16\n\t" @@ -12710,7 +12729,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -12727,7 +12746,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -12744,7 +12763,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -12761,7 +12780,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #208]\n\t" + "ldr q13, [x9, #208]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -12778,7 +12797,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #224]\n\t" + "ldr q12, [x9, #224]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -12795,7 +12814,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v14.16b, v14.16b, v18.16b\n\t" "eor v15.16b, v15.16b, v19.16b\n\t" 
"eor v16.16b, v16.16b, v20.16b\n\t" @@ -12907,10 +12926,10 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_decrypt_arm64_crypto_256_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t" - "ld1 {v12.2d}, [%x[key]], #16\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t" + "ld1 {v12.2d}, [x9], #16\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_256_done_%=\n\t" "b.eq L_aes_gcm_decrypt_arm64_crypto_256_start_1_%=\n\t" @@ -13033,7 +13052,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v16.16b, v16.16b\n\t" "aese v17.16b, v11.16b\n\t" "aesmc v17.16b, v17.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -13224,7 +13243,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v16.16b, v16.16b\n\t" "aese v17.16b, v11.16b\n\t" "aesmc v17.16b, v17.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -13365,7 +13384,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v11.16b\n\t" "aesmc v15.16b, v15.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -13438,10 +13457,10 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "ld1 {v18.16b}, [%x[in]], #16\n\t" "aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" "aese v14.16b, 
v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "eor v14.16b, v14.16b, v18.16b\n\t" @@ -13470,37 +13489,37 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_decrypt_arm64_crypto_256_partial_done_%=\n\t" "eor v15.16b, v15.16b, v15.16b\n\t" "mov w19, w14\n\t" - "st1 {v15.2d}, [%x[tmp]]\n\t" + "st1 {v15.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_256_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_256_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - "str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_256_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, [%x[tmp]], #2\n\t" + "strh w17, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_decrypt_arm64_crypto_256_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_256_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v15.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v15.2d}, [x11]\n\t" "add w15, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "rbit v15.16b, v15.16b\n\t" @@ -13545,40 +13564,40 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v28.d[1], v31.d[0]\n\t" "aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" "eor v26.16b, v28.16b, v30.16b\n\t" /* Done GHASH */ "aese v14.16b, 
v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "rbit v15.16b, v15.16b\n\t" "eor v14.16b, v14.16b, v15.16b\n\t" - "st1 {v14.2d}, [%x[tmp]]\n\t" + "st1 {v14.2d}, [x11]\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_256_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub w14, w14, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_out_start_dw_%=:\n\t" "cmp w14, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_256_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub w14, w14, #4\n\t" "str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_out_start_sw_%=:\n\t" "cmp w14, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_256_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub w14, w14, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_out_start_byte_%=:\n\t" "cbz w14, L_aes_gcm_decrypt_arm64_crypto_256_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs w14, w14, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_256_out_start_byte_%=\n\t" @@ -13586,11 +13605,11 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "L_aes_gcm_decrypt_arm64_crypto_256_out_end_bytes_%=:\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, [x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" "mov v28.d[1], %x[sz]\n\t" "aese v14.16b, v0.16b\n\t" @@ -13626,19 +13645,19 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese 
v14.16b, v9.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull2 v30.1q, v31.2d, v27.2d\n\t" - "ldr q11, [%x[key], #-32]\n\t" + "ldr q11, [x9, #-32]\n\t" "aese v14.16b, v10.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "mov v28.d[1], v31.d[0]\n\t" - "ldr q12, [%x[key], #-16]\n\t" + "ldr q12, [x9, #-16]\n\t" "aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v26.16b, v28.16b, v30.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "rbit v26.16b, v26.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "eor v26.16b, v26.16b, v14.16b\n\t" @@ -13648,51 +13667,50 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "b L_aes_gcm_decrypt_arm64_crypto_256_tag_loaded_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_part_tag_%=:\n\t" - "ubfiz %x[tagSz], %x[tagSz], #0, #32\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov x17, %x[tagSz]\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp x17, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_256_tag_start_dw_%=\n\t" "ldr x16, [%x[tag]], #8\n\t" "sub x17, x17, #8\n\t" - "str x16, [%x[tmp]], #8\n\t" + "str x16, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_tag_start_dw_%=:\n\t" "cmp x17, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_256_tag_start_sw_%=\n\t" "ldr w16, [%x[tag]], #4\n\t" "sub x17, x17, #4\n\t" - "str w16, [%x[tmp]], #4\n\t" + "str w16, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_tag_start_sw_%=:\n\t" "cmp x17, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_256_tag_start_byte_%=\n\t" "ldrh w16, [%x[tag]], #2\n\t" "sub x17, x17, #2\n\t" - "strh w16, [%x[tmp]], #2\n\t" + "strh w16, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_tag_start_byte_%=:\n\t" "cbz x17, L_aes_gcm_decrypt_arm64_crypto_256_tag_end_bytes_%=\n\t" "ldrb w16, [%x[tag]], #1\n\t" "subs x17, x17, #1\n\t" - "strb w16, [%x[tmp]], #1\n\t" + "strb w16, [x11], #1\n\t" 
"b.ne L_aes_gcm_decrypt_arm64_crypto_256_tag_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_tag_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], %x[tagSz]\n\t" - "ld1 {v28.2d}, [%x[tmp]]\n\t" + "sub x11, x11, %x[tagSz]\n\t" + "ld1 {v28.2d}, [x11]\n\t" "mov x17, #16\n\t" - "st1 {v26.2d}, [%x[tmp]]\n\t" + "st1 {v26.2d}, [x11]\n\t" "sub x17, x17, %x[tagSz]\n\t" - "add %x[tmp], %x[tmp], %x[tagSz]\n\t" + "add x11, x11, %x[tagSz]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_calc_tag_byte_%=:\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, [x11], #1\n\t" "subs x17, x17, #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_256_calc_tag_byte_%=\n\t" - "subs %x[tmp], %x[tmp], #16\n\t" - "ld1 {v26.2d}, [%x[tmp]]\n\t" + "subs x11, x11, #16\n\t" + "ld1 {v26.2d}, [x11]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_256_tag_loaded_%=:\n\t" "eor v28.16b, v28.16b, v26.16b\n\t" @@ -13713,7 +13731,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.lt L_aes_gcm_decrypt_arm64_crypto_128_start_4_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -13746,7 +13764,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "mov v10.s[3], w17\n\t" "rev w16, w15\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -13763,7 +13781,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -13780,7 +13798,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - 
"ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -13797,7 +13815,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -13814,7 +13832,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -13831,7 +13849,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -13848,7 +13866,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -13866,7 +13884,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -13883,7 +13901,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, 
v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -13904,7 +13922,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -13933,14 +13951,14 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v9.16b, v9.16b, v1.16b\n\t" "eor v10.16b, v10.16b, v2.16b\n\t" "eor v11.16b, v11.16b, v3.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t" "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_128_end_8_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -13982,7 +14000,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "rev w16, w15\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -14012,7 +14030,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "ext v26.16b, v2.16b, v2.16b, #8\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v23.1d\n\t" @@ -14041,7 +14059,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v24.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v26.16b, v26.16b, 
v31.16b\n\t" @@ -14069,7 +14087,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v26.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v30.16b, v30.16b, v26.16b\n\t" @@ -14097,7 +14115,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v30.16b, v30.16b, v26.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^6 */ @@ -14127,7 +14145,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v6.2d, v19.2d\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -14156,7 +14174,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v29.16b, v29.16b, v26.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v18.16b, v18.16b, #8\n\t" @@ -14183,7 +14201,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v31.16b, v31.16b, v30.16b\n\t" @@ -14208,7 +14226,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "ld1 {v20.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ld1 {v21.16b}, [%x[in]], 
#16\n\t" @@ -14230,7 +14248,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -14247,7 +14265,7 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v14.16b, v14.16b, v18.16b\n\t" "eor v15.16b, v15.16b, v19.16b\n\t" "eor v16.16b, v16.16b, v20.16b\n\t" @@ -14359,10 +14377,10 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_decrypt_arm64_crypto_128_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d}, [%x[key]], #32\n\t" - "ld1 {v10.2d}, [%x[key]]\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d}, [x9], #32\n\t" + "ld1 {v10.2d}, [x9]\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_128_done_%=\n\t" "b.eq L_aes_gcm_decrypt_arm64_crypto_128_start_1_%=\n\t" @@ -14829,37 +14847,37 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_decrypt_arm64_crypto_128_partial_done_%=\n\t" "eor v15.16b, v15.16b, v15.16b\n\t" "mov w19, w14\n\t" - "st1 {v15.2d}, [%x[tmp]]\n\t" + "st1 {v15.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_128_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_128_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - 
"str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_128_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, [%x[tmp]], #2\n\t" + "strh w17, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_decrypt_arm64_crypto_128_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_128_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v15.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v15.2d}, [x11]\n\t" "add w15, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "rbit v15.16b, v15.16b\n\t" @@ -14904,30 +14922,30 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, /* Done GHASH */ "rbit v15.16b, v15.16b\n\t" "eor v14.16b, v14.16b, v15.16b\n\t" - "st1 {v14.2d}, [%x[tmp]]\n\t" + "st1 {v14.2d}, [x11]\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_128_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub w14, w14, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_out_start_dw_%=:\n\t" "cmp w14, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_128_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub w14, w14, #4\n\t" "str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_out_start_sw_%=:\n\t" "cmp w14, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_128_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub w14, w14, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_out_start_byte_%=:\n\t" "cbz w14, L_aes_gcm_decrypt_arm64_crypto_128_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs w14, 
w14, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_128_out_start_byte_%=\n\t" @@ -14935,11 +14953,11 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "L_aes_gcm_decrypt_arm64_crypto_128_out_end_bytes_%=:\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, [x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" "mov v28.d[1], %x[sz]\n\t" "eor v26.16b, v26.16b, v28.16b\n\t" @@ -14985,51 +15003,50 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, "b L_aes_gcm_decrypt_arm64_crypto_128_tag_loaded_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_part_tag_%=:\n\t" - "ubfiz %x[tagSz], %x[tagSz], #0, #32\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov x17, %x[tagSz]\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp x17, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_128_tag_start_dw_%=\n\t" "ldr x16, [%x[tag]], #8\n\t" "sub x17, x17, #8\n\t" - "str x16, [%x[tmp]], #8\n\t" + "str x16, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_tag_start_dw_%=:\n\t" "cmp x17, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_128_tag_start_sw_%=\n\t" "ldr w16, [%x[tag]], #4\n\t" "sub x17, x17, #4\n\t" - "str w16, [%x[tmp]], #4\n\t" + "str w16, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_tag_start_sw_%=:\n\t" "cmp x17, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_128_tag_start_byte_%=\n\t" "ldrh w16, [%x[tag]], #2\n\t" "sub x17, x17, #2\n\t" - "strh w16, [%x[tmp]], #2\n\t" + "strh w16, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_tag_start_byte_%=:\n\t" "cbz x17, L_aes_gcm_decrypt_arm64_crypto_128_tag_end_bytes_%=\n\t" "ldrb w16, [%x[tag]], #1\n\t" "subs x17, x17, #1\n\t" - "strb w16, 
[%x[tmp]], #1\n\t" + "strb w16, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_128_tag_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_tag_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], %x[tagSz]\n\t" - "ld1 {v28.2d}, [%x[tmp]]\n\t" + "sub x11, x11, %x[tagSz]\n\t" + "ld1 {v28.2d}, [x11]\n\t" "mov x17, #16\n\t" - "st1 {v26.2d}, [%x[tmp]]\n\t" + "st1 {v26.2d}, [x11]\n\t" "sub x17, x17, %x[tagSz]\n\t" - "add %x[tmp], %x[tmp], %x[tagSz]\n\t" + "add x11, x11, %x[tagSz]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_calc_tag_byte_%=:\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, [x11], #1\n\t" "subs x17, x17, #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_128_calc_tag_byte_%=\n\t" - "subs %x[tmp], %x[tmp], #16\n\t" - "ld1 {v26.2d}, [%x[tmp]]\n\t" + "subs x11, x11, #16\n\t" + "ld1 {v26.2d}, [x11]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_128_tag_loaded_%=:\n\t" "eor v28.16b, v28.16b, v26.16b\n\t" @@ -15043,11 +15060,12 @@ int AES_GCM_decrypt_AARCH64(const byte* in, byte* out, word32 sz, #endif /* !NO_AES_128 */ "\n" "L_aes_gcm_decrypt_arm64_crypto_done_%=:\n\t" - : [in] "+r" (in), [out] "+r" (out), [sz] "+r" (sz), - [nonceSz] "+r" (nonceSz), [tagSz] "+r" (tagSz), [aadSz] "+r" (aadSz), - [key] "+r" (key), [gcm_h] "+r" (gcm_h), [tmp] "+r" (tmp), - [reg] "+r" (reg), [nr] "+r" (nr) - : [nonce] "r" (nonce), [tag] "r" (tag), [aad] "r" (aad) + "ldp x29, x30, [sp], #0x50\n\t" + : [out] "+r" (out), [sz] "+r" (sz), [nonceSz] "+r" (nonceSz), + [tagSz] "+r" (tagSz), [aadSz] "+r" (aadSz), [key] "+r" (key), + [gcm_h] "+r" (gcm_h), [tmp] "+r" (tmp), [reg] "+r" (reg), + [nr] "+r" (nr) + : [in] "r" (in), [nonce] "r" (nonce), [tag] "r" (tag), [aad] "r" (aad) : "memory", "cc", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", @@ -15064,11 +15082,19 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, word32 aadSz, byte* 
key, byte* gcm_h, byte* tmp, byte* reg, int nr) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %w[nr], [sp, #72]\n\t" + "str %x[reg], [sp, #64]\n\t" + "str %x[tmp], [sp, #56]\n\t" + "str %x[gcm_h], [sp, #48]\n\t" + "str %x[key], [sp, #40]\n\t" + "str %w[aadSz], [sp, #32]\n\t" "movi v27.16b, #0x87\n\t" "eor v26.16b, v26.16b, v26.16b\n\t" "ushr v27.2d, v27.2d, #56\n\t" - "ld1 {v22.2d}, [%x[gcm_h]]\n\t" - "cmp %w[aadSz], #0x40\n\t" + "ld1 {v22.2d}, [x10]\n\t" + "cmp w8, #0x40\n\t" "csetm x16, lt\n\t" "cmp %w[sz], #32\n\t" "csetm x17, lt\n\t" @@ -15083,7 +15109,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v31.1q, v29.2d, v27.2d\n\t" "mov v30.d[1], v29.d[0]\n\t" "eor v23.16b, v30.16b, v31.16b\n\t" - "cmp %w[aadSz], #0x100\n\t" + "cmp w8, #0x100\n\t" "csetm x16, lt\n\t" "cmp %w[sz], #0x40\n\t" "csetm x17, lt\n\t" @@ -15113,7 +15139,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "mov v30.d[1], v29.d[0]\n\t" "eor v25.16b, v30.16b, v31.16b\n\t" /* Done */ - "cmp %w[aadSz], #0x400\n\t" + "cmp w8, #0x400\n\t" "csetm x16, lt\n\t" "cmp %w[sz], #0x200\n\t" "csetm x17, lt\n\t" @@ -15168,7 +15194,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done */ "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_h_done_%=:\n\t" - "lsr w14, %w[aadSz], #4\n\t" + "lsr w14, w8, #4\n\t" "cmp w14, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_1_%=\n\t" "cmp w14, #16\n\t" @@ -15391,41 +15417,41 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_aad_both_1_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_done_%=:\n\t" - "and w14, %w[aadSz], #15\n\t" + "and w14, w8, #15\n\t" "cbz w14, L_aes_gcm_encrypt_arm64_crypto_eor3_aad_partial_done_%=\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov w20, w14\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp w20, #8\n\t" "b.lt 
L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_dw_%=\n\t" "ldr x19, [%x[aad]], #8\n\t" "sub w20, w20, #8\n\t" - "str x19, [%x[tmp]], #8\n\t" + "str x19, [x11], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_dw_%=:\n\t" "cmp w20, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_sw_%=\n\t" "ldr w19, [%x[aad]], #4\n\t" "sub w20, w20, #4\n\t" - "str w19, [%x[tmp]], #4\n\t" + "str w19, [x11], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_sw_%=:\n\t" "cmp w20, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_byte_%=\n\t" "ldrh w19, [%x[aad]], #2\n\t" "sub w20, w20, #2\n\t" - "strh w19, [%x[tmp]], #2\n\t" + "strh w19, [x11], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_byte_%=:\n\t" "cbz w20, L_aes_gcm_encrypt_arm64_crypto_eor3_aad_end_bytes_%=\n\t" "ldrb w19, [%x[aad]], #1\n\t" "subs w20, w20, #1\n\t" - "strb w19, [%x[tmp]], #1\n\t" + "strb w19, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_aad_start_byte_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_aad_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v18.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v18.2d}, [x11]\n\t" "rbit v18.16b, v18.16b\n\t" "eor v21.16b, v26.16b, v18.16b\n\t" /* X = C * H^1 */ @@ -15488,37 +15514,37 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "cbz x24, L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_partial_done_%=\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov w20, w24\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp w20, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_dw_%=\n\t" "ldr x19, [%x[nonce]], #8\n\t" "sub w20, w20, #8\n\t" - "str x19, [%x[tmp]], #8\n\t" + "str x19, [x11], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_dw_%=:\n\t" "cmp w20, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_sw_%=\n\t" "ldr w19, [%x[nonce]], #4\n\t" "sub w20, w20, #4\n\t" - "str w19, [%x[tmp]], #4\n\t" + "str w19, [x11], 
#4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_sw_%=:\n\t" "cmp w20, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_byte_%=\n\t" "ldrh w19, [%x[nonce]], #2\n\t" "sub w20, w20, #2\n\t" - "strh w19, [%x[tmp]], #2\n\t" + "strh w19, [x11], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_byte_%=:\n\t" "cbz w20, L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_end_bytes_%=\n\t" "ldrb w19, [%x[nonce]], #1\n\t" "subs w20, w20, #1\n\t" - "strb w19, [%x[tmp]], #1\n\t" + "strb w19, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_start_byte_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x24\n\t" - "ld1 {v18.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x24\n\t" + "ld1 {v18.2d}, [x11]\n\t" "rbit v18.16b, v18.16b\n\t" "eor v21.16b, v13.16b, v18.16b\n\t" /* X = C * H^1 */ @@ -15539,7 +15565,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_nonce_partial_done_%=:\n\t" "eor x14, x14, x14\n\t" - "ubfiz x24, %x[nonceSz], #3, #32\n\t" + "lsl x24, %x[nonceSz], #3\n\t" "mov v28.d[0], x14\n\t" "mov v28.d[1], x24\n\t" "rev64 v28.16b, v28.16b\n\t" @@ -15562,9 +15588,9 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "rev w15, w15\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_done_nonce_%=:\n\t" - "st1 {v13.2d}, [%x[reg]]\n\t" + "st1 {v13.2d}, [x12]\n\t" "lsr w14, %w[sz], #4\n\t" - "cmp %w[nr], #12\n\t" + "cmp w13, #12\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_start_128_%=\n\t" "b.gt L_aes_gcm_encrypt_arm64_crypto_eor3_start_256_%=\n\t" /* AES_GCM_192 */ @@ -15573,7 +15599,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_4_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ 
-15606,7 +15632,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -15623,7 +15649,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -15640,7 +15666,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -15657,7 +15683,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -15674,7 +15700,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -15691,7 +15717,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -15708,7 +15734,7 @@ void 
AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -15726,7 +15752,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -15743,7 +15769,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -15764,7 +15790,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -15785,7 +15811,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ld1 {v3.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -15802,7 +15828,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -15827,14 +15853,14 @@ void 
AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v1.16b, v1.16b, v9.16b\n\t" "eor v2.16b, v2.16b, v10.16b\n\t" "eor v3.16b, v3.16b, v11.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t" "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_8_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -15875,7 +15901,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" @@ -15905,7 +15931,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v29.16b, v29.16b, v26.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v2.16b, v2.16b, #8\n\t" @@ -15932,7 +15958,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ext v26.16b, v1.16b, v1.16b, #8\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v24.1d\n\t" @@ -15960,7 +15986,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v25.2d\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" @@ -15987,7 +16013,7 @@ void 
AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -16013,7 +16039,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^7 */ @@ -16042,7 +16068,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v7.2d, v18.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -16069,7 +16095,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor3 v31.16b, v31.16b, v29.16b, v30.16b\n\t" @@ -16093,7 +16119,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ld1 {v19.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ld1 {v20.16b}, [%x[in]], #16\n\t" @@ -16116,7 +16142,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -16133,7 
+16159,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -16150,7 +16176,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -16167,7 +16193,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v18.16b, v18.16b, v14.16b\n\t" "eor v19.16b, v19.16b, v15.16b\n\t" "eor v20.16b, v20.16b, v16.16b\n\t" @@ -16271,10 +16297,10 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t" - "ld1 {v12.2d}, [%x[key]]\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t" + "ld1 {v12.2d}, [x9]\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_done_%=\n\t" "b.eq L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_1_%=\n\t" @@ -16774,37 +16800,37 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_encrypt_arm64_crypto_eor3_192_partial_done_%=\n\t" "eor v16.16b, v16.16b, v16.16b\n\t" "mov w19, w14\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" + "st1 
{v16.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - "str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, [%x[tmp]], #2\n\t" + "strh w17, [x11], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_byte_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v16.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v16.2d}, [x11]\n\t" "add w15, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "rev w16, w15\n\t" @@ -16834,31 +16860,31 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v14.16b, v11.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "eor v16.16b, v16.16b, v14.16b\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" + "st1 {v16.2d}, [x11]\n\t" "mov w19, w14\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub x19, x19, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub x19, x19, #4\n\t" 
"str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub x19, x19, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs x19, x19, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_192_out_start_byte_%=\n\t" @@ -16869,10 +16895,10 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_zero_%=:\n\t" "subs x17, x17, #1\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_192_start_zero_%=\n\t" - "sub %x[tmp], %x[tmp], #16\n\t" - "ld1 {v14.2d}, [%x[tmp]]\n\t" + "sub x11, x11, #16\n\t" + "ld1 {v14.2d}, [x11]\n\t" "rbit v14.16b, v14.16b\n\t" "eor v15.16b, v26.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -16892,11 +16918,11 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, [x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" "mov v28.d[1], %x[sz]\n\t" "eor v26.16b, v26.16b, v28.16b\n\t" @@ -16945,30 +16971,30 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b L_aes_gcm_encrypt_arm64_crypto_eor3_done_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_partial_%=:\n\t" - "st1 {v26.16b}, [%x[tmp]]\n\t" + "st1 {v26.16b}, [x11]\n\t" "cmp 
%w[tagSz], #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_dw_%=\n\t" - "ldr x16, [%x[tmp]], #8\n\t" + "ldr x16, [x11], #8\n\t" "sub %w[tagSz], %w[tagSz], #8\n\t" "str x16, [%x[tag]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_dw_%=:\n\t" "cmp %w[tagSz], #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_sw_%=\n\t" - "ldr w16, [%x[tmp]], #4\n\t" + "ldr w16, [x11], #4\n\t" "sub %w[tagSz], %w[tagSz], #4\n\t" "str w16, [%x[tag]], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_sw_%=:\n\t" "cmp %w[tagSz], #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_byte_%=\n\t" - "ldrh w16, [%x[tmp]], #2\n\t" + "ldrh w16, [x11], #2\n\t" "sub %w[tagSz], %w[tagSz], #2\n\t" "strh w16, [%x[tag]], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_byte_%=:\n\t" "cbz %w[tagSz], L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_end_bytes_%=\n\t" - "ldrb w16, [%x[tmp]], #1\n\t" + "ldrb w16, [x11], #1\n\t" "subs %w[tagSz], %w[tagSz], #1\n\t" "strb w16, [%x[tag]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_192_tag_start_byte_%=\n\t" @@ -16984,7 +17010,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_4_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -17017,7 +17043,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -17034,7 +17060,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, 
[x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -17051,7 +17077,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -17068,7 +17094,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -17085,7 +17111,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -17102,7 +17128,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -17119,7 +17145,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -17137,7 +17163,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" 
"aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -17154,7 +17180,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -17175,7 +17201,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -17196,7 +17222,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ld1 {v3.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -17213,7 +17239,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -17230,7 +17256,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #208]\n\t" + "ldr q13, [x9, #208]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -17247,7 +17273,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #224]\n\t" + "ldr q12, [x9, #224]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, 
v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -17272,14 +17298,14 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v1.16b, v1.16b, v9.16b\n\t" "eor v2.16b, v2.16b, v10.16b\n\t" "eor v3.16b, v3.16b, v11.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t" "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_8_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -17320,7 +17346,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" @@ -17350,7 +17376,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v29.16b, v29.16b, v26.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v2.16b, v2.16b, #8\n\t" @@ -17377,7 +17403,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ext v26.16b, v1.16b, v1.16b, #8\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v24.1d\n\t" @@ -17405,7 +17431,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v25.2d\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" 
"eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" @@ -17432,7 +17458,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -17458,7 +17484,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^7 */ @@ -17487,7 +17513,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v7.2d, v18.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -17514,7 +17540,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor3 v31.16b, v31.16b, v29.16b, v30.16b\n\t" @@ -17538,7 +17564,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ld1 {v19.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ld1 {v20.16b}, [%x[in]], #16\n\t" @@ -17561,7 +17587,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, 
v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -17578,7 +17604,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -17595,7 +17621,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -17612,7 +17638,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #208]\n\t" + "ldr q13, [x9, #208]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -17629,7 +17655,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #224]\n\t" + "ldr q12, [x9, #224]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -17646,7 +17672,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v18.16b, v18.16b, v14.16b\n\t" "eor v19.16b, v19.16b, v15.16b\n\t" "eor v20.16b, v20.16b, v16.16b\n\t" @@ -17750,10 +17776,10 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" - "ld1 
{v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t" - "ld1 {v12.2d}, [%x[key]], #16\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t" + "ld1 {v12.2d}, [x9], #16\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_done_%=\n\t" "b.eq L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_1_%=\n\t" @@ -17876,7 +17902,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v16.16b, v16.16b\n\t" "aese v17.16b, v11.16b\n\t" "aesmc v17.16b, v17.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -18063,7 +18089,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v16.16b, v16.16b\n\t" "aese v17.16b, v11.16b\n\t" "aesmc v17.16b, v17.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -18200,7 +18226,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v11.16b\n\t" "aesmc v15.16b, v15.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -18271,10 +18297,10 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v14.16b, v14.16b\n\t" "aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "eor v18.16b, v18.16b, v14.16b\n\t" @@ -18302,37 +18328,37 @@ void 
AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_encrypt_arm64_crypto_eor3_256_partial_done_%=\n\t" "eor v16.16b, v16.16b, v16.16b\n\t" "mov w19, w14\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" + "st1 {v16.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - "str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, [%x[tmp]], #2\n\t" + "strh w17, [x11], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_byte_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v16.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v16.2d}, [x11]\n\t" "add w15, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "rev w16, w15\n\t" @@ -18361,38 +18387,38 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v14.16b, v14.16b\n\t" "aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "eor v16.16b, v16.16b, v14.16b\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" 
+ "st1 {v16.2d}, [x11]\n\t" "mov w19, w14\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub x19, x19, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub x19, x19, #4\n\t" "str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub x19, x19, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs x19, x19, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_256_out_start_byte_%=\n\t" @@ -18403,10 +18429,10 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_zero_%=:\n\t" "subs x17, x17, #1\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_256_start_zero_%=\n\t" - "sub %x[tmp], %x[tmp], #16\n\t" - "ld1 {v14.2d}, [%x[tmp]]\n\t" + "sub x11, x11, #16\n\t" + "ld1 {v14.2d}, [x11]\n\t" "rbit v14.16b, v14.16b\n\t" "eor v15.16b, v26.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -18426,11 +18452,11 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, 
[x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" "mov v28.d[1], %x[sz]\n\t" "aese v14.16b, v0.16b\n\t" @@ -18466,18 +18492,18 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v14.16b, v9.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "mov v28.d[1], v31.d[0]\n\t" - "ldr q11, [%x[key], #-32]\n\t" + "ldr q11, [x9, #-32]\n\t" "aese v14.16b, v10.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v26.16b, v28.16b, v30.16b\n\t" - "ldr q12, [%x[key], #-16]\n\t" + "ldr q12, [x9, #-16]\n\t" "aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "rbit v26.16b, v26.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "eor v26.16b, v26.16b, v14.16b\n\t" @@ -18487,30 +18513,30 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b L_aes_gcm_encrypt_arm64_crypto_eor3_done_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_partial_%=:\n\t" - "st1 {v26.16b}, [%x[tmp]]\n\t" + "st1 {v26.16b}, [x11]\n\t" "cmp %w[tagSz], #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_dw_%=\n\t" - "ldr x16, [%x[tmp]], #8\n\t" + "ldr x16, [x11], #8\n\t" "sub %w[tagSz], %w[tagSz], #8\n\t" "str x16, [%x[tag]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_dw_%=:\n\t" "cmp %w[tagSz], #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_sw_%=\n\t" - "ldr w16, [%x[tmp]], #4\n\t" + "ldr w16, [x11], #4\n\t" "sub %w[tagSz], %w[tagSz], #4\n\t" "str w16, [%x[tag]], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_sw_%=:\n\t" "cmp %w[tagSz], #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_byte_%=\n\t" - "ldrh w16, [%x[tmp]], #2\n\t" + "ldrh w16, [x11], #2\n\t" "sub %w[tagSz], %w[tagSz], #2\n\t" "strh w16, [%x[tag]], #2\n\t" 
"\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_byte_%=:\n\t" "cbz %w[tagSz], L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_end_bytes_%=\n\t" - "ldrb w16, [%x[tmp]], #1\n\t" + "ldrb w16, [x11], #1\n\t" "subs %w[tagSz], %w[tagSz], #1\n\t" "strb w16, [%x[tag]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_256_tag_start_byte_%=\n\t" @@ -18526,7 +18552,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_4_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -18559,7 +18585,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -18576,7 +18602,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -18593,7 +18619,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -18610,7 +18636,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -18627,7 +18653,7 @@ 
void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -18644,7 +18670,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -18661,7 +18687,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -18679,7 +18705,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -18696,7 +18722,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -18717,7 +18743,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -18746,14 +18772,14 @@ void 
AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v1.16b, v1.16b, v9.16b\n\t" "eor v2.16b, v2.16b, v10.16b\n\t" "eor v3.16b, v3.16b, v11.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v18.16b, v19.16b, v20.16b, v21.16b}, [%x[out]], #0x40\n\t" "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_8_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -18794,7 +18820,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "mov v9.s[3], w19\n\t" "mov v10.s[3], w17\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" @@ -18824,7 +18850,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v29.16b, v29.16b, v26.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v2.16b, v2.16b, #8\n\t" @@ -18851,7 +18877,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ext v26.16b, v1.16b, v1.16b, #8\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v24.1d\n\t" @@ -18879,7 +18905,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v25.2d\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" @@ -18906,7 +18932,7 @@ void 
AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -18932,7 +18958,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^7 */ @@ -18961,7 +18987,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v7.2d, v18.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -18988,7 +19014,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor3 v31.16b, v31.16b, v29.16b, v30.16b\n\t" @@ -19012,7 +19038,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ld1 {v19.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ld1 {v20.16b}, [%x[in]], #16\n\t" @@ -19035,7 +19061,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ 
-19052,7 +19078,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v18.16b, v18.16b, v14.16b\n\t" "eor v19.16b, v19.16b, v15.16b\n\t" "eor v20.16b, v20.16b, v16.16b\n\t" @@ -19156,10 +19182,10 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d}, [%x[key]], #32\n\t" - "ld1 {v10.2d}, [%x[key]]\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d}, [x9], #32\n\t" + "ld1 {v10.2d}, [x9]\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_done_%=\n\t" "b.eq L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_1_%=\n\t" @@ -19615,37 +19641,37 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_encrypt_arm64_crypto_eor3_128_partial_done_%=\n\t" "eor v16.16b, v16.16b, v16.16b\n\t" "mov w19, w14\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" + "st1 {v16.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - "str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, [%x[tmp]], #2\n\t" 
+ "strh w17, [x11], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_byte_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v16.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v16.2d}, [x11]\n\t" "add w15, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "rev w16, w15\n\t" @@ -19671,31 +19697,31 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v14.16b, v9.16b\n\t" "eor v14.16b, v14.16b, v10.16b\n\t" "eor v16.16b, v16.16b, v14.16b\n\t" - "st1 {v16.2d}, [%x[tmp]]\n\t" + "st1 {v16.2d}, [x11]\n\t" "mov w19, w14\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub x19, x19, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub x19, x19, #4\n\t" "str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub x19, x19, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs x19, x19, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_128_out_start_byte_%=\n\t" @@ -19706,10 +19732,10 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, 
byte* out, word32 sz, "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_zero_%=:\n\t" "subs x17, x17, #1\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, [x11], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_128_start_zero_%=\n\t" - "sub %x[tmp], %x[tmp], #16\n\t" - "ld1 {v14.2d}, [%x[tmp]]\n\t" + "sub x11, x11, #16\n\t" + "ld1 {v14.2d}, [x11]\n\t" "rbit v14.16b, v14.16b\n\t" "eor v15.16b, v26.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -19729,11 +19755,11 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, [x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" "mov v28.d[1], %x[sz]\n\t" "eor v26.16b, v26.16b, v28.16b\n\t" @@ -19778,30 +19804,30 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b L_aes_gcm_encrypt_arm64_crypto_eor3_done_%=\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_partial_%=:\n\t" - "st1 {v26.16b}, [%x[tmp]]\n\t" + "st1 {v26.16b}, [x11]\n\t" "cmp %w[tagSz], #8\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_dw_%=\n\t" - "ldr x16, [%x[tmp]], #8\n\t" + "ldr x16, [x11], #8\n\t" "sub %w[tagSz], %w[tagSz], #8\n\t" "str x16, [%x[tag]], #8\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_dw_%=:\n\t" "cmp %w[tagSz], #4\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_sw_%=\n\t" - "ldr w16, [%x[tmp]], #4\n\t" + "ldr w16, [x11], #4\n\t" "sub %w[tagSz], %w[tagSz], #4\n\t" "str w16, [%x[tag]], #4\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_sw_%=:\n\t" "cmp %w[tagSz], #2\n\t" "b.lt L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_byte_%=\n\t" - "ldrh w16, [%x[tmp]], #2\n\t" + "ldrh w16, 
[x11], #2\n\t" "sub %w[tagSz], %w[tagSz], #2\n\t" "strh w16, [%x[tag]], #2\n\t" "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_byte_%=:\n\t" "cbz %w[tagSz], L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_end_bytes_%=\n\t" - "ldrb w16, [%x[tmp]], #1\n\t" + "ldrb w16, [x11], #1\n\t" "subs %w[tagSz], %w[tagSz], #1\n\t" "strb w16, [%x[tag]], #1\n\t" "b.ne L_aes_gcm_encrypt_arm64_crypto_eor3_128_tag_start_byte_%=\n\t" @@ -19810,6 +19836,7 @@ void AES_GCM_encrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, #endif /* !NO_AES_128 */ "\n" "L_aes_gcm_encrypt_arm64_crypto_eor3_done_%=:\n\t" + "ldp x29, x30, [sp], #0x50\n\t" : [out] "+r" (out), [sz] "+r" (sz), [nonceSz] "+r" (nonceSz), [tag] "+r" (tag), [tagSz] "+r" (tagSz), [aadSz] "+r" (aadSz), [key] "+r" (key), [gcm_h] "+r" (gcm_h), [tmp] "+r" (tmp), @@ -19830,11 +19857,19 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, int nr) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-80]!\n\t" + "add x29, sp, #0\n\t" + "str %w[nr], [sp, #72]\n\t" + "str %x[reg], [sp, #64]\n\t" + "str %x[tmp], [sp, #56]\n\t" + "str %x[gcm_h], [sp, #48]\n\t" + "str %x[key], [sp, #40]\n\t" + "str %w[aadSz], [sp, #32]\n\t" "movi v27.16b, #0x87\n\t" "eor v26.16b, v26.16b, v26.16b\n\t" "ushr v27.2d, v27.2d, #56\n\t" - "ld1 {v22.2d}, [%x[gcm_h]]\n\t" - "cmp %w[aadSz], #0x40\n\t" + "ld1 {v22.2d}, [x10]\n\t" + "cmp w8, #0x40\n\t" "csetm x16, lt\n\t" "cmp %w[sz], #32\n\t" "csetm x17, lt\n\t" @@ -19849,7 +19884,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v31.1q, v29.2d, v27.2d\n\t" "mov v30.d[1], v29.d[0]\n\t" "eor v23.16b, v30.16b, v31.16b\n\t" - "cmp %w[aadSz], #0x100\n\t" + "cmp w8, #0x100\n\t" "csetm x16, lt\n\t" "cmp %w[sz], #0x40\n\t" "csetm x17, lt\n\t" @@ -19879,7 +19914,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "mov v30.d[1], v29.d[0]\n\t" "eor v25.16b, v30.16b, v31.16b\n\t" /* Done */ - "cmp %w[aadSz], #0x400\n\t" + "cmp w8, #0x400\n\t" 
"csetm x16, lt\n\t" "cmp %w[sz], #0x200\n\t" "csetm x17, lt\n\t" @@ -19934,7 +19969,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done */ "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_h_done_%=:\n\t" - "lsr w14, %w[aadSz], #4\n\t" + "lsr w14, w8, #4\n\t" "cmp w14, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_1_%=\n\t" "cmp w14, #16\n\t" @@ -20157,41 +20192,41 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_aad_both_1_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_done_%=:\n\t" - "and w14, %w[aadSz], #15\n\t" + "and w14, w8, #15\n\t" "cbz w14, L_aes_gcm_decrypt_arm64_crypto_eor3_aad_partial_done_%=\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov w20, w14\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp w20, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_dw_%=\n\t" "ldr x19, [%x[aad]], #8\n\t" "sub w20, w20, #8\n\t" - "str x19, [%x[tmp]], #8\n\t" + "str x19, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_dw_%=:\n\t" "cmp w20, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_sw_%=\n\t" "ldr w19, [%x[aad]], #4\n\t" "sub w20, w20, #4\n\t" - "str w19, [%x[tmp]], #4\n\t" + "str w19, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_sw_%=:\n\t" "cmp w20, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_byte_%=\n\t" "ldrh w19, [%x[aad]], #2\n\t" "sub w20, w20, #2\n\t" - "strh w19, [%x[tmp]], #2\n\t" + "strh w19, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_byte_%=:\n\t" "cbz w20, L_aes_gcm_decrypt_arm64_crypto_eor3_aad_end_bytes_%=\n\t" "ldrb w19, [%x[aad]], #1\n\t" "subs w20, w20, #1\n\t" - "strb w19, [%x[tmp]], #1\n\t" + "strb w19, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_aad_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_aad_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v18.2d}, [%x[tmp]]\n\t" + 
"sub x11, x11, x14\n\t" + "ld1 {v18.2d}, [x11]\n\t" "rbit v18.16b, v18.16b\n\t" "eor v21.16b, v26.16b, v18.16b\n\t" /* X = C * H^1 */ @@ -20254,37 +20289,37 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "cbz x24, L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_partial_done_%=\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov w20, w24\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp w20, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_dw_%=\n\t" "ldr x19, [%x[nonce]], #8\n\t" "sub w20, w20, #8\n\t" - "str x19, [%x[tmp]], #8\n\t" + "str x19, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_dw_%=:\n\t" "cmp w20, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_sw_%=\n\t" "ldr w19, [%x[nonce]], #4\n\t" "sub w20, w20, #4\n\t" - "str w19, [%x[tmp]], #4\n\t" + "str w19, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_sw_%=:\n\t" "cmp w20, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_byte_%=\n\t" "ldrh w19, [%x[nonce]], #2\n\t" "sub w20, w20, #2\n\t" - "strh w19, [%x[tmp]], #2\n\t" + "strh w19, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_byte_%=:\n\t" "cbz w20, L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_end_bytes_%=\n\t" "ldrb w19, [%x[nonce]], #1\n\t" "subs w20, w20, #1\n\t" - "strb w19, [%x[tmp]], #1\n\t" + "strb w19, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x24\n\t" - "ld1 {v18.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x24\n\t" + "ld1 {v18.2d}, [x11]\n\t" "rbit v18.16b, v18.16b\n\t" "eor v21.16b, v13.16b, v18.16b\n\t" /* X = C * H^1 */ @@ -20305,7 +20340,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_nonce_partial_done_%=:\n\t" "eor x14, x14, x14\n\t" - "ubfiz x24, %x[nonceSz], #3, #32\n\t" + "lsl x24, %x[nonceSz], #3\n\t" "mov 
v28.d[0], x14\n\t" "mov v28.d[1], x24\n\t" "rev64 v28.16b, v28.16b\n\t" @@ -20328,9 +20363,9 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "rev w15, w15\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_done_nonce_%=:\n\t" - "st1 {v13.2d}, [%x[reg]]\n\t" + "st1 {v13.2d}, [x12]\n\t" "lsr w14, %w[sz], #4\n\t" - "cmp %w[nr], #12\n\t" + "cmp w13, #12\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_start_128_%=\n\t" "b.gt L_aes_gcm_decrypt_arm64_crypto_eor3_start_256_%=\n\t" /* AES_GCM_192 */ @@ -20339,7 +20374,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_4_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -20372,7 +20407,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "mov v10.s[3], w17\n\t" "rev w16, w15\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -20389,7 +20424,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -20406,7 +20441,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -20423,7 +20458,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - 
"ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -20440,7 +20475,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -20457,7 +20492,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -20474,7 +20509,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -20492,7 +20527,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -20509,7 +20544,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -20530,7 +20565,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, 
#160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -20551,7 +20586,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -20568,7 +20603,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -20593,14 +20628,14 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v9.16b, v9.16b, v1.16b\n\t" "eor v10.16b, v10.16b, v2.16b\n\t" "eor v11.16b, v11.16b, v3.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t" "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_8_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -20642,7 +20677,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "rev w16, w15\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -20672,7 +20707,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ext v26.16b, v2.16b, v2.16b, #8\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc 
v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v23.1d\n\t" @@ -20700,7 +20735,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v24.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" @@ -20727,7 +20762,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -20753,7 +20788,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^6 */ @@ -20782,7 +20817,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v6.2d, v19.2d\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -20810,7 +20845,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v29.16b, v29.16b, v26.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v18.16b, v18.16b, #8\n\t" @@ -20836,7 +20871,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese 
v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull2 v30.1q, v31.2d, v27.2d\n\t" @@ -20860,7 +20895,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ld1 {v20.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ld1 {v21.16b}, [%x[in]], #16\n\t" @@ -20882,7 +20917,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -20899,7 +20934,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -20916,7 +20951,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -20933,7 +20968,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v14.16b, v14.16b, v18.16b\n\t" "eor v15.16b, v15.16b, v19.16b\n\t" "eor v16.16b, v16.16b, v20.16b\n\t" @@ -21037,10 +21072,10 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, 
[%x[key]], #0x40\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t" - "ld1 {v12.2d}, [%x[key]]\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t" + "ld1 {v12.2d}, [x9]\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_done_%=\n\t" "b.eq L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_1_%=\n\t" @@ -21541,37 +21576,37 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_decrypt_arm64_crypto_eor3_192_partial_done_%=\n\t" "eor v15.16b, v15.16b, v15.16b\n\t" "mov w19, w14\n\t" - "st1 {v15.2d}, [%x[tmp]]\n\t" + "st1 {v15.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - "str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, [%x[tmp]], #2\n\t" + "strh w17, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_192_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v15.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v15.2d}, [x11]\n\t" "add w15, w15, 
#1\n\t" "mov v14.16b, v13.16b\n\t" "rbit v15.16b, v15.16b\n\t" @@ -21619,30 +21654,30 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done GHASH */ "rbit v15.16b, v15.16b\n\t" "eor v14.16b, v14.16b, v15.16b\n\t" - "st1 {v14.2d}, [%x[tmp]]\n\t" + "st1 {v14.2d}, [x11]\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub w14, w14, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_dw_%=:\n\t" "cmp w14, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub w14, w14, #4\n\t" "str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_sw_%=:\n\t" "cmp w14, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub w14, w14, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_byte_%=:\n\t" "cbz w14, L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs w14, w14, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_start_byte_%=\n\t" @@ -21650,11 +21685,11 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "L_aes_gcm_decrypt_arm64_crypto_eor3_192_out_end_bytes_%=:\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, [x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" "mov v28.d[1], %x[sz]\n\t" "eor v26.16b, v26.16b, v28.16b\n\t" @@ -21703,51 +21738,50 
@@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_loaded_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_part_tag_%=:\n\t" - "ubfiz %x[tagSz], %x[tagSz], #0, #32\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov x17, %x[tagSz]\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp x17, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_dw_%=\n\t" "ldr x16, [%x[tag]], #8\n\t" "sub x17, x17, #8\n\t" - "str x16, [%x[tmp]], #8\n\t" + "str x16, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_dw_%=:\n\t" "cmp x17, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_sw_%=\n\t" "ldr w16, [%x[tag]], #4\n\t" "sub x17, x17, #4\n\t" - "str w16, [%x[tmp]], #4\n\t" + "str w16, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_sw_%=:\n\t" "cmp x17, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_byte_%=\n\t" "ldrh w16, [%x[tag]], #2\n\t" "sub x17, x17, #2\n\t" - "strh w16, [%x[tmp]], #2\n\t" + "strh w16, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_byte_%=:\n\t" "cbz x17, L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_end_bytes_%=\n\t" "ldrb w16, [%x[tag]], #1\n\t" "subs x17, x17, #1\n\t" - "strb w16, [%x[tmp]], #1\n\t" + "strb w16, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], %x[tagSz]\n\t" - "ld1 {v28.2d}, [%x[tmp]]\n\t" + "sub x11, x11, %x[tagSz]\n\t" + "ld1 {v28.2d}, [x11]\n\t" "mov x17, #16\n\t" - "st1 {v26.2d}, [%x[tmp]]\n\t" + "st1 {v26.2d}, [x11]\n\t" "sub x17, x17, %x[tagSz]\n\t" - "add %x[tmp], %x[tmp], %x[tagSz]\n\t" + "add x11, x11, %x[tagSz]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_calc_tag_byte_%=:\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, [x11], #1\n\t" "subs x17, x17, #1\n\t" "b.ne 
L_aes_gcm_decrypt_arm64_crypto_eor3_192_calc_tag_byte_%=\n\t" - "subs %x[tmp], %x[tmp], #16\n\t" - "ld1 {v26.2d}, [%x[tmp]]\n\t" + "subs x11, x11, #16\n\t" + "ld1 {v26.2d}, [x11]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_192_tag_loaded_%=:\n\t" "eor v28.16b, v28.16b, v26.16b\n\t" @@ -21768,7 +21802,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_4_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -21801,7 +21835,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "mov v10.s[3], w17\n\t" "rev w16, w15\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -21818,7 +21852,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -21835,7 +21869,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -21852,7 +21886,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -21869,7 +21903,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 
sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -21886,7 +21920,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -21903,7 +21937,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -21921,7 +21955,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -21938,7 +21972,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -21959,7 +21993,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -21980,7 +22014,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese 
v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -21997,7 +22031,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -22014,7 +22048,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #208]\n\t" + "ldr q13, [x9, #208]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -22031,7 +22065,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #224]\n\t" + "ldr q12, [x9, #224]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -22056,14 +22090,14 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v9.16b, v9.16b, v1.16b\n\t" "eor v10.16b, v10.16b, v2.16b\n\t" "eor v11.16b, v11.16b, v3.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t" "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_8_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -22105,7 +22139,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "rev w16, w15\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" "mov v11.s[3], 
w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -22135,7 +22169,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ext v26.16b, v2.16b, v2.16b, #8\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v23.1d\n\t" @@ -22163,7 +22197,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v24.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" @@ -22190,7 +22224,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -22216,7 +22250,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^6 */ @@ -22245,7 +22279,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v6.2d, v19.2d\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -22273,7 +22307,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v29.16b, v29.16b, v26.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc 
v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v18.16b, v18.16b, #8\n\t" @@ -22299,7 +22333,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull2 v30.1q, v31.2d, v27.2d\n\t" @@ -22323,7 +22357,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ld1 {v20.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ld1 {v21.16b}, [%x[in]], #16\n\t" @@ -22345,7 +22379,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -22362,7 +22396,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #176]\n\t" + "ldr q13, [x9, #176]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -22379,7 +22413,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #192]\n\t" + "ldr q12, [x9, #192]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -22396,7 +22430,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, 
v11.16b\n\t" - "ldr q13, [%x[key], #208]\n\t" + "ldr q13, [x9, #208]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -22413,7 +22447,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #224]\n\t" + "ldr q12, [x9, #224]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -22430,7 +22464,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v14.16b, v14.16b, v18.16b\n\t" "eor v15.16b, v15.16b, v19.16b\n\t" "eor v16.16b, v16.16b, v20.16b\n\t" @@ -22534,10 +22568,10 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done GHASH */ "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [%x[key]], #0x40\n\t" - "ld1 {v12.2d}, [%x[key]], #16\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x9], #0x40\n\t" + "ld1 {v12.2d}, [x9], #16\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_done_%=\n\t" "b.eq L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_1_%=\n\t" @@ -22660,7 +22694,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v16.16b, v16.16b\n\t" "aese v17.16b, v11.16b\n\t" "aesmc v17.16b, v17.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -22847,7 +22881,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, 
byte* out, word32 sz, "aesmc v16.16b, v16.16b\n\t" "aese v17.16b, v11.16b\n\t" "aesmc v17.16b, v17.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -22984,7 +23018,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v11.16b\n\t" "aesmc v15.16b, v15.16b\n\t" - "ld1 {v29.2d, v30.2d}, [%x[key]]\n\t" + "ld1 {v29.2d, v30.2d}, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -23055,10 +23089,10 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ld1 {v18.16b}, [%x[in]], #16\n\t" "aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "eor v14.16b, v14.16b, v18.16b\n\t" @@ -23086,37 +23120,37 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_decrypt_arm64_crypto_eor3_256_partial_done_%=\n\t" "eor v15.16b, v15.16b, v15.16b\n\t" "mov w19, w14\n\t" - "st1 {v15.2d}, [%x[tmp]]\n\t" + "st1 {v15.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - "str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, 
[%x[tmp]], #2\n\t" + "strh w17, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_256_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v15.2d}, [%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v15.2d}, [x11]\n\t" "add w15, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "rbit v15.16b, v15.16b\n\t" @@ -23161,39 +23195,39 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v26.16b, v28.16b, v30.16b\n\t" "aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" /* Done GHASH */ "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "rbit v15.16b, v15.16b\n\t" "eor v14.16b, v14.16b, v15.16b\n\t" - "st1 {v14.2d}, [%x[tmp]]\n\t" + "st1 {v14.2d}, [x11]\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub w14, w14, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_dw_%=:\n\t" "cmp w14, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub w14, w14, #4\n\t" "str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_sw_%=:\n\t" "cmp w14, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub w14, w14, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_byte_%=:\n\t" "cbz w14, 
L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs w14, w14, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_start_byte_%=\n\t" @@ -23201,11 +23235,11 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "L_aes_gcm_decrypt_arm64_crypto_eor3_256_out_end_bytes_%=:\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, [x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" "mov v28.d[1], %x[sz]\n\t" "aese v14.16b, v0.16b\n\t" @@ -23241,18 +23275,18 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v14.16b, v9.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "mov v28.d[1], v31.d[0]\n\t" - "ldr q11, [%x[key], #-32]\n\t" + "ldr q11, [x9, #-32]\n\t" "aese v14.16b, v10.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v26.16b, v28.16b, v30.16b\n\t" - "ldr q12, [%x[key], #-16]\n\t" + "ldr q12, [x9, #-16]\n\t" "aese v14.16b, v11.16b\n\t" "aesmc v14.16b, v14.16b\n\t" - "ldr q29, [%x[key]]\n\t" + "ldr q29, [x9]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "rbit v26.16b, v26.16b\n\t" - "ldr q30, [%x[key], #16]\n\t" + "ldr q30, [x9, #16]\n\t" "aese v14.16b, v29.16b\n\t" "eor v14.16b, v14.16b, v30.16b\n\t" "eor v26.16b, v26.16b, v14.16b\n\t" @@ -23262,51 +23296,50 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_loaded_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_part_tag_%=:\n\t" - "ubfiz %x[tagSz], %x[tagSz], #0, #32\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov x17, %x[tagSz]\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" 
"cmp x17, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_dw_%=\n\t" "ldr x16, [%x[tag]], #8\n\t" "sub x17, x17, #8\n\t" - "str x16, [%x[tmp]], #8\n\t" + "str x16, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_dw_%=:\n\t" "cmp x17, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_sw_%=\n\t" "ldr w16, [%x[tag]], #4\n\t" "sub x17, x17, #4\n\t" - "str w16, [%x[tmp]], #4\n\t" + "str w16, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_sw_%=:\n\t" "cmp x17, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_byte_%=\n\t" "ldrh w16, [%x[tag]], #2\n\t" "sub x17, x17, #2\n\t" - "strh w16, [%x[tmp]], #2\n\t" + "strh w16, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_byte_%=:\n\t" "cbz x17, L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_end_bytes_%=\n\t" "ldrb w16, [%x[tag]], #1\n\t" "subs x17, x17, #1\n\t" - "strb w16, [%x[tmp]], #1\n\t" + "strb w16, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], %x[tagSz]\n\t" - "ld1 {v28.2d}, [%x[tmp]]\n\t" + "sub x11, x11, %x[tagSz]\n\t" + "ld1 {v28.2d}, [x11]\n\t" "mov x17, #16\n\t" - "st1 {v26.2d}, [%x[tmp]]\n\t" + "st1 {v26.2d}, [x11]\n\t" "sub x17, x17, %x[tagSz]\n\t" - "add %x[tmp], %x[tmp], %x[tagSz]\n\t" + "add x11, x11, %x[tagSz]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_calc_tag_byte_%=:\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, [x11], #1\n\t" "subs x17, x17, #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_256_calc_tag_byte_%=\n\t" - "subs %x[tmp], %x[tmp], #16\n\t" - "ld1 {v26.2d}, [%x[tmp]]\n\t" + "subs x11, x11, #16\n\t" + "ld1 {v26.2d}, [x11]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_256_tag_loaded_%=:\n\t" "eor v28.16b, v28.16b, v26.16b\n\t" @@ -23327,7 +23360,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.lt 
L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_4_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -23360,7 +23393,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "mov v10.s[3], w17\n\t" "rev w16, w15\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -23377,7 +23410,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -23394,7 +23427,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -23411,7 +23444,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -23428,7 +23461,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -23445,7 +23478,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" 
"aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -23462,7 +23495,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -23480,7 +23513,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -23497,7 +23530,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v12.16b\n\t" @@ -23518,7 +23551,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -23547,14 +23580,14 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v9.16b, v9.16b, v1.16b\n\t" "eor v10.16b, v10.16b, v2.16b\n\t" "eor v11.16b, v11.16b, v3.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "st1 {v14.16b, v15.16b, v16.16b, v17.16b}, [%x[out]], #0x40\n\t" "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%x[out]], #0x40\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_8_%=\n\t" "\n" 
"L_aes_gcm_decrypt_arm64_crypto_eor3_128_both_8_%=:\n\t" - "ldr q12, [%x[key]]\n\t" + "ldr q12, [x9]\n\t" "add w24, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "add w23, w15, #2\n\t" @@ -23596,7 +23629,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "rev w16, w15\n\t" "eor v18.16b, v18.16b, v26.16b\n\t" "mov v11.s[3], w16\n\t" - "ldr q13, [%x[key], #16]\n\t" + "ldr q13, [x9, #16]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X = C * H^1 */ @@ -23626,7 +23659,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ext v26.16b, v2.16b, v2.16b, #8\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #32]\n\t" + "ldr q12, [x9, #32]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull v31.1q, v26.1d, v23.1d\n\t" @@ -23654,7 +23687,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v26.2d, v24.2d\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #48]\n\t" + "ldr q13, [x9, #48]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" @@ -23681,7 +23714,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor3 v30.16b, v30.16b, v26.16b, v31.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #64]\n\t" + "ldr q12, [x9, #64]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -23707,7 +23740,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #80]\n\t" + "ldr q13, [x9, #80]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" /* X += C * H^6 */ @@ -23736,7 +23769,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "pmull2 v26.1q, v6.2d, v19.2d\n\t" "aese v11.16b, v12.16b\n\t" 
"aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #96]\n\t" + "ldr q12, [x9, #96]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "eor v28.16b, v28.16b, v31.16b\n\t" @@ -23764,7 +23797,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v29.16b, v29.16b, v26.16b\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #112]\n\t" + "ldr q13, [x9, #112]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ext v26.16b, v18.16b, v18.16b, #8\n\t" @@ -23790,7 +23823,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" "subs w14, w14, #8\n\t" - "ldr q12, [%x[key], #128]\n\t" + "ldr q12, [x9, #128]\n\t" "aese v14.16b, v13.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "pmull2 v30.1q, v31.2d, v27.2d\n\t" @@ -23814,7 +23847,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "ld1 {v20.16b}, [%x[in]], #16\n\t" "aese v11.16b, v13.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q13, [%x[key], #144]\n\t" + "ldr q13, [x9, #144]\n\t" "aese v14.16b, v12.16b\n\t" "aesmc v14.16b, v14.16b\n\t" "ld1 {v21.16b}, [%x[in]], #16\n\t" @@ -23836,7 +23869,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "aesmc v10.16b, v10.16b\n\t" "aese v11.16b, v12.16b\n\t" "aesmc v11.16b, v11.16b\n\t" - "ldr q12, [%x[key], #160]\n\t" + "ldr q12, [x9, #160]\n\t" "aese v14.16b, v13.16b\n\t" "eor v14.16b, v14.16b, v12.16b\n\t" "aese v15.16b, v13.16b\n\t" @@ -23853,7 +23886,7 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "eor v10.16b, v10.16b, v12.16b\n\t" "aese v11.16b, v13.16b\n\t" "eor v11.16b, v11.16b, v12.16b\n\t" - "ld1 {v13.2d}, [%x[reg]]\n\t" + "ld1 {v13.2d}, [x12]\n\t" "eor v14.16b, v14.16b, v18.16b\n\t" "eor v15.16b, v15.16b, v19.16b\n\t" "eor v16.16b, v16.16b, v20.16b\n\t" @@ -23957,10 +23990,10 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done 
GHASH */ "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_4_%=:\n\t" - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[key]], #0x40\n\t" - "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[key]], #0x40\n\t" - "ld1 {v8.2d, v9.2d}, [%x[key]], #32\n\t" - "ld1 {v10.2d}, [%x[key]]\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x9], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x9], #0x40\n\t" + "ld1 {v8.2d, v9.2d}, [x9], #32\n\t" + "ld1 {v10.2d}, [x9]\n\t" "cmp w14, #1\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_128_done_%=\n\t" "b.eq L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_1_%=\n\t" @@ -24416,37 +24449,37 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b.eq L_aes_gcm_decrypt_arm64_crypto_eor3_128_partial_done_%=\n\t" "eor v15.16b, v15.16b, v15.16b\n\t" "mov w19, w14\n\t" - "st1 {v15.2d}, [%x[tmp]]\n\t" + "st1 {v15.2d}, [x11]\n\t" "cmp x19, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_dw_%=\n\t" "ldr x17, [%x[in]], #8\n\t" "sub x19, x19, #8\n\t" - "str x17, [%x[tmp]], #8\n\t" + "str x17, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_dw_%=:\n\t" "cmp x19, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_sw_%=\n\t" "ldr w17, [%x[in]], #4\n\t" "sub x19, x19, #4\n\t" - "str w17, [%x[tmp]], #4\n\t" + "str w17, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_sw_%=:\n\t" "cmp x19, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_byte_%=\n\t" "ldrh w17, [%x[in]], #2\n\t" "sub x19, x19, #2\n\t" - "strh w17, [%x[tmp]], #2\n\t" + "strh w17, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_byte_%=:\n\t" "cbz x19, L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_bytes_%=\n\t" "ldrb w17, [%x[in]], #1\n\t" "subs x19, x19, #1\n\t" - "strb w17, [%x[tmp]], #1\n\t" + "strb w17, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_128_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], x14\n\t" - "ld1 {v15.2d}, 
[%x[tmp]]\n\t" + "sub x11, x11, x14\n\t" + "ld1 {v15.2d}, [x11]\n\t" "add w15, w15, #1\n\t" "mov v14.16b, v13.16b\n\t" "rbit v15.16b, v15.16b\n\t" @@ -24490,30 +24523,30 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, /* Done GHASH */ "rbit v15.16b, v15.16b\n\t" "eor v14.16b, v14.16b, v15.16b\n\t" - "st1 {v14.2d}, [%x[tmp]]\n\t" + "st1 {v14.2d}, [x11]\n\t" "cmp w14, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_dw_%=\n\t" - "ldr x17, [%x[tmp]], #8\n\t" + "ldr x17, [x11], #8\n\t" "sub w14, w14, #8\n\t" "str x17, [%x[out]], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_dw_%=:\n\t" "cmp w14, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_sw_%=\n\t" - "ldr w17, [%x[tmp]], #4\n\t" + "ldr w17, [x11], #4\n\t" "sub w14, w14, #4\n\t" "str w17, [%x[out]], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_sw_%=:\n\t" "cmp w14, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_byte_%=\n\t" - "ldrh w17, [%x[tmp]], #2\n\t" + "ldrh w17, [x11], #2\n\t" "sub w14, w14, #2\n\t" "strh w17, [%x[out]], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_byte_%=:\n\t" "cbz w14, L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_end_bytes_%=\n\t" - "ldrb w17, [%x[tmp]], #1\n\t" + "ldrb w17, [x11], #1\n\t" "subs w14, w14, #1\n\t" "strb w17, [%x[out]], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_start_byte_%=\n\t" @@ -24521,11 +24554,11 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "L_aes_gcm_decrypt_arm64_crypto_eor3_128_out_end_bytes_%=:\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_partial_done_%=:\n\t" - "ld1 {v14.2d}, [%x[reg]]\n\t" - "ubfiz %x[aadSz], %x[aadSz], #3, #32\n\t" - "rbit %x[aadSz], %x[aadSz]\n\t" - "mov v28.d[0], %x[aadSz]\n\t" - "ubfiz %x[sz], %x[sz], #3, #32\n\t" + "ld1 {v14.2d}, [x12]\n\t" + "lsl x8, x8, #3\n\t" + "rbit x8, x8\n\t" + "mov v28.d[0], x8\n\t" + "lsl %x[sz], %x[sz], #3\n\t" "rbit %x[sz], %x[sz]\n\t" 
"mov v28.d[1], %x[sz]\n\t" "eor v26.16b, v26.16b, v28.16b\n\t" @@ -24570,51 +24603,50 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, "b L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_loaded_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_part_tag_%=:\n\t" - "ubfiz %x[tagSz], %x[tagSz], #0, #32\n\t" "eor v28.16b, v28.16b, v28.16b\n\t" "mov x17, %x[tagSz]\n\t" - "st1 {v28.2d}, [%x[tmp]]\n\t" + "st1 {v28.2d}, [x11]\n\t" "cmp x17, #8\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_dw_%=\n\t" "ldr x16, [%x[tag]], #8\n\t" "sub x17, x17, #8\n\t" - "str x16, [%x[tmp]], #8\n\t" + "str x16, [x11], #8\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_dw_%=:\n\t" "cmp x17, #4\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_sw_%=\n\t" "ldr w16, [%x[tag]], #4\n\t" "sub x17, x17, #4\n\t" - "str w16, [%x[tmp]], #4\n\t" + "str w16, [x11], #4\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_sw_%=:\n\t" "cmp x17, #2\n\t" "b.lt L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_byte_%=\n\t" "ldrh w16, [%x[tag]], #2\n\t" "sub x17, x17, #2\n\t" - "strh w16, [%x[tmp]], #2\n\t" + "strh w16, [x11], #2\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_byte_%=:\n\t" "cbz x17, L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_end_bytes_%=\n\t" "ldrb w16, [%x[tag]], #1\n\t" "subs x17, x17, #1\n\t" - "strb w16, [%x[tmp]], #1\n\t" + "strb w16, [x11], #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_start_byte_%=\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_end_bytes_%=:\n\t" - "sub %x[tmp], %x[tmp], %x[tagSz]\n\t" - "ld1 {v28.2d}, [%x[tmp]]\n\t" + "sub x11, x11, %x[tagSz]\n\t" + "ld1 {v28.2d}, [x11]\n\t" "mov x17, #16\n\t" - "st1 {v26.2d}, [%x[tmp]]\n\t" + "st1 {v26.2d}, [x11]\n\t" "sub x17, x17, %x[tagSz]\n\t" - "add %x[tmp], %x[tmp], %x[tagSz]\n\t" + "add x11, x11, %x[tagSz]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_calc_tag_byte_%=:\n\t" - "strb wzr, [%x[tmp]], #1\n\t" + "strb wzr, 
[x11], #1\n\t" "subs x17, x17, #1\n\t" "b.ne L_aes_gcm_decrypt_arm64_crypto_eor3_128_calc_tag_byte_%=\n\t" - "subs %x[tmp], %x[tmp], #16\n\t" - "ld1 {v26.2d}, [%x[tmp]]\n\t" + "subs x11, x11, #16\n\t" + "ld1 {v26.2d}, [x11]\n\t" "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_128_tag_loaded_%=:\n\t" "eor v28.16b, v28.16b, v26.16b\n\t" @@ -24628,11 +24660,12 @@ int AES_GCM_decrypt_AARCH64_EOR3(const byte* in, byte* out, word32 sz, #endif /* !NO_AES_128 */ "\n" "L_aes_gcm_decrypt_arm64_crypto_eor3_done_%=:\n\t" - : [in] "+r" (in), [out] "+r" (out), [sz] "+r" (sz), - [nonceSz] "+r" (nonceSz), [tagSz] "+r" (tagSz), [aadSz] "+r" (aadSz), - [key] "+r" (key), [gcm_h] "+r" (gcm_h), [tmp] "+r" (tmp), - [reg] "+r" (reg), [nr] "+r" (nr) - : [nonce] "r" (nonce), [tag] "r" (tag), [aad] "r" (aad) + "ldp x29, x30, [sp], #0x50\n\t" + : [out] "+r" (out), [sz] "+r" (sz), [nonceSz] "+r" (nonceSz), + [tagSz] "+r" (tagSz), [aadSz] "+r" (aadSz), [key] "+r" (key), + [gcm_h] "+r" (gcm_h), [tmp] "+r" (tmp), [reg] "+r" (reg), + [nr] "+r" (nr) + : [in] "r" (in), [nonce] "r" (nonce), [tag] "r" (tag), [aad] "r" (aad) : "memory", "cc", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", @@ -24748,7 +24781,7 @@ void AES_GCM_init_AARCH64(byte* key, int nr, const byte* nonce, word32 nonceSz, "\n" "L_aes_gcm_init_arm64_crypto_partial_done_%=:\n\t" "eor x7, x7, x7\n\t" - "ubfiz x13, %x[nonceSz], #3, #32\n\t" + "lsl x13, %x[nonceSz], #3\n\t" "mov v7.d[0], x7\n\t" "mov v7.d[1], x13\n\t" "rev64 v7.16b, v7.16b\n\t" @@ -25266,6 +25299,8 @@ void AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out, const byte* in, word32 nbytes, byte* tag, byte* h, byte* counter) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ld1 {v13.2d}, [%x[counter]]\n\t" "movi v27.16b, #0x87\n\t" "ld1 {v26.2d}, [%x[tag]]\n\t" @@ -29078,6 +29113,7 @@ void 
AES_GCM_encrypt_update_AARCH64(const byte* key, int nr, byte* out, "mov v13.s[3], w9\n\t" "st1 {v26.2d}, [%x[tag]]\n\t" "st1 {v13.2d}, [%x[counter]]\n\t" + "ldp x29, x30, [sp], #32\n\t" : [nr] "+r" (nr), [out] "+r" (out), [nbytes] "+r" (nbytes), [tag] "+r" (tag), [h] "+r" (h), [counter] "+r" (counter) : [key] "r" (key), [in] "r" (in) @@ -29098,10 +29134,10 @@ void AES_GCM_encrypt_final_AARCH64(byte* tag, byte* authTag, word32 tbytes, "ld1 {v4.2d}, [%x[h]]\n\t" "ushr v6.2d, v6.2d, #56\n\t" "ld1 {v7.2d}, [%x[initCtr]]\n\t" - "ubfiz %x[abytes], %x[abytes], #3, #32\n\t" + "lsl %x[abytes], %x[abytes], #3\n\t" "rbit %x[abytes], %x[abytes]\n\t" "mov v0.d[0], %x[abytes]\n\t" - "ubfiz %x[nbytes], %x[nbytes], #3, #32\n\t" + "lsl %x[nbytes], %x[nbytes], #3\n\t" "rbit %x[nbytes], %x[nbytes]\n\t" "mov v0.d[1], %x[nbytes]\n\t" "eor v5.16b, v5.16b, v0.16b\n\t" @@ -29171,6 +29207,8 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out, const byte* in, word32 nbytes, byte* tag, byte* h, byte* counter) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ld1 {v13.2d}, [%x[counter]]\n\t" "movi v27.16b, #0x87\n\t" "ld1 {v26.2d}, [%x[tag]]\n\t" @@ -32984,6 +33022,7 @@ void AES_GCM_decrypt_update_AARCH64(const byte* key, int nr, byte* out, "mov v13.s[3], w9\n\t" "st1 {v26.2d}, [%x[tag]]\n\t" "st1 {v13.2d}, [%x[counter]]\n\t" + "ldp x29, x30, [sp], #32\n\t" : [nr] "+r" (nr), [out] "+r" (out), [nbytes] "+r" (nbytes), [tag] "+r" (tag), [h] "+r" (h), [counter] "+r" (counter) : [key] "r" (key), [in] "r" (in) @@ -33000,15 +33039,17 @@ void AES_GCM_decrypt_final_AARCH64(byte* tag, const byte* authTag, int* res) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ld1 {v5.2d}, [%x[tag]]\n\t" "movi v6.16b, #0x87\n\t" "ld1 {v4.2d}, [%x[h]]\n\t" "ushr v6.2d, v6.2d, #56\n\t" "ld1 {v7.2d}, [%x[initCtr]]\n\t" - "ubfiz %x[abytes], %x[abytes], #3, #32\n\t" + "lsl %x[abytes], %x[abytes], #3\n\t" "rbit %x[abytes], 
%x[abytes]\n\t" "mov v0.d[0], %x[abytes]\n\t" - "ubfiz %x[nbytes], %x[nbytes], #3, #32\n\t" + "lsl %x[nbytes], %x[nbytes], #3\n\t" "rbit %x[nbytes], %x[nbytes]\n\t" "mov v0.d[1], %x[nbytes]\n\t" "eor v5.16b, v5.16b, v0.16b\n\t" @@ -33034,7 +33075,6 @@ void AES_GCM_decrypt_final_AARCH64(byte* tag, const byte* authTag, "b L_aes_gcm_decrypt_final_arm64_crypto_tag_loaded_%=\n\t" "\n" "L_aes_gcm_decrypt_final_arm64_crypto_part_tag_%=:\n\t" - "ubfiz %x[tbytes], %x[tbytes], #0, #32\n\t" "eor v0.16b, v0.16b, v0.16b\n\t" "mov x10, %x[tbytes]\n\t" "st1 {v0.2d}, [%x[tag]]\n\t" @@ -33091,6 +33131,7 @@ void AES_GCM_decrypt_final_AARCH64(byte* tag, const byte* authTag, "and x8, x8, x11\n\t" "add w8, w8, #0xb4\n\t" "str w8, [%x[res]]\n\t" + "ldp x29, x30, [sp], #32\n\t" : [tag] "+r" (tag), [tbytes] "+r" (tbytes), [nbytes] "+r" (nbytes), [abytes] "+r" (abytes), [h] "+r" (h), [initCtr] "+r" (initCtr), [res] "+r" (res) @@ -33202,7 +33243,7 @@ void AES_GCM_init_AARCH64_EOR3(byte* key, int nr, const byte* nonce, "\n" "L_aes_gcm_init_arm64_crypto_eor3_partial_done_%=:\n\t" "eor x7, x7, x7\n\t" - "ubfiz x13, %x[nonceSz], #3, #32\n\t" + "lsl x13, %x[nonceSz], #3\n\t" "mov v7.d[0], x7\n\t" "mov v7.d[1], x13\n\t" "rev64 v7.16b, v7.16b\n\t" @@ -33700,6 +33741,8 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out, const byte* in, word32 nbytes, byte* tag, byte* h, byte* counter) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ld1 {v13.2d}, [%x[counter]]\n\t" "movi v27.16b, #0x87\n\t" "ld1 {v26.2d}, [%x[tag]]\n\t" @@ -37428,6 +37471,7 @@ void AES_GCM_encrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out, "mov v13.s[3], w9\n\t" "st1 {v26.2d}, [%x[tag]]\n\t" "st1 {v13.2d}, [%x[counter]]\n\t" + "ldp x29, x30, [sp], #32\n\t" : [nr] "+r" (nr), [out] "+r" (out), [nbytes] "+r" (nbytes), [tag] "+r" (tag), [h] "+r" (h), [counter] "+r" (counter) : [key] "r" (key), [in] "r" (in) @@ -37448,10 +37492,10 @@ void 
AES_GCM_encrypt_final_AARCH64_EOR3(byte* tag, byte* authTag, word32 tbytes, "ld1 {v4.2d}, [%x[h]]\n\t" "ushr v6.2d, v6.2d, #56\n\t" "ld1 {v7.2d}, [%x[initCtr]]\n\t" - "ubfiz %x[abytes], %x[abytes], #3, #32\n\t" + "lsl %x[abytes], %x[abytes], #3\n\t" "rbit %x[abytes], %x[abytes]\n\t" "mov v0.d[0], %x[abytes]\n\t" - "ubfiz %x[nbytes], %x[nbytes], #3, #32\n\t" + "lsl %x[nbytes], %x[nbytes], #3\n\t" "rbit %x[nbytes], %x[nbytes]\n\t" "mov v0.d[1], %x[nbytes]\n\t" "eor v5.16b, v5.16b, v0.16b\n\t" @@ -37520,6 +37564,8 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out, const byte* in, word32 nbytes, byte* tag, byte* h, byte* counter) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ld1 {v13.2d}, [%x[counter]]\n\t" "movi v27.16b, #0x87\n\t" "ld1 {v26.2d}, [%x[tag]]\n\t" @@ -41249,6 +41295,7 @@ void AES_GCM_decrypt_update_AARCH64_EOR3(const byte* key, int nr, byte* out, "mov v13.s[3], w9\n\t" "st1 {v26.2d}, [%x[tag]]\n\t" "st1 {v13.2d}, [%x[counter]]\n\t" + "ldp x29, x30, [sp], #32\n\t" : [nr] "+r" (nr), [out] "+r" (out), [nbytes] "+r" (nbytes), [tag] "+r" (tag), [h] "+r" (h), [counter] "+r" (counter) : [key] "r" (key), [in] "r" (in) @@ -41265,15 +41312,17 @@ void AES_GCM_decrypt_final_AARCH64_EOR3(byte* tag, const byte* authTag, int* res) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ld1 {v5.2d}, [%x[tag]]\n\t" "movi v6.16b, #0x87\n\t" "ld1 {v4.2d}, [%x[h]]\n\t" "ushr v6.2d, v6.2d, #56\n\t" "ld1 {v7.2d}, [%x[initCtr]]\n\t" - "ubfiz %x[abytes], %x[abytes], #3, #32\n\t" + "lsl %x[abytes], %x[abytes], #3\n\t" "rbit %x[abytes], %x[abytes]\n\t" "mov v0.d[0], %x[abytes]\n\t" - "ubfiz %x[nbytes], %x[nbytes], #3, #32\n\t" + "lsl %x[nbytes], %x[nbytes], #3\n\t" "rbit %x[nbytes], %x[nbytes]\n\t" "mov v0.d[1], %x[nbytes]\n\t" "eor v5.16b, v5.16b, v0.16b\n\t" @@ -41298,7 +41347,6 @@ void AES_GCM_decrypt_final_AARCH64_EOR3(byte* tag, const byte* authTag, "b 
L_aes_gcm_decrypt_final_arm64_crypto_eor3_tag_loaded_%=\n\t" "\n" "L_aes_gcm_decrypt_final_arm64_crypto_eor3_part_tag_%=:\n\t" - "ubfiz %x[tbytes], %x[tbytes], #0, #32\n\t" "eor v0.16b, v0.16b, v0.16b\n\t" "mov x10, %x[tbytes]\n\t" "st1 {v0.2d}, [%x[tag]]\n\t" @@ -41355,6 +41403,7 @@ void AES_GCM_decrypt_final_AARCH64_EOR3(byte* tag, const byte* authTag, "and x8, x8, x11\n\t" "add w8, w8, #0xb4\n\t" "str w8, [%x[res]]\n\t" + "ldp x29, x30, [sp], #32\n\t" : [tag] "+r" (tag), [tbytes] "+r" (tbytes), [nbytes] "+r" (nbytes), [abytes] "+r" (abytes), [h] "+r" (h), [initCtr] "+r" (initCtr), [res] "+r" (res) @@ -41372,6 +41421,8 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz, const byte* i, byte* key, byte* key2, byte* tmp, int nr) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [%x[key2]], #0x40\n\t" "ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [%x[key2]], #0x40\n\t" "ld1 {v4.16b}, [%x[i]]\n\t" @@ -42381,6 +42432,7 @@ void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, word32 sz, #endif /* !NO_AES_128 */ "\n" "L_aes_xts_encrypt_arm64_crypto_done_%=:\n\t" + "ldp x29, x30, [sp], #32\n\t" : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key), [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr) : [in] "r" (in), [i] "r" (i) @@ -42396,6 +42448,8 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz, const byte* i, byte* key, byte* key2, byte* tmp, int nr) { __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [%x[key2]], #0x40\n\t" "ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [%x[key2]], #0x40\n\t" "ld1 {v4.16b}, [%x[i]]\n\t" @@ -43500,6 +43554,7 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz, #endif /* !NO_AES_128 */ "\n" "L_aes_xts_decrypt_arm64_crypto_done_%=:\n\t" + "ldp x29, x30, [sp], #32\n\t" : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key), [key2] "+r" (key2), [tmp] "+r" (tmp), 
[nr] "+r" (nr) : [in] "r" (in), [i] "r" (i) @@ -43512,223 +43567,1282 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz, #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AES_XTS */ -#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ -#ifndef WOLFSSL_ARMASM_NO_NEON -#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \ - defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ - defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) -XALIGNED(4) static const word8 L_AES_ARM64_NEON_te[] = { - 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, - 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, - 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, - 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, - 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, - 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, - 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, - 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, - 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, - 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, - 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, - 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, - 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, - 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, - 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, - 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, - 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, - 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, - 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, - 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, - 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, - 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, - 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, - 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, - 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, - 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, - 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, - 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, - 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, - 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 
0x28, 0xdf, - 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, - 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, -}; - -XALIGNED(4) static const word8 L_AES_ARM64_NEON_shift_rows_shuffle[] = { - 0x0c, 0x09, 0x06, 0x03, 0x00, 0x0d, 0x0a, 0x07, - 0x04, 0x01, 0x0e, 0x0b, 0x08, 0x05, 0x02, 0x0f, -}; - -#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || - * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ -#ifdef HAVE_AES_DECRYPT -void AES_invert_key_NEON(unsigned char* ks, word32 rounds); -void AES_invert_key_NEON(unsigned char* ks, word32 rounds) +#ifdef WOLFSSL_AESGCM_SIV +void AES_GCMSIV_polyval_pmull(unsigned char* s, const unsigned char* h, + const unsigned char* data, unsigned int blocks) { __asm__ __volatile__ ( - "add x3, %x[ks], %x[rounds], lsl 4\n\t" - "mov x2, %x[ks]\n\t" - "mov w4, %w[rounds]\n\t" - "\n" - "L_AES_invert_key_NEON_loop_%=:\n\t" - "ld1 {v0.2d}, [x2]\n\t" - "ld1 {v1.2d}, [x3]\n\t" - "st1 {v0.2d}, [x3]\n\t" - "st1 {v1.2d}, [x2], #16\n\t" - "subs w4, w4, #2\n\t" - "sub x3, x3, #16\n\t" - "b.ne L_AES_invert_key_NEON_loop_%=\n\t" - "movi v2.16b, #27\n\t" - "add x2, %x[ks], #16\n\t" - "sub w4, %w[rounds], #1\n\t" - "\n" - "L_AES_invert_key_NEON_mix_loop_%=:\n\t" - "ld1 {v0.2d}, [x2]\n\t" - "sshr v5.16b, v0.16b, #7\n\t" - "ushr v6.16b, v0.16b, #6\n\t" - "ushr v3.16b, v0.16b, #5\n\t" - "and v5.16b, v5.16b, v2.16b\n\t" - "pmul v6.16b, v6.16b, v2.16b\n\t" - "pmul v3.16b, v3.16b, v2.16b\n\t" - "shl v4.16b, v0.16b, #1\n\t" - "eor v5.16b, v5.16b, v4.16b\n\t" - "shl v4.16b, v0.16b, #3\n\t" - "eor v3.16b, v3.16b, v4.16b\n\t" - "shl v4.16b, v0.16b, #2\n\t" - "eor v6.16b, v6.16b, v4.16b\n\t" - "eor v4.16b, v5.16b, v3.16b\n\t" - "eor v3.16b, v3.16b, v0.16b\n\t" - "eor v5.16b, v6.16b, v3.16b\n\t" - "eor v6.16b, v6.16b, v4.16b\n\t" - "eor v4.16b, v4.16b, v0.16b\n\t" - "shl v0.4s, v4.4s, #8\n\t" - "rev32 v5.8h, v5.8h\n\t" - "sri v0.4s, v4.4s, #24\n\t" - "eor v0.16b, v0.16b, v6.16b\n\t" - "shl v4.4s, v3.4s, #24\n\t" - "eor v0.16b, v0.16b, 
v5.16b\n\t" - "sri v4.4s, v3.4s, #8\n\t" - "eor v0.16b, v0.16b, v4.16b\n\t" - "st1 {v0.2d}, [x2], #16\n\t" - "subs w4, w4, #1\n\t" - "b.ne L_AES_invert_key_NEON_mix_loop_%=\n\t" - : [ks] "+r" (ks), [rounds] "+r" (rounds) - : - : "memory", "cc", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", - "v6" + "movi v2.16b, #0\n\t" + "ld1 {v1.16b}, [%x[h]]\n\t" + "ld1 {v0.16b}, [%x[s]]\n\t" + "rev64 v0.16b, v0.16b\n\t" + "ext v0.16b, v0.16b, v0.16b, #8\n\t" + "pmull v18.1q, v1.1d, v1.1d\n\t" + "pmull2 v19.1q, v1.2d, v1.2d\n\t" + "ext v22.16b, v1.16b, v1.16b, #8\n\t" + "pmull v23.1q, v22.1d, v1.1d\n\t" + "pmull2 v22.1q, v22.2d, v1.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v18.16b, v18.16b, v23.16b\n\t" + "eor v19.16b, v19.16b, v22.16b\n\t" + "ushr v24.4s, v18.4s, #31\n\t" + "ushr v25.4s, v19.4s, #31\n\t" + "shl v18.4s, v18.4s, #1\n\t" + "shl v19.4s, v19.4s, #1\n\t" + "ext v26.16b, v24.16b, v2.16b, #12\n\t" + "ext v25.16b, v2.16b, v25.16b, #12\n\t" + "ext v24.16b, v2.16b, v24.16b, #12\n\t" + "orr v18.16b, v18.16b, v24.16b\n\t" + "orr v19.16b, v19.16b, v25.16b\n\t" + "orr v19.16b, v19.16b, v26.16b\n\t" + "shl v24.4s, v18.4s, #31\n\t" + "shl v25.4s, v18.4s, #30\n\t" + "shl v26.4s, v18.4s, #25\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "ext v25.16b, v24.16b, v2.16b, #4\n\t" + "ext v24.16b, v2.16b, v24.16b, #4\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "ushr v24.4s, v18.4s, #1\n\t" + "ushr v26.4s, v18.4s, #2\n\t" + "ushr v27.4s, v18.4s, #7\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "eor v24.16b, v24.16b, v27.16b\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "eor v3.16b, v19.16b, v18.16b\n\t" + "pmull v18.1q, v3.1d, v1.1d\n\t" + "pmull2 v19.1q, v3.2d, v1.2d\n\t" + "ext v22.16b, v3.16b, v3.16b, #8\n\t" + "pmull v23.1q, v22.1d, v1.1d\n\t" + "pmull2 v22.1q, v22.2d, v1.2d\n\t" + "eor v22.16b, v22.16b, 
v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v18.16b, v18.16b, v23.16b\n\t" + "eor v19.16b, v19.16b, v22.16b\n\t" + "ushr v24.4s, v18.4s, #31\n\t" + "ushr v25.4s, v19.4s, #31\n\t" + "shl v18.4s, v18.4s, #1\n\t" + "shl v19.4s, v19.4s, #1\n\t" + "ext v26.16b, v24.16b, v2.16b, #12\n\t" + "ext v25.16b, v2.16b, v25.16b, #12\n\t" + "ext v24.16b, v2.16b, v24.16b, #12\n\t" + "orr v18.16b, v18.16b, v24.16b\n\t" + "orr v19.16b, v19.16b, v25.16b\n\t" + "orr v19.16b, v19.16b, v26.16b\n\t" + "shl v24.4s, v18.4s, #31\n\t" + "shl v25.4s, v18.4s, #30\n\t" + "shl v26.4s, v18.4s, #25\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "ext v25.16b, v24.16b, v2.16b, #4\n\t" + "ext v24.16b, v2.16b, v24.16b, #4\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "ushr v24.4s, v18.4s, #1\n\t" + "ushr v26.4s, v18.4s, #2\n\t" + "ushr v27.4s, v18.4s, #7\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "eor v24.16b, v24.16b, v27.16b\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "eor v4.16b, v19.16b, v18.16b\n\t" + "pmull v18.1q, v3.1d, v3.1d\n\t" + "pmull2 v19.1q, v3.2d, v3.2d\n\t" + "ext v22.16b, v3.16b, v3.16b, #8\n\t" + "pmull v23.1q, v22.1d, v3.1d\n\t" + "pmull2 v22.1q, v22.2d, v3.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v18.16b, v18.16b, v23.16b\n\t" + "eor v19.16b, v19.16b, v22.16b\n\t" + "ushr v24.4s, v18.4s, #31\n\t" + "ushr v25.4s, v19.4s, #31\n\t" + "shl v18.4s, v18.4s, #1\n\t" + "shl v19.4s, v19.4s, #1\n\t" + "ext v26.16b, v24.16b, v2.16b, #12\n\t" + "ext v25.16b, v2.16b, v25.16b, #12\n\t" + "ext v24.16b, v2.16b, v24.16b, #12\n\t" + "orr v18.16b, v18.16b, v24.16b\n\t" + "orr v19.16b, v19.16b, v25.16b\n\t" + "orr v19.16b, v19.16b, v26.16b\n\t" + "shl v24.4s, v18.4s, #31\n\t" + "shl v25.4s, v18.4s, #30\n\t" + "shl v26.4s, v18.4s, #25\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + 
"eor v24.16b, v24.16b, v26.16b\n\t" + "ext v25.16b, v24.16b, v2.16b, #4\n\t" + "ext v24.16b, v2.16b, v24.16b, #4\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "ushr v24.4s, v18.4s, #1\n\t" + "ushr v26.4s, v18.4s, #2\n\t" + "ushr v27.4s, v18.4s, #7\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "eor v24.16b, v24.16b, v27.16b\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "eor v5.16b, v19.16b, v18.16b\n\t" + "pmull v18.1q, v5.1d, v1.1d\n\t" + "pmull2 v19.1q, v5.2d, v1.2d\n\t" + "ext v22.16b, v5.16b, v5.16b, #8\n\t" + "pmull v23.1q, v22.1d, v1.1d\n\t" + "pmull2 v22.1q, v22.2d, v1.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v18.16b, v18.16b, v23.16b\n\t" + "eor v19.16b, v19.16b, v22.16b\n\t" + "ushr v24.4s, v18.4s, #31\n\t" + "ushr v25.4s, v19.4s, #31\n\t" + "shl v18.4s, v18.4s, #1\n\t" + "shl v19.4s, v19.4s, #1\n\t" + "ext v26.16b, v24.16b, v2.16b, #12\n\t" + "ext v25.16b, v2.16b, v25.16b, #12\n\t" + "ext v24.16b, v2.16b, v24.16b, #12\n\t" + "orr v18.16b, v18.16b, v24.16b\n\t" + "orr v19.16b, v19.16b, v25.16b\n\t" + "orr v19.16b, v19.16b, v26.16b\n\t" + "shl v24.4s, v18.4s, #31\n\t" + "shl v25.4s, v18.4s, #30\n\t" + "shl v26.4s, v18.4s, #25\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "ext v25.16b, v24.16b, v2.16b, #4\n\t" + "ext v24.16b, v2.16b, v24.16b, #4\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "ushr v24.4s, v18.4s, #1\n\t" + "ushr v26.4s, v18.4s, #2\n\t" + "ushr v27.4s, v18.4s, #7\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "eor v24.16b, v24.16b, v27.16b\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "eor v6.16b, v19.16b, v18.16b\n\t" + "pmull v18.1q, v5.1d, v3.1d\n\t" + "pmull2 v19.1q, v5.2d, v3.2d\n\t" + "ext v22.16b, v5.16b, v5.16b, #8\n\t" + "pmull v23.1q, v22.1d, v3.1d\n\t" + "pmull2 v22.1q, v22.2d, v3.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext 
v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v18.16b, v18.16b, v23.16b\n\t" + "eor v19.16b, v19.16b, v22.16b\n\t" + "ushr v24.4s, v18.4s, #31\n\t" + "ushr v25.4s, v19.4s, #31\n\t" + "shl v18.4s, v18.4s, #1\n\t" + "shl v19.4s, v19.4s, #1\n\t" + "ext v26.16b, v24.16b, v2.16b, #12\n\t" + "ext v25.16b, v2.16b, v25.16b, #12\n\t" + "ext v24.16b, v2.16b, v24.16b, #12\n\t" + "orr v18.16b, v18.16b, v24.16b\n\t" + "orr v19.16b, v19.16b, v25.16b\n\t" + "orr v19.16b, v19.16b, v26.16b\n\t" + "shl v24.4s, v18.4s, #31\n\t" + "shl v25.4s, v18.4s, #30\n\t" + "shl v26.4s, v18.4s, #25\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "ext v25.16b, v24.16b, v2.16b, #4\n\t" + "ext v24.16b, v2.16b, v24.16b, #4\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "ushr v24.4s, v18.4s, #1\n\t" + "ushr v26.4s, v18.4s, #2\n\t" + "ushr v27.4s, v18.4s, #7\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "eor v24.16b, v24.16b, v27.16b\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "eor v7.16b, v19.16b, v18.16b\n\t" + "pmull v18.1q, v5.1d, v4.1d\n\t" + "pmull2 v19.1q, v5.2d, v4.2d\n\t" + "ext v22.16b, v5.16b, v5.16b, #8\n\t" + "pmull v23.1q, v22.1d, v4.1d\n\t" + "pmull2 v22.1q, v22.2d, v4.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v18.16b, v18.16b, v23.16b\n\t" + "eor v19.16b, v19.16b, v22.16b\n\t" + "ushr v24.4s, v18.4s, #31\n\t" + "ushr v25.4s, v19.4s, #31\n\t" + "shl v18.4s, v18.4s, #1\n\t" + "shl v19.4s, v19.4s, #1\n\t" + "ext v26.16b, v24.16b, v2.16b, #12\n\t" + "ext v25.16b, v2.16b, v25.16b, #12\n\t" + "ext v24.16b, v2.16b, v24.16b, #12\n\t" + "orr v18.16b, v18.16b, v24.16b\n\t" + "orr v19.16b, v19.16b, v25.16b\n\t" + "orr v19.16b, v19.16b, v26.16b\n\t" + "shl v24.4s, v18.4s, #31\n\t" + "shl v25.4s, v18.4s, #30\n\t" + "shl v26.4s, v18.4s, #25\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v24.16b, 
v24.16b, v26.16b\n\t" + "ext v25.16b, v24.16b, v2.16b, #4\n\t" + "ext v24.16b, v2.16b, v24.16b, #4\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "ushr v24.4s, v18.4s, #1\n\t" + "ushr v26.4s, v18.4s, #2\n\t" + "ushr v27.4s, v18.4s, #7\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "eor v24.16b, v24.16b, v27.16b\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "eor v8.16b, v19.16b, v18.16b\n\t" + "pmull v18.1q, v5.1d, v5.1d\n\t" + "pmull2 v19.1q, v5.2d, v5.2d\n\t" + "ext v22.16b, v5.16b, v5.16b, #8\n\t" + "pmull v23.1q, v22.1d, v5.1d\n\t" + "pmull2 v22.1q, v22.2d, v5.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v18.16b, v18.16b, v23.16b\n\t" + "eor v19.16b, v19.16b, v22.16b\n\t" + "ushr v24.4s, v18.4s, #31\n\t" + "ushr v25.4s, v19.4s, #31\n\t" + "shl v18.4s, v18.4s, #1\n\t" + "shl v19.4s, v19.4s, #1\n\t" + "ext v26.16b, v24.16b, v2.16b, #12\n\t" + "ext v25.16b, v2.16b, v25.16b, #12\n\t" + "ext v24.16b, v2.16b, v24.16b, #12\n\t" + "orr v18.16b, v18.16b, v24.16b\n\t" + "orr v19.16b, v19.16b, v25.16b\n\t" + "orr v19.16b, v19.16b, v26.16b\n\t" + "shl v24.4s, v18.4s, #31\n\t" + "shl v25.4s, v18.4s, #30\n\t" + "shl v26.4s, v18.4s, #25\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "ext v25.16b, v24.16b, v2.16b, #4\n\t" + "ext v24.16b, v2.16b, v24.16b, #4\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "ushr v24.4s, v18.4s, #1\n\t" + "ushr v26.4s, v18.4s, #2\n\t" + "ushr v27.4s, v18.4s, #7\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "eor v24.16b, v24.16b, v27.16b\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "eor v9.16b, v19.16b, v18.16b\n\t" + "\n" + "L_AES_GCMSIV_polyval_pmull_loop8_%=:\n\t" + "cmp %w[blocks], #8\n\t" + "b.lt L_AES_GCMSIV_polyval_pmull_done8_%=\n\t" + "ld1 {v10.16b, v11.16b, v12.16b, v13.16b}, [%x[data]], #0x40\n\t" + "ld1 {v14.16b, v15.16b, v16.16b, v17.16b}, 
[%x[data]], #0x40\n\t" + "eor v10.16b, v10.16b, v0.16b\n\t" + "pmull v18.1q, v10.1d, v9.1d\n\t" + "pmull2 v19.1q, v10.2d, v9.2d\n\t" + "ext v22.16b, v10.16b, v10.16b, #8\n\t" + "pmull v23.1q, v22.1d, v9.1d\n\t" + "pmull2 v22.1q, v22.2d, v9.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v18.16b, v18.16b, v23.16b\n\t" + "eor v19.16b, v19.16b, v22.16b\n\t" + "pmull v20.1q, v11.1d, v8.1d\n\t" + "pmull2 v21.1q, v11.2d, v8.2d\n\t" + "ext v22.16b, v11.16b, v11.16b, #8\n\t" + "pmull v23.1q, v22.1d, v8.1d\n\t" + "pmull2 v22.1q, v22.2d, v8.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v20.16b, v20.16b, v23.16b\n\t" + "eor v21.16b, v21.16b, v22.16b\n\t" + "eor v18.16b, v18.16b, v20.16b\n\t" + "eor v19.16b, v19.16b, v21.16b\n\t" + "pmull v20.1q, v12.1d, v7.1d\n\t" + "pmull2 v21.1q, v12.2d, v7.2d\n\t" + "ext v22.16b, v12.16b, v12.16b, #8\n\t" + "pmull v23.1q, v22.1d, v7.1d\n\t" + "pmull2 v22.1q, v22.2d, v7.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v20.16b, v20.16b, v23.16b\n\t" + "eor v21.16b, v21.16b, v22.16b\n\t" + "eor v18.16b, v18.16b, v20.16b\n\t" + "eor v19.16b, v19.16b, v21.16b\n\t" + "pmull v20.1q, v13.1d, v6.1d\n\t" + "pmull2 v21.1q, v13.2d, v6.2d\n\t" + "ext v22.16b, v13.16b, v13.16b, #8\n\t" + "pmull v23.1q, v22.1d, v6.1d\n\t" + "pmull2 v22.1q, v22.2d, v6.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v20.16b, v20.16b, v23.16b\n\t" + "eor v21.16b, v21.16b, v22.16b\n\t" + "eor v18.16b, v18.16b, v20.16b\n\t" + "eor v19.16b, v19.16b, v21.16b\n\t" + "pmull v20.1q, v14.1d, v5.1d\n\t" + "pmull2 v21.1q, v14.2d, v5.2d\n\t" + "ext v22.16b, v14.16b, v14.16b, #8\n\t" + "pmull v23.1q, v22.1d, v5.1d\n\t" + "pmull2 v22.1q, 
v22.2d, v5.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v20.16b, v20.16b, v23.16b\n\t" + "eor v21.16b, v21.16b, v22.16b\n\t" + "eor v18.16b, v18.16b, v20.16b\n\t" + "eor v19.16b, v19.16b, v21.16b\n\t" + "pmull v20.1q, v15.1d, v4.1d\n\t" + "pmull2 v21.1q, v15.2d, v4.2d\n\t" + "ext v22.16b, v15.16b, v15.16b, #8\n\t" + "pmull v23.1q, v22.1d, v4.1d\n\t" + "pmull2 v22.1q, v22.2d, v4.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v20.16b, v20.16b, v23.16b\n\t" + "eor v21.16b, v21.16b, v22.16b\n\t" + "eor v18.16b, v18.16b, v20.16b\n\t" + "eor v19.16b, v19.16b, v21.16b\n\t" + "pmull v20.1q, v16.1d, v3.1d\n\t" + "pmull2 v21.1q, v16.2d, v3.2d\n\t" + "ext v22.16b, v16.16b, v16.16b, #8\n\t" + "pmull v23.1q, v22.1d, v3.1d\n\t" + "pmull2 v22.1q, v22.2d, v3.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v20.16b, v20.16b, v23.16b\n\t" + "eor v21.16b, v21.16b, v22.16b\n\t" + "eor v18.16b, v18.16b, v20.16b\n\t" + "eor v19.16b, v19.16b, v21.16b\n\t" + "pmull v20.1q, v17.1d, v1.1d\n\t" + "pmull2 v21.1q, v17.2d, v1.2d\n\t" + "ext v22.16b, v17.16b, v17.16b, #8\n\t" + "pmull v23.1q, v22.1d, v1.1d\n\t" + "pmull2 v22.1q, v22.2d, v1.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v20.16b, v20.16b, v23.16b\n\t" + "eor v21.16b, v21.16b, v22.16b\n\t" + "eor v18.16b, v18.16b, v20.16b\n\t" + "eor v19.16b, v19.16b, v21.16b\n\t" + "ushr v24.4s, v18.4s, #31\n\t" + "ushr v25.4s, v19.4s, #31\n\t" + "shl v18.4s, v18.4s, #1\n\t" + "shl v19.4s, v19.4s, #1\n\t" + "ext v26.16b, v24.16b, v2.16b, #12\n\t" + "ext v25.16b, v2.16b, v25.16b, #12\n\t" + "ext v24.16b, v2.16b, v24.16b, #12\n\t" + "orr v18.16b, v18.16b, v24.16b\n\t" + "orr v19.16b, v19.16b, 
v25.16b\n\t" + "orr v19.16b, v19.16b, v26.16b\n\t" + "shl v24.4s, v18.4s, #31\n\t" + "shl v25.4s, v18.4s, #30\n\t" + "shl v26.4s, v18.4s, #25\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "ext v25.16b, v24.16b, v2.16b, #4\n\t" + "ext v24.16b, v2.16b, v24.16b, #4\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "ushr v24.4s, v18.4s, #1\n\t" + "ushr v26.4s, v18.4s, #2\n\t" + "ushr v27.4s, v18.4s, #7\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "eor v24.16b, v24.16b, v27.16b\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "eor v0.16b, v19.16b, v18.16b\n\t" + "sub %w[blocks], %w[blocks], #8\n\t" + "b L_AES_GCMSIV_polyval_pmull_loop8_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_pmull_done8_%=:\n\t" + "\n" + "L_AES_GCMSIV_polyval_pmull_loop4_%=:\n\t" + "cmp %w[blocks], #4\n\t" + "b.lt L_AES_GCMSIV_polyval_pmull_done4_%=\n\t" + "ld1 {v10.16b, v11.16b, v12.16b, v13.16b}, [%x[data]], #0x40\n\t" + "eor v10.16b, v10.16b, v0.16b\n\t" + "pmull v18.1q, v10.1d, v5.1d\n\t" + "pmull2 v19.1q, v10.2d, v5.2d\n\t" + "ext v22.16b, v10.16b, v10.16b, #8\n\t" + "pmull v23.1q, v22.1d, v5.1d\n\t" + "pmull2 v22.1q, v22.2d, v5.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v18.16b, v18.16b, v23.16b\n\t" + "eor v19.16b, v19.16b, v22.16b\n\t" + "pmull v20.1q, v11.1d, v4.1d\n\t" + "pmull2 v21.1q, v11.2d, v4.2d\n\t" + "ext v22.16b, v11.16b, v11.16b, #8\n\t" + "pmull v23.1q, v22.1d, v4.1d\n\t" + "pmull2 v22.1q, v22.2d, v4.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v20.16b, v20.16b, v23.16b\n\t" + "eor v21.16b, v21.16b, v22.16b\n\t" + "eor v18.16b, v18.16b, v20.16b\n\t" + "eor v19.16b, v19.16b, v21.16b\n\t" + "pmull v20.1q, v12.1d, v3.1d\n\t" + "pmull2 v21.1q, v12.2d, v3.2d\n\t" + "ext v22.16b, v12.16b, v12.16b, #8\n\t" + "pmull v23.1q, v22.1d, v3.1d\n\t" + 
"pmull2 v22.1q, v22.2d, v3.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v20.16b, v20.16b, v23.16b\n\t" + "eor v21.16b, v21.16b, v22.16b\n\t" + "eor v18.16b, v18.16b, v20.16b\n\t" + "eor v19.16b, v19.16b, v21.16b\n\t" + "pmull v20.1q, v13.1d, v1.1d\n\t" + "pmull2 v21.1q, v13.2d, v1.2d\n\t" + "ext v22.16b, v13.16b, v13.16b, #8\n\t" + "pmull v23.1q, v22.1d, v1.1d\n\t" + "pmull2 v22.1q, v22.2d, v1.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v20.16b, v20.16b, v23.16b\n\t" + "eor v21.16b, v21.16b, v22.16b\n\t" + "eor v18.16b, v18.16b, v20.16b\n\t" + "eor v19.16b, v19.16b, v21.16b\n\t" + "ushr v24.4s, v18.4s, #31\n\t" + "ushr v25.4s, v19.4s, #31\n\t" + "shl v18.4s, v18.4s, #1\n\t" + "shl v19.4s, v19.4s, #1\n\t" + "ext v26.16b, v24.16b, v2.16b, #12\n\t" + "ext v25.16b, v2.16b, v25.16b, #12\n\t" + "ext v24.16b, v2.16b, v24.16b, #12\n\t" + "orr v18.16b, v18.16b, v24.16b\n\t" + "orr v19.16b, v19.16b, v25.16b\n\t" + "orr v19.16b, v19.16b, v26.16b\n\t" + "shl v24.4s, v18.4s, #31\n\t" + "shl v25.4s, v18.4s, #30\n\t" + "shl v26.4s, v18.4s, #25\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "ext v25.16b, v24.16b, v2.16b, #4\n\t" + "ext v24.16b, v2.16b, v24.16b, #4\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "ushr v24.4s, v18.4s, #1\n\t" + "ushr v26.4s, v18.4s, #2\n\t" + "ushr v27.4s, v18.4s, #7\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "eor v24.16b, v24.16b, v27.16b\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "eor v0.16b, v19.16b, v18.16b\n\t" + "sub %w[blocks], %w[blocks], #4\n\t" + "b L_AES_GCMSIV_polyval_pmull_loop4_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_pmull_done4_%=:\n\t" + "\n" + "L_AES_GCMSIV_polyval_pmull_rem_%=:\n\t" + "cbz %w[blocks], L_AES_GCMSIV_polyval_pmull_done_%=\n\t" + "ld1 {v10.16b}, [%x[data]], #16\n\t" + 
"eor v0.16b, v0.16b, v10.16b\n\t" + "pmull v18.1q, v0.1d, v1.1d\n\t" + "pmull2 v19.1q, v0.2d, v1.2d\n\t" + "ext v22.16b, v0.16b, v0.16b, #8\n\t" + "pmull v23.1q, v22.1d, v1.1d\n\t" + "pmull2 v22.1q, v22.2d, v1.2d\n\t" + "eor v22.16b, v22.16b, v23.16b\n\t" + "ext v23.16b, v2.16b, v22.16b, #8\n\t" + "ext v22.16b, v22.16b, v2.16b, #8\n\t" + "eor v18.16b, v18.16b, v23.16b\n\t" + "eor v19.16b, v19.16b, v22.16b\n\t" + "ushr v24.4s, v18.4s, #31\n\t" + "ushr v25.4s, v19.4s, #31\n\t" + "shl v18.4s, v18.4s, #1\n\t" + "shl v19.4s, v19.4s, #1\n\t" + "ext v26.16b, v24.16b, v2.16b, #12\n\t" + "ext v25.16b, v2.16b, v25.16b, #12\n\t" + "ext v24.16b, v2.16b, v24.16b, #12\n\t" + "orr v18.16b, v18.16b, v24.16b\n\t" + "orr v19.16b, v19.16b, v25.16b\n\t" + "orr v19.16b, v19.16b, v26.16b\n\t" + "shl v24.4s, v18.4s, #31\n\t" + "shl v25.4s, v18.4s, #30\n\t" + "shl v26.4s, v18.4s, #25\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "ext v25.16b, v24.16b, v2.16b, #4\n\t" + "ext v24.16b, v2.16b, v24.16b, #4\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "ushr v24.4s, v18.4s, #1\n\t" + "ushr v26.4s, v18.4s, #2\n\t" + "ushr v27.4s, v18.4s, #7\n\t" + "eor v24.16b, v24.16b, v26.16b\n\t" + "eor v24.16b, v24.16b, v27.16b\n\t" + "eor v24.16b, v24.16b, v25.16b\n\t" + "eor v18.16b, v18.16b, v24.16b\n\t" + "eor v0.16b, v19.16b, v18.16b\n\t" + "subs %w[blocks], %w[blocks], #1\n\t" + "b.ne L_AES_GCMSIV_polyval_pmull_rem_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_pmull_done_%=:\n\t" + "rev64 v0.16b, v0.16b\n\t" + "ext v0.16b, v0.16b, v0.16b, #8\n\t" + "st1 {v0.16b}, [%x[s]]\n\t" + : [s] "+r" (s), [blocks] "+r" (blocks) + : [h] "r" (h), [data] "r" (data) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27" ); } -#endif /* HAVE_AES_DECRYPT */ -XALIGNED(8) static const word32 L_AES_ARM64_NEON_rcon[] = { - 0x01000000, 
0x02000000, 0x04000000, 0x08000000, - 0x10000000, 0x20000000, 0x40000000, 0x80000000, - 0x1b000000, 0x36000000 -}; - -void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len, - unsigned char* ks); -void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len, - unsigned char* ks) +void AES_GCMSIV_ctr_aarch64(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, unsigned char* ctr) { - const word32* rcon = L_AES_ARM64_NEON_rcon; - const word8* te = L_AES_ARM64_NEON_te; __asm__ __volatile__ ( - "ld1 {v6.16b, v7.16b, v8.16b, v9.16b}, [%[te]], #0x40\n\t" - "ld1 {v10.16b, v11.16b, v12.16b, v13.16b}, [%[te]], #0x40\n\t" - "ld1 {v14.16b, v15.16b, v16.16b, v17.16b}, [%[te]], #0x40\n\t" - "ld1 {v18.16b, v19.16b, v20.16b, v21.16b}, [%[te]]\n\t" - "movi v2.16b, #0x40\n\t" - "movi v3.16b, #0x80\n\t" - "movi v4.16b, #0xc0\n\t" - "movi v5.16b, #27\n\t" - "eor v26.16b, v26.16b, v26.16b\n\t" - "cmp %w[len], #0x80\n\t" - "b.eq L_AES_set_encrypt_key_NEON_start_128_%=\n\t" - "cmp %w[len], #0xc0\n\t" - "b.eq L_AES_set_encrypt_key_NEON_start_192_%=\n\t" - "ld1 {v0.16b}, [%x[key]], #16\n\t" - "ld1 {v1.16b}, [%x[key]]\n\t" - "rev32 v0.16b, v0.16b\n\t" - "rev32 v1.16b, v1.16b\n\t" - "st1 {v0.2d}, [%x[ks]], #16\n\t" - "st1 {v1.2d}, [%x[ks]], #16\n\t" - "mov x3, #6\n\t" + "ld1 {v15.2d}, [%x[ctr]]\n\t" + "mov w7, v15.s[0]\n\t" + "lsr x6, %x[length], #4\n\t" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[KS]], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[KS]], #0x40\n\t" + "cmp %w[nr], #12\n\t" + "b.lt L_AES_GCMSIV_ctr_aarch64_start_128_%=\n\t" + "b.gt L_AES_GCMSIV_ctr_aarch64_start_256_%=\n\t" + /* AES_GCMSIV_CTR_192 */ +#ifndef NO_AES_192 + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [%x[KS]], #0x40\n\t" + "ld1 {v12.2d}, [%x[KS]]\n\t" + "\n" + "L_AES_GCMSIV_ctr_aarch64_192_loop4_%=:\n\t" + "cmp x6, #4\n\t" + "b.lt L_AES_GCMSIV_ctr_aarch64_192_done4_%=\n\t" + "mov v16.16b, v15.16b\n\t" + "mov v16.s[0], w7\n\t" + "mov v17.16b, 
v15.16b\n\t" + "add w8, w7, #1\n\t" + "mov v17.s[0], w8\n\t" + "mov v18.16b, v15.16b\n\t" + "add w8, w7, #2\n\t" + "mov v18.s[0], w8\n\t" + "mov v19.16b, v15.16b\n\t" + "add w8, w7, #3\n\t" + "mov v19.s[0], w8\n\t" + "add w7, w7, #4\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[in]], #0x40\n\t" + "aese v16.16b, v0.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v0.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v0.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v0.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v1.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v1.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v1.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v1.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v2.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v2.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v2.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v2.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v3.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v3.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v3.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v3.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v4.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v4.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v4.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v4.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v5.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v5.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v5.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v5.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v6.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v6.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v6.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v6.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese 
v16.16b, v7.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v7.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v7.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v7.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v8.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v8.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v8.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v8.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v9.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v9.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v9.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v9.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v10.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v10.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v10.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v10.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v11.16b\n\t" + "eor v16.16b, v16.16b, v12.16b\n\t" + "aese v17.16b, v11.16b\n\t" + "eor v17.16b, v17.16b, v12.16b\n\t" + "aese v18.16b, v11.16b\n\t" + "eor v18.16b, v18.16b, v12.16b\n\t" + "aese v19.16b, v11.16b\n\t" + "eor v19.16b, v19.16b, v12.16b\n\t" + "eor v20.16b, v20.16b, v16.16b\n\t" + "eor v21.16b, v21.16b, v17.16b\n\t" + "eor v22.16b, v22.16b, v18.16b\n\t" + "eor v23.16b, v23.16b, v19.16b\n\t" + "st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[out]], #0x40\n\t" + "sub x6, x6, #4\n\t" + "cmp x6, #4\n\t" + "b.ge L_AES_GCMSIV_ctr_aarch64_192_loop4_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_aarch64_192_done4_%=:\n\t" + "\n" + "L_AES_GCMSIV_ctr_aarch64_192_loop1_%=:\n\t" + "cbz x6, L_AES_GCMSIV_ctr_aarch64_192_done1_%=\n\t" + "mov v16.16b, v15.16b\n\t" + "mov v16.s[0], w7\n\t" + "add w7, w7, #1\n\t" + "ld1 {v20.16b}, [%x[in]], #16\n\t" + "aese v16.16b, v0.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v1.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v2.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + 
"aese v16.16b, v3.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v4.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v5.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v6.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v7.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v8.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v9.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v10.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v11.16b\n\t" + "eor v16.16b, v16.16b, v12.16b\n\t" + "eor v20.16b, v20.16b, v16.16b\n\t" + "st1 {v20.16b}, [%x[out]], #16\n\t" + "sub x6, x6, #1\n\t" + "b L_AES_GCMSIV_ctr_aarch64_192_loop1_%=\n\t" "\n" - "L_AES_set_encrypt_key_NEON_loop_256_%=:\n\t" - "eor v22.16b, v1.16b, v2.16b\n\t" - "eor v23.16b, v1.16b, v3.16b\n\t" - "eor v24.16b, v1.16b, v4.16b\n\t" - "tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v1.16b\n\t" - "tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b\n\t" - "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" - "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" - "orr v25.16b, v25.16b, v22.16b\n\t" - "orr v23.16b, v23.16b, v24.16b\n\t" - "orr v25.16b, v25.16b, v23.16b\n\t" - "ext v25.16b, v25.16b, v26.16b, #12\n\t" - "shl v22.4s, v25.4s, #8\n\t" - "sri v22.4s, v25.4s, #24\n\t" - "eor v0.16b, v0.16b, v22.16b\n\t" - "ld1r {v25.4s}, [%[rcon]], #4\n\t" - "dup v22.4s, v0.s[0]\n\t" - "dup v23.2s, v0.s[1]\n\t" - "dup v24.2s, v0.s[2]\n\t" - "ext v22.16b, v26.16b, v22.16b, #12\n\t" - "ext v23.16b, v26.16b, v23.16b, #8\n\t" - "eor v0.16b, v0.16b, v22.16b\n\t" - "ext v24.16b, v26.16b, v24.16b, #4\n\t" - "eor v0.16b, v0.16b, v23.16b\n\t" - "eor v0.16b, v0.16b, v24.16b\n\t" - "eor v0.16b, v0.16b, v25.16b\n\t" - "st1 {v0.2d}, [%x[ks]], #16\n\t" - "eor v22.16b, v0.16b, v2.16b\n\t" - "eor v23.16b, v0.16b, v3.16b\n\t" - "eor v24.16b, v0.16b, v4.16b\n\t" - "tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v0.16b\n\t" - "tbl v22.16b, {v10.16b, 
v11.16b, v12.16b, v13.16b}, v22.16b\n\t" - "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" - "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" - "orr v25.16b, v25.16b, v22.16b\n\t" - "orr v23.16b, v23.16b, v24.16b\n\t" - "orr v25.16b, v25.16b, v23.16b\n\t" - "ext v25.16b, v25.16b, v26.16b, #12\n\t" - "eor v1.16b, v1.16b, v25.16b\n\t" - "dup v22.4s, v1.s[0]\n\t" - "dup v23.2s, v1.s[1]\n\t" - "dup v24.2s, v1.s[2]\n\t" - "ext v22.16b, v26.16b, v22.16b, #12\n\t" - "ext v23.16b, v26.16b, v23.16b, #8\n\t" - "eor v1.16b, v1.16b, v22.16b\n\t" - "ext v24.16b, v26.16b, v24.16b, #4\n\t" - "eor v1.16b, v1.16b, v23.16b\n\t" - "eor v1.16b, v1.16b, v24.16b\n\t" - "st1 {v1.2d}, [%x[ks]], #16\n\t" - "subs x3, x3, #1\n\t" - "b.ne L_AES_set_encrypt_key_NEON_loop_256_%=\n\t" - "eor v22.16b, v1.16b, v2.16b\n\t" - "eor v23.16b, v1.16b, v3.16b\n\t" - "eor v24.16b, v1.16b, v4.16b\n\t" - "tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v1.16b\n\t" - "tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b\n\t" - "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" - "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" - "orr v25.16b, v25.16b, v22.16b\n\t" - "orr v23.16b, v23.16b, v24.16b\n\t" - "orr v25.16b, v25.16b, v23.16b\n\t" - "ext v25.16b, v25.16b, v26.16b, #12\n\t" - "shl v22.4s, v25.4s, #8\n\t" - "sri v22.4s, v25.4s, #24\n\t" - "eor v0.16b, v0.16b, v22.16b\n\t" - "ld1r {v25.4s}, [%[rcon]], #4\n\t" - "dup v22.4s, v0.s[0]\n\t" - "dup v23.2s, v0.s[1]\n\t" - "dup v24.2s, v0.s[2]\n\t" - "ext v22.16b, v26.16b, v22.16b, #12\n\t" - "ext v23.16b, v26.16b, v23.16b, #8\n\t" - "eor v0.16b, v0.16b, v22.16b\n\t" - "ext v24.16b, v26.16b, v24.16b, #4\n\t" - "eor v0.16b, v0.16b, v23.16b\n\t" - "eor v0.16b, v0.16b, v24.16b\n\t" - "eor v0.16b, v0.16b, v25.16b\n\t" + "L_AES_GCMSIV_ctr_aarch64_192_done1_%=:\n\t" +#endif /* !NO_AES_192 */ + "b L_AES_GCMSIV_ctr_aarch64_done_%=\n\t" + /* AES_GCMSIV_CTR_256 */ + "\n" + 
"L_AES_GCMSIV_ctr_aarch64_start_256_%=:\n\t" +#ifndef NO_AES_256 + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [%x[KS]], #0x40\n\t" + "ld1 {v12.2d, v13.2d}, [%x[KS]], #32\n\t" + "ld1 {v14.2d}, [%x[KS]]\n\t" + "\n" + "L_AES_GCMSIV_ctr_aarch64_256_loop4_%=:\n\t" + "cmp x6, #4\n\t" + "b.lt L_AES_GCMSIV_ctr_aarch64_256_done4_%=\n\t" + "mov v16.16b, v15.16b\n\t" + "mov v16.s[0], w7\n\t" + "mov v17.16b, v15.16b\n\t" + "add w8, w7, #1\n\t" + "mov v17.s[0], w8\n\t" + "mov v18.16b, v15.16b\n\t" + "add w8, w7, #2\n\t" + "mov v18.s[0], w8\n\t" + "mov v19.16b, v15.16b\n\t" + "add w8, w7, #3\n\t" + "mov v19.s[0], w8\n\t" + "add w7, w7, #4\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[in]], #0x40\n\t" + "aese v16.16b, v0.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v0.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v0.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v0.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v1.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v1.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v1.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v1.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v2.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v2.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v2.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v2.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v3.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v3.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v3.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v3.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v4.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v4.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v4.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v4.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v5.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v5.16b\n\t" + "aesmc 
v17.16b, v17.16b\n\t" + "aese v18.16b, v5.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v5.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v6.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v6.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v6.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v6.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v7.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v7.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v7.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v7.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v8.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v8.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v8.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v8.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v9.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v9.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v9.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v9.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v10.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v10.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v10.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v10.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v11.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v11.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v11.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v11.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v12.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v12.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v12.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v12.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v13.16b\n\t" + "eor v16.16b, v16.16b, v14.16b\n\t" + "aese v17.16b, v13.16b\n\t" + "eor v17.16b, v17.16b, v14.16b\n\t" + "aese v18.16b, v13.16b\n\t" 
+ "eor v18.16b, v18.16b, v14.16b\n\t" + "aese v19.16b, v13.16b\n\t" + "eor v19.16b, v19.16b, v14.16b\n\t" + "eor v20.16b, v20.16b, v16.16b\n\t" + "eor v21.16b, v21.16b, v17.16b\n\t" + "eor v22.16b, v22.16b, v18.16b\n\t" + "eor v23.16b, v23.16b, v19.16b\n\t" + "st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[out]], #0x40\n\t" + "sub x6, x6, #4\n\t" + "cmp x6, #4\n\t" + "b.ge L_AES_GCMSIV_ctr_aarch64_256_loop4_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_aarch64_256_done4_%=:\n\t" + "\n" + "L_AES_GCMSIV_ctr_aarch64_256_loop1_%=:\n\t" + "cbz x6, L_AES_GCMSIV_ctr_aarch64_256_done1_%=\n\t" + "mov v16.16b, v15.16b\n\t" + "mov v16.s[0], w7\n\t" + "add w7, w7, #1\n\t" + "ld1 {v20.16b}, [%x[in]], #16\n\t" + "aese v16.16b, v0.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v1.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v2.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v3.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v4.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v5.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v6.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v7.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v8.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v9.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v10.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v11.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v12.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v13.16b\n\t" + "eor v16.16b, v16.16b, v14.16b\n\t" + "eor v20.16b, v20.16b, v16.16b\n\t" + "st1 {v20.16b}, [%x[out]], #16\n\t" + "sub x6, x6, #1\n\t" + "b L_AES_GCMSIV_ctr_aarch64_256_loop1_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_aarch64_256_done1_%=:\n\t" +#endif /* !NO_AES_256 */ + "b L_AES_GCMSIV_ctr_aarch64_done_%=\n\t" + /* AES_GCMSIV_CTR_128 */ + "\n" + "L_AES_GCMSIV_ctr_aarch64_start_128_%=:\n\t" +#ifndef NO_AES_128 + "ld1 {v8.2d, v9.2d}, [%x[KS]], #32\n\t" + "ld1 {v10.2d}, [%x[KS]]\n\t" + "\n" + 
"L_AES_GCMSIV_ctr_aarch64_128_loop4_%=:\n\t" + "cmp x6, #4\n\t" + "b.lt L_AES_GCMSIV_ctr_aarch64_128_done4_%=\n\t" + "mov v16.16b, v15.16b\n\t" + "mov v16.s[0], w7\n\t" + "mov v17.16b, v15.16b\n\t" + "add w8, w7, #1\n\t" + "mov v17.s[0], w8\n\t" + "mov v18.16b, v15.16b\n\t" + "add w8, w7, #2\n\t" + "mov v18.s[0], w8\n\t" + "mov v19.16b, v15.16b\n\t" + "add w8, w7, #3\n\t" + "mov v19.s[0], w8\n\t" + "add w7, w7, #4\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[in]], #0x40\n\t" + "aese v16.16b, v0.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v0.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v0.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v0.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v1.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v1.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v1.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v1.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v2.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v2.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v2.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v2.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v3.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v3.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v3.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v3.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v4.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v4.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v4.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v4.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v5.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v5.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v5.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v5.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v6.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese 
v17.16b, v6.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v6.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v6.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v7.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v7.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v7.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v7.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v8.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v17.16b, v8.16b\n\t" + "aesmc v17.16b, v17.16b\n\t" + "aese v18.16b, v8.16b\n\t" + "aesmc v18.16b, v18.16b\n\t" + "aese v19.16b, v8.16b\n\t" + "aesmc v19.16b, v19.16b\n\t" + "aese v16.16b, v9.16b\n\t" + "eor v16.16b, v16.16b, v10.16b\n\t" + "aese v17.16b, v9.16b\n\t" + "eor v17.16b, v17.16b, v10.16b\n\t" + "aese v18.16b, v9.16b\n\t" + "eor v18.16b, v18.16b, v10.16b\n\t" + "aese v19.16b, v9.16b\n\t" + "eor v19.16b, v19.16b, v10.16b\n\t" + "eor v20.16b, v20.16b, v16.16b\n\t" + "eor v21.16b, v21.16b, v17.16b\n\t" + "eor v22.16b, v22.16b, v18.16b\n\t" + "eor v23.16b, v23.16b, v19.16b\n\t" + "st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[out]], #0x40\n\t" + "sub x6, x6, #4\n\t" + "cmp x6, #4\n\t" + "b.ge L_AES_GCMSIV_ctr_aarch64_128_loop4_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_aarch64_128_done4_%=:\n\t" + "\n" + "L_AES_GCMSIV_ctr_aarch64_128_loop1_%=:\n\t" + "cbz x6, L_AES_GCMSIV_ctr_aarch64_128_done1_%=\n\t" + "mov v16.16b, v15.16b\n\t" + "mov v16.s[0], w7\n\t" + "add w7, w7, #1\n\t" + "ld1 {v20.16b}, [%x[in]], #16\n\t" + "aese v16.16b, v0.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v1.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v2.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v3.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v4.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v5.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v6.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v7.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese 
v16.16b, v8.16b\n\t" + "aesmc v16.16b, v16.16b\n\t" + "aese v16.16b, v9.16b\n\t" + "eor v16.16b, v16.16b, v10.16b\n\t" + "eor v20.16b, v20.16b, v16.16b\n\t" + "st1 {v20.16b}, [%x[out]], #16\n\t" + "sub x6, x6, #1\n\t" + "b L_AES_GCMSIV_ctr_aarch64_128_loop1_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_aarch64_128_done1_%=:\n\t" +#endif /* !NO_AES_128 */ + "\n" + "L_AES_GCMSIV_ctr_aarch64_done_%=:\n\t" + "mov v15.s[0], w7\n\t" + "st1 {v15.2d}, [%x[ctr]]\n\t" + : [out] "+r" (out), [length] "+r" (length), [nr] "+r" (nr), + [ctr] "+r" (ctr) + : [in] "r" (in), [KS] "r" (KS) + : "memory", "cc", "x6", "x7", "x8", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + ); +} + +#endif /* WOLFSSL_AESGCM_SIV */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#ifndef WOLFSSL_ARMASM_NO_NEON +#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \ + defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +XALIGNED(4) static const word8 L_AES_ARM64_NEON_te[] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, + 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, + 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, + 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, + 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, + 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, + 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, + 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, + 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 
0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, + 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, + 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, + 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, + 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, + 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, + 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, + 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, +}; /* end of L_AES_ARM64_NEON_te: its 256 bytes are the AES forward S-box */ + +/* 16-byte index vector holding each of 0..15 exactly once. By its name it is the ShiftRows byte shuffle used as a tbl index; NOTE(review): its consumers are not in this part of the file, confirm there. */ XALIGNED(4) static const word8 L_AES_ARM64_NEON_shift_rows_shuffle[] = { + 0x0c, 0x09, 0x06, 0x03, 0x00, 0x0d, 0x0a, 0x07, + 0x04, 0x01, 0x0e, 0x0b, 0x08, 0x05, 0x02, 0x0f, +}; + +#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || + * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +/* AES_invert_key_NEON: rewrite the AES key schedule at ks, in place, into the form used for decryption. ks: round keys of 16 bytes each, keys 0 .. rounds are accessed. rounds: number of AES rounds. Step 1 reverses the order of the round keys by swapping from both ends; the counter drops by 2 per swap and the loop exits only when it reaches exactly zero, so rounds must be even and non-zero. Step 2 applies the InvMixColumns transform (GF(2^8) multiples 9, 11, 13 and 14 of each byte) to round keys 1 .. rounds - 1; keys 0 and rounds are left as swapped. NOTE(review): the first add uses %x[rounds], the 64-bit view of a 32-bit operand, so it relies on the upper register bits being zero; confirm or switch to a uxtw-extended add. */ void AES_invert_key_NEON(unsigned char* ks, word32 rounds); +void AES_invert_key_NEON(unsigned char* ks, word32 rounds) +{ + __asm__ __volatile__ ( + /* x3 = ks + rounds * 16 (last round key), x2 = ks (first round key), w4 = swap counter */ "add x3, %x[ks], %x[rounds], lsl 4\n\t" + "mov x2, %x[ks]\n\t" + "mov w4, %w[rounds]\n\t" + "\n" + "L_AES_invert_key_NEON_loop_%=:\n\t" + /* swap the 16-byte round keys at x2 and x3, then move both pointers toward the middle */ "ld1 {v0.2d}, [x2]\n\t" + "ld1 {v1.2d}, [x3]\n\t" + "st1 {v0.2d}, [x3]\n\t" + "st1 {v1.2d}, [x2], #16\n\t" + "subs w4, w4, #2\n\t" + "sub x3, x3, #16\n\t" + "b.ne L_AES_invert_key_NEON_loop_%=\n\t" + /* v2 = 0x1b in every byte (AES reduction constant); x2 = round key 1; w4 = rounds - 1 keys to transform */ "movi v2.16b, #27\n\t" + "add x2, %x[ks], #16\n\t" + "sub w4, %w[rounds], #1\n\t" + "\n" + "L_AES_invert_key_NEON_mix_loop_%=:\n\t" + "ld1 {v0.2d}, [x2]\n\t" + /* with k = v0: build v5 = 2k, v6 = 4k, v3 = 8k in GF(2^8) as the shifted byte xor the 0x1b-reduced overflow bits */ "sshr v5.16b, v0.16b, #7\n\t" + "ushr v6.16b, v0.16b, #6\n\t" + "ushr v3.16b, v0.16b, #5\n\t" + "and v5.16b, v5.16b, v2.16b\n\t" + "pmul v6.16b, v6.16b, v2.16b\n\t" + "pmul v3.16b, v3.16b, v2.16b\n\t" + "shl v4.16b, v0.16b, #1\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "shl v4.16b, v0.16b, 
#3\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "shl v4.16b, v0.16b, #2\n\t" + "eor v6.16b, v6.16b, v4.16b\n\t" + /* combine the multiples: v3 = 9k, v4 = 11k, v5 = 13k, v6 = 14k */ "eor v4.16b, v5.16b, v3.16b\n\t" + "eor v3.16b, v3.16b, v0.16b\n\t" + "eor v5.16b, v6.16b, v3.16b\n\t" + "eor v6.16b, v6.16b, v4.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + /* per 32-bit word: v0 = rotl(11k, 8) ^ 14k ^ rot(13k, 16) ^ rotl(9k, 24) */ "shl v0.4s, v4.4s, #8\n\t" + "rev32 v5.8h, v5.8h\n\t" + "sri v0.4s, v4.4s, #24\n\t" + "eor v0.16b, v0.16b, v6.16b\n\t" + "shl v4.4s, v3.4s, #24\n\t" + "eor v0.16b, v0.16b, v5.16b\n\t" + "sri v4.4s, v3.4s, #8\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + /* write the transformed round key back and advance to the next one */ "st1 {v0.2d}, [x2], #16\n\t" + "subs w4, w4, #1\n\t" + "b.ne L_AES_invert_key_NEON_mix_loop_%=\n\t" + : [ks] "+r" (ks), [rounds] "+r" (rounds) + : + : "memory", "cc", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", + "v6" + ); +} + +#endif /* HAVE_AES_DECRYPT */ +/* Ten AES key-expansion round constants (0x01 .. 0x36), each held in the top byte of a 32-bit word; AES_set_encrypt_key_NEON reads them through its rcon pointer. */ XALIGNED(8) static const word32 L_AES_ARM64_NEON_rcon[] = { + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1b000000, 0x36000000 +}; + +void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len, + unsigned char* ks); +void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len, + unsigned char* ks) +{ + const word32* rcon = L_AES_ARM64_NEON_rcon; + const word8* te = L_AES_ARM64_NEON_te; + __asm__ __volatile__ ( + "ld1 {v6.16b, v7.16b, v8.16b, v9.16b}, [%[te]], #0x40\n\t" + "ld1 {v10.16b, v11.16b, v12.16b, v13.16b}, [%[te]], #0x40\n\t" + "ld1 {v14.16b, v15.16b, v16.16b, v17.16b}, [%[te]], #0x40\n\t" + "ld1 {v18.16b, v19.16b, v20.16b, v21.16b}, [%[te]]\n\t" + "movi v2.16b, #0x40\n\t" + "movi v3.16b, #0x80\n\t" + "movi v4.16b, #0xc0\n\t" + "movi v5.16b, #27\n\t" + "eor v26.16b, v26.16b, v26.16b\n\t" + "cmp %w[len], #0x80\n\t" + "b.eq L_AES_set_encrypt_key_NEON_start_128_%=\n\t" + "cmp %w[len], #0xc0\n\t" + "b.eq L_AES_set_encrypt_key_NEON_start_192_%=\n\t" + "ld1 {v0.16b}, [%x[key]], #16\n\t" + "ld1 {v1.16b}, [%x[key]]\n\t" + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "st1 {v0.2d}, [%x[ks]], 
#16\n\t" + "st1 {v1.2d}, [%x[ks]], #16\n\t" + "mov x3, #6\n\t" + "\n" + "L_AES_set_encrypt_key_NEON_loop_256_%=:\n\t" + "eor v22.16b, v1.16b, v2.16b\n\t" + "eor v23.16b, v1.16b, v3.16b\n\t" + "eor v24.16b, v1.16b, v4.16b\n\t" + "tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v1.16b\n\t" + "tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b\n\t" + "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" + "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" + "orr v25.16b, v25.16b, v22.16b\n\t" + "orr v23.16b, v23.16b, v24.16b\n\t" + "orr v25.16b, v25.16b, v23.16b\n\t" + "ext v25.16b, v25.16b, v26.16b, #12\n\t" + "shl v22.4s, v25.4s, #8\n\t" + "sri v22.4s, v25.4s, #24\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ld1r {v25.4s}, [%[rcon]], #4\n\t" + "dup v22.4s, v0.s[0]\n\t" + "dup v23.2s, v0.s[1]\n\t" + "dup v24.2s, v0.s[2]\n\t" + "ext v22.16b, v26.16b, v22.16b, #12\n\t" + "ext v23.16b, v26.16b, v23.16b, #8\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ext v24.16b, v26.16b, v24.16b, #4\n\t" + "eor v0.16b, v0.16b, v23.16b\n\t" + "eor v0.16b, v0.16b, v24.16b\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "st1 {v0.2d}, [%x[ks]], #16\n\t" + "eor v22.16b, v0.16b, v2.16b\n\t" + "eor v23.16b, v0.16b, v3.16b\n\t" + "eor v24.16b, v0.16b, v4.16b\n\t" + "tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v0.16b\n\t" + "tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b\n\t" + "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" + "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" + "orr v25.16b, v25.16b, v22.16b\n\t" + "orr v23.16b, v23.16b, v24.16b\n\t" + "orr v25.16b, v25.16b, v23.16b\n\t" + "ext v25.16b, v25.16b, v26.16b, #12\n\t" + "eor v1.16b, v1.16b, v25.16b\n\t" + "dup v22.4s, v1.s[0]\n\t" + "dup v23.2s, v1.s[1]\n\t" + "dup v24.2s, v1.s[2]\n\t" + "ext v22.16b, v26.16b, v22.16b, #12\n\t" + "ext v23.16b, v26.16b, v23.16b, #8\n\t" + "eor v1.16b, v1.16b, v22.16b\n\t" + "ext v24.16b, v26.16b, v24.16b, #4\n\t" + "eor v1.16b, 
v1.16b, v23.16b\n\t" + "eor v1.16b, v1.16b, v24.16b\n\t" + "st1 {v1.2d}, [%x[ks]], #16\n\t" + "subs x3, x3, #1\n\t" + "b.ne L_AES_set_encrypt_key_NEON_loop_256_%=\n\t" + "eor v22.16b, v1.16b, v2.16b\n\t" + "eor v23.16b, v1.16b, v3.16b\n\t" + "eor v24.16b, v1.16b, v4.16b\n\t" + "tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v1.16b\n\t" + "tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b\n\t" + "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" + "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" + "orr v25.16b, v25.16b, v22.16b\n\t" + "orr v23.16b, v23.16b, v24.16b\n\t" + "orr v25.16b, v25.16b, v23.16b\n\t" + "ext v25.16b, v25.16b, v26.16b, #12\n\t" + "shl v22.4s, v25.4s, #8\n\t" + "sri v22.4s, v25.4s, #24\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ld1r {v25.4s}, [%[rcon]], #4\n\t" + "dup v22.4s, v0.s[0]\n\t" + "dup v23.2s, v0.s[1]\n\t" + "dup v24.2s, v0.s[2]\n\t" + "ext v22.16b, v26.16b, v22.16b, #12\n\t" + "ext v23.16b, v26.16b, v23.16b, #8\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ext v24.16b, v26.16b, v24.16b, #4\n\t" + "eor v0.16b, v0.16b, v23.16b\n\t" + "eor v0.16b, v0.16b, v24.16b\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" "st1 {v0.2d}, [%x[ks]], #16\n\t" "b L_AES_set_encrypt_key_NEON_end_%=\n\t" "\n" @@ -48851,6 +49965,8 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, const word8* te = L_AES_ARM64_NEON_te; const word8* shuffle = L_AES_ARM64_NEON_shift_rows_shuffle; __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[te]], #0x40\n\t" "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[te]], #0x40\n\t" "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[te]], #0x40\n\t" @@ -49896,6 +51012,7 @@ void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, "st1 {v0.16b}, [%x[out]]\n\t" "\n" "L_AES_XTS_encrypt_NEON_data_done_%=:\n\t" + "ldp x29, x30, [sp], #32\n\t" : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" 
(key), [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr) : [in] "r" (in), [i] "r" (i), [te] "r" (te), [shuffle] "r" (shuffle) @@ -49916,6 +51033,8 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, const word8* shuffle = L_AES_ARM64_NEON_shift_rows_shuffle; const word8* invshuffle = L_AES_ARM64_NEON_shift_rows_invshuffle; __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[te]], #0x40\n\t" "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[te]], #0x40\n\t" "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[te]], #0x40\n\t" @@ -50903,18 +52022,1543 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, "shl v11.4s, v1.4s, #24\n\t" "eor v4.16b, v4.16b, v8.16b\n\t" "eor v5.16b, v5.16b, v9.16b\n\t" - "sri v10.4s, v0.4s, #8\n\t" - "sri v11.4s, v1.4s, #8\n\t" - "eor v4.16b, v4.16b, v10.16b\n\t" - "eor v5.16b, v5.16b, v11.16b\n\t" - /* XOR in Key Schedule */ - "ld1 {v0.2d}, [x25], #16\n\t" + "sri v10.4s, v0.4s, #8\n\t" + "sri v11.4s, v1.4s, #8\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x25], #16\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + /* Round Done */ + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + 
"eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x25], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "eor v1.16b, v1.16b, v3.16b\n\t" + "st1 {v0.16b, v1.16b}, [%x[out]], #32\n\t" + "and x16, x17, x11, asr 63\n\t" + "extr x9, x11, x10, #63\n\t" + "eor x8, x16, x10, lsl 1\n\t" + "sub %w[sz], %w[sz], #32\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_start_1_%=:\n\t" + "ld1 {v3.2d}, [%[invshuffle]]\n\t" + "mov v2.d[0], x8\n\t" + "mov v2.d[1], x9\n\t" + "cmp %w[sz], #16\n\t" + "b.lt L_AES_XTS_decrypt_NEON_start_partial_%=\n\t" + "mov x25, %x[key]\n\t" + "ld1 {v0.16b}, [%x[in]], #16\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "sub w24, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_loop_nr_1_%=:\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, 
#5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "ushr v11.16b, v0.16b, #6\n\t" + "ushr v8.16b, v0.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v0.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v0.16b\n\t" + "shl v0.4s, v9.4s, #8\n\t" + "rev32 
v10.8h, v10.8h\n\t" + "sri v0.4s, v9.4s, #24\n\t" + "eor v0.16b, v0.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v0.16b, v0.16b, v9.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "subs w24, w24, #2\n\t" + "b.ne L_AES_XTS_decrypt_NEON_loop_nr_1_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, 
v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "st1 {v0.16b}, [%x[out]], #16\n\t" + "sub %w[sz], %w[sz], #16\n\t" + "cbz w19, L_AES_XTS_decrypt_NEON_data_done_%=\n\t" + "and x16, x17, x9, asr 63\n\t" + "extr x9, x9, x8, #63\n\t" + "eor x8, x16, x8, lsl 1\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_start_partial_%=:\n\t" + "mov %w[sz], w19\n\t" + "cbz %w[sz], L_AES_XTS_decrypt_NEON_data_done_%=\n\t" + "mov v2.d[0], x8\n\t" + "mov v2.d[1], x9\n\t" + "and x16, x17, x9, asr 63\n\t" + "extr x11, x9, x8, #63\n\t" + "eor x10, x16, x8, lsl 1\n\t" + "mov v1.d[0], x10\n\t" + "mov v1.d[1], x11\n\t" + "mov x25, %x[key]\n\t" + "ld1 {v0.16b}, [%x[in]], #16\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "sub w24, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_loop_nr_partial_1_%=:\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + 
"pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "ushr v11.16b, v0.16b, #6\n\t" + "ushr v8.16b, v0.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v0.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v0.16b\n\t" + "shl v0.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v0.4s, v9.4s, 
#24\n\t" + "eor v0.16b, v0.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v0.16b, v0.16b, v9.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "subs w24, w24, #2\n\t" + "b.ne L_AES_XTS_decrypt_NEON_loop_nr_partial_1_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, 
{v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "st1 {v0.2d}, [%x[tmp]]\n\t" + "add %x[out], %x[out], #16\n\t" + "mov w16, %w[sz]\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_start_byte_%=:\n\t" + "ldrb w10, [%x[tmp]]\n\t" + "ldrb w11, [%x[in]], #1\n\t" + "strb w10, [%x[out]], #1\n\t" + "strb w11, [%x[tmp]], #1\n\t" + "subs w16, w16, #1\n\t" + "b.gt L_AES_XTS_decrypt_NEON_start_byte_%=\n\t" + "sub %x[out], %x[out], %x[sz]\n\t" + "sub %x[tmp], %x[tmp], %x[sz]\n\t" + "sub %x[out], %x[out], #16\n\t" + "mov x25, %x[key]\n\t" + "ld1 {v0.2d}, [%x[tmp]]\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "sub w24, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_loop_nr_partial_2_%=:\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, 
v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "ushr v11.16b, v0.16b, #6\n\t" + "ushr v8.16b, v0.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v0.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v0.16b\n\t" + "shl v0.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v0.4s, v9.4s, #24\n\t" + "eor v0.16b, v0.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, 
#8\n\t" + "eor v0.16b, v0.16b, v9.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "subs w24, w24, #2\n\t" + "b.ne L_AES_XTS_decrypt_NEON_loop_nr_partial_2_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, 
{v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "st1 {v0.16b}, [%x[out]]\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_data_done_%=:\n\t" + "ldp x29, x30, [sp], #32\n\t" + : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key), + [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr) + : [in] "r" (in), [i] "r" (i), [te] "r" (te), [td] "r" (td), + [shuffle] "r" (shuffle), [invshuffle] "r" (invshuffle) + : "memory", "cc", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x19", "x24", "x25", "v0", "v1", "v2", "v3", "v4", + "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +} + +#endif /* HAVE_AES_DECRYPT */ +#endif /* WOLFSSL_AES_XTS */ +#ifdef WOLFSSL_AESGCM_SIV +void AES_GCMSIV_polyval_neon(unsigned char* s, const unsigned char* h, + const unsigned char* data, unsigned int blocks) +{ + __asm__ __volatile__ ( + "ld1 {v18.2d}, [%x[s]]\n\t" + "ld1 {v10.2d}, [%x[h]]\n\t" + "movi v19.16b, #15\n\t" + "eor v20.16b, v20.16b, v20.16b\n\t" + "rev64 v10.16b, v10.16b\n\t" + "ext v10.16b, v10.16b, v10.16b, #8\n\t" + "rbit v10.16b, v10.16b\n\t" + "rbit v18.16b, v18.16b\n\t" + "and v12.16b, v10.16b, v19.16b\n\t" + "ushr v13.16b, v10.16b, #4\n\t" + "eor v14.16b, v12.16b, v13.16b\n\t" + "cbz %w[blocks], L_AES_GCMSIV_polyval_neon_done_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_neon_loop_%=:\n\t" + "ld1 {v0.16b}, [%x[data]], #16\n\t" + "rev64 v0.16b, v0.16b\n\t" + "ext v0.16b, v0.16b, v0.16b, #8\n\t" + "rbit v0.16b, v0.16b\n\t" + "eor v18.16b, v18.16b, v0.16b\n\t" + "and v15.16b, v18.16b, v19.16b\n\t" + "ushr v16.16b, v18.16b, #4\n\t" + 
"eor v17.16b, v15.16b, v16.16b\n\t" + "dup v0.16b, v12.b[0]\n\t" + "dup v2.16b, v14.b[0]\n\t" + "dup v1.16b, v13.b[0]\n\t" + "pmul v8.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "dup v0.16b, v12.b[1]\n\t" + "dup v2.16b, v14.b[1]\n\t" + "dup v1.16b, v13.b[1]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v6.16b, v20.16b, v3.16b, #15\n\t" + "ext v9.16b, v3.16b, v20.16b, #15\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[2]\n\t" + "dup v2.16b, v14.b[2]\n\t" + "dup v1.16b, v13.b[2]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #14\n\t" + "ext v6.16b, v20.16b, v3.16b, #14\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[3]\n\t" + "dup v2.16b, v14.b[3]\n\t" + "dup v1.16b, v13.b[3]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor 
v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #13\n\t" + "ext v6.16b, v20.16b, v3.16b, #13\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[4]\n\t" + "dup v2.16b, v14.b[4]\n\t" + "dup v1.16b, v13.b[4]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #12\n\t" + "ext v6.16b, v20.16b, v3.16b, #12\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[5]\n\t" + "dup v2.16b, v14.b[5]\n\t" + "dup v1.16b, v13.b[5]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #11\n\t" + "ext v6.16b, v20.16b, v3.16b, #11\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[6]\n\t" + "dup v2.16b, v14.b[6]\n\t" + "dup v1.16b, v13.b[6]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #10\n\t" + "ext v6.16b, v20.16b, v3.16b, #10\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[7]\n\t" + "dup v2.16b, 
v14.b[7]\n\t" + "dup v1.16b, v13.b[7]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #9\n\t" + "ext v6.16b, v20.16b, v3.16b, #9\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[8]\n\t" + "dup v2.16b, v14.b[8]\n\t" + "dup v1.16b, v13.b[8]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #8\n\t" + "ext v6.16b, v20.16b, v3.16b, #8\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[9]\n\t" + "dup v2.16b, v14.b[9]\n\t" + "dup v1.16b, v13.b[9]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #7\n\t" + "ext v6.16b, v20.16b, v3.16b, #7\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[10]\n\t" + "dup v2.16b, v14.b[10]\n\t" + "dup v1.16b, v13.b[10]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor 
v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #6\n\t" + "ext v6.16b, v20.16b, v3.16b, #6\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[11]\n\t" + "dup v2.16b, v14.b[11]\n\t" + "dup v1.16b, v13.b[11]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #5\n\t" + "ext v6.16b, v20.16b, v3.16b, #5\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[12]\n\t" + "dup v2.16b, v14.b[12]\n\t" + "dup v1.16b, v13.b[12]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #4\n\t" + "ext v6.16b, v20.16b, v3.16b, #4\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[13]\n\t" + "dup v2.16b, v14.b[13]\n\t" + "dup v1.16b, v13.b[13]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #3\n\t" + "ext v6.16b, v20.16b, 
v3.16b, #3\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[14]\n\t" + "dup v2.16b, v14.b[14]\n\t" + "dup v1.16b, v13.b[14]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #2\n\t" + "ext v6.16b, v20.16b, v3.16b, #2\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[15]\n\t" + "dup v2.16b, v14.b[15]\n\t" + "dup v1.16b, v13.b[15]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #1\n\t" + "ext v6.16b, v20.16b, v3.16b, #1\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "shl v0.16b, v9.16b, #1\n\t" + "shl v1.16b, v9.16b, #2\n\t" + "shl v2.16b, v9.16b, #7\n\t" + "ushr v3.16b, v9.16b, #7\n\t" + "ushr v4.16b, v9.16b, #6\n\t" + "ushr v5.16b, v9.16b, #1\n\t" + "eor v0.16b, v0.16b, v9.16b\n\t" + "eor v1.16b, v1.16b, v2.16b\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "ext v0.16b, v20.16b, v3.16b, #15\n\t" + "ext v1.16b, v20.16b, v4.16b, #15\n\t" + "ext v2.16b, v20.16b, v5.16b, #15\n\t" + "ext v4.16b, v4.16b, v20.16b, #15\n\t" + "ext v5.16b, v5.16b, v20.16b, #15\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "eor v8.16b, v8.16b, v2.16b\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v3.16b, v4.16b, v5.16b\n\t" + "shl v0.2d, v3.2d, 
#1\n\t" + "shl v1.2d, v3.2d, #2\n\t" + "shl v2.2d, v3.2d, #7\n\t" + "eor v3.16b, v3.16b, v0.16b\n\t" + "eor v1.16b, v1.16b, v2.16b\n\t" + "eor v8.16b, v8.16b, v3.16b\n\t" + "eor v18.16b, v8.16b, v1.16b\n\t" + "subs %w[blocks], %w[blocks], #1\n\t" + "b.ne L_AES_GCMSIV_polyval_neon_loop_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_neon_done_%=:\n\t" + "rbit v18.16b, v18.16b\n\t" + "st1 {v18.2d}, [%x[s]]\n\t" + : [s] "+r" (s), [blocks] "+r" (blocks) + : [h] "r" (h), [data] "r" (data) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20" + ); +} + +XALIGNED(4) static const word8 L_AES_GCMSIV_ctr_neon_te[] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, + 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, + 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, + 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, + 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, + 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, + 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, + 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, + 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, + 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, + 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, + 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, + 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 
0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, + 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, + 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, + 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, +}; + +XALIGNED(4) static const word8 L_AES_GCMSIV_ctr_neon_shuffle[] = { + 0x0c, 0x09, 0x06, 0x03, 0x00, 0x0d, 0x0a, 0x07, + 0x04, 0x01, 0x0e, 0x0b, 0x08, 0x05, 0x02, 0x0f, +}; + +void AES_GCMSIV_ctr_neon(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, unsigned char* ctr) +{ + const word8* te = L_AES_GCMSIV_ctr_neon_te; + const word8* shuffle = L_AES_GCMSIV_ctr_neon_shuffle; + __asm__ __volatile__ ( + "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[te]], #0x40\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[te]], #0x40\n\t" + "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[te]], #0x40\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[te]]\n\t" + "ldr w10, [%x[ctr]]\n\t" + "cmp %x[length], #0x40\n\t" + "b.lt L_AES_GCMSIV_ctr_neon_start_2_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_neon_loop_4_%=:\n\t" + "mov x9, %x[KS]\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" + /* Round: 0 - build counters and XOR in key schedule */ + "ld1 {v0.2d}, [%x[ctr]]\n\t" + "mov v0.s[0], w10\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "ld1 {v1.2d}, [%x[ctr]]\n\t" + "add w8, w10, #1\n\t" + "mov v1.s[0], w8\n\t" + "rev32 v1.16b, v1.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "ld1 {v2.2d}, [%x[ctr]]\n\t" + "add w8, w10, #2\n\t" + "mov v2.s[0], w8\n\t" + "rev32 v2.16b, v2.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "ld1 {v3.2d}, [%x[ctr]]\n\t" + "add w8, w10, #3\n\t" + "mov v3.s[0], w8\n\t" + "rev32 v3.16b, v3.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "add w10, w10, #4\n\t" + "sub w8, %w[nr], #2\n\t" + "\n" + "L_AES_GCMSIV_ctr_neon_loop_nr_4_%=:\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, 
v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, 
{v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "movi v0.16b, #27\n\t" + "and v8.16b, v8.16b, v0.16b\n\t" + "and v9.16b, v9.16b, v0.16b\n\t" + "and v10.16b, v10.16b, v0.16b\n\t" + "and v11.16b, v11.16b, v0.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "eor v2.16b, v10.16b, v6.16b\n\t" + "eor v3.16b, v11.16b, v7.16b\n\t" + "shl v12.4s, v0.4s, #8\n\t" + "shl v13.4s, v1.4s, #8\n\t" + "shl v14.4s, v2.4s, #8\n\t" + "shl v15.4s, v3.4s, #8\n\t" + "sri v12.4s, v0.4s, #24\n\t" + "sri v13.4s, v1.4s, #24\n\t" + "sri v14.4s, v2.4s, #24\n\t" + "sri v15.4s, v3.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "shl v2.4s, v6.4s, #24\n\t" + "shl v3.4s, v7.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "sri v2.4s, v6.4s, #8\n\t" + "sri v3.4s, v7.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "rev32 v6.8h, v6.8h\n\t" + "rev32 v7.8h, v7.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x9], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor 
v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, 
v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "sshr v10.16b, v2.16b, #7\n\t" + "sshr v11.16b, v3.16b, #7\n\t" + "shl v12.16b, v0.16b, #1\n\t" + "shl v13.16b, v1.16b, #1\n\t" + "shl v14.16b, v2.16b, #1\n\t" + "shl v15.16b, v3.16b, #1\n\t" + "movi v4.16b, #27\n\t" + "and v8.16b, v8.16b, v4.16b\n\t" + "and v9.16b, v9.16b, v4.16b\n\t" + "and v10.16b, v10.16b, v4.16b\n\t" + "and v11.16b, v11.16b, v4.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v4.16b, v8.16b, v0.16b\n\t" + "eor v5.16b, v9.16b, v1.16b\n\t" + "eor v6.16b, v10.16b, v2.16b\n\t" + "eor v7.16b, v11.16b, v3.16b\n\t" + "shl v12.4s, v4.4s, #8\n\t" + "shl v13.4s, v5.4s, #8\n\t" + "shl v14.4s, v6.4s, #8\n\t" + "shl v15.4s, v7.4s, #8\n\t" + "sri v12.4s, v4.4s, #24\n\t" + "sri v13.4s, v5.4s, #24\n\t" + "sri v14.4s, v6.4s, #24\n\t" + "sri v15.4s, v7.4s, #24\n\t" + "shl v4.4s, v0.4s, #24\n\t" + "shl v5.4s, v1.4s, #24\n\t" + "shl v6.4s, v2.4s, #24\n\t" + "shl v7.4s, v3.4s, #24\n\t" + "sri v4.4s, v0.4s, #8\n\t" + "sri v5.4s, v1.4s, #8\n\t" + "sri v6.4s, v2.4s, #8\n\t" + "sri v7.4s, v3.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "rev32 v2.8h, v2.8h\n\t" + "rev32 v3.8h, v3.8h\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x9], #16\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v10.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, 
v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "eor v2.16b, v2.16b, v14.16b\n\t" + "eor v3.16b, v3.16b, v15.16b\n\t" + /* Round Done */ + "subs w8, w8, #2\n\t" + "b.ne L_AES_GCMSIV_ctr_neon_loop_nr_4_%=\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, 
v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "movi v0.16b, #27\n\t" + "and v8.16b, v8.16b, v0.16b\n\t" + "and v9.16b, v9.16b, v0.16b\n\t" + "and v10.16b, v10.16b, v0.16b\n\t" + "and v11.16b, v11.16b, v0.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "eor v2.16b, v10.16b, v6.16b\n\t" + "eor v3.16b, v11.16b, v7.16b\n\t" + "shl v12.4s, v0.4s, #8\n\t" + "shl v13.4s, v1.4s, #8\n\t" + "shl v14.4s, v2.4s, #8\n\t" + "shl v15.4s, v3.4s, #8\n\t" + "sri v12.4s, v0.4s, #24\n\t" + "sri v13.4s, v1.4s, #24\n\t" + "sri v14.4s, v2.4s, #24\n\t" + "sri v15.4s, v3.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "shl v2.4s, v6.4s, #24\n\t" + "shl v3.4s, v7.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "sri v2.4s, v6.4s, #8\n\t" + "sri v3.4s, v7.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "rev32 v6.8h, v6.8h\n\t" + "rev32 v7.8h, v7.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x9], #16\n\t" + "eor v4.16b, 
v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor 
v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x9], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "rev32 v3.16b, v3.16b\n\t" + "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[in]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" + "sub %x[length], %x[length], #0x40\n\t" + "cmp %x[length], #0x40\n\t" + "b.ge L_AES_GCMSIV_ctr_neon_loop_4_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_neon_start_2_%=:\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "movi v15.16b, #27\n\t" + "cmp %x[length], #16\n\t" + "b.eq L_AES_GCMSIV_ctr_neon_start_1_%=\n\t" + "b.lt L_AES_GCMSIV_ctr_neon_data_done_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_neon_loop_2_%=:\n\t" + "mov x9, %x[KS]\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" + /* Round: 0 - build counters and XOR in key schedule */ + "ld1 {v0.2d}, [%x[ctr]]\n\t" + "mov v0.s[0], w10\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "ld1 {v1.2d}, [%x[ctr]]\n\t" + "add w8, w10, #1\n\t" + "mov v1.s[0], w8\n\t" + "rev32 v1.16b, 
v1.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "add w10, w10, #2\n\t" + "sub w8, %w[nr], #2\n\t" + "\n" + "L_AES_GCMSIV_ctr_neon_loop_nr_2_%=:\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v10.16b, v4.16b, #1\n\t" + "shl v11.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "shl v10.4s, v0.4s, #8\n\t" + "shl v11.4s, v1.4s, #8\n\t" + "sri v10.4s, v0.4s, #24\n\t" + "sri v11.4s, v1.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x9], #16\n\t" + "eor v4.16b, 
v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* Round Done */ + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "shl v10.16b, v0.16b, #1\n\t" + "shl v11.16b, v1.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v4.16b, v8.16b, v0.16b\n\t" + "eor v5.16b, v9.16b, v1.16b\n\t" + "shl v10.4s, v4.4s, #8\n\t" + "shl v11.4s, v5.4s, #8\n\t" + "sri v10.4s, v4.4s, #24\n\t" + "sri v11.4s, v5.4s, #24\n\t" + "shl v4.4s, v0.4s, #24\n\t" + "shl v5.4s, v1.4s, #24\n\t" + "sri v4.4s, v0.4s, #8\n\t" + "sri v5.4s, v1.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + /* XOR in 
Key Schedule */ + "ld1 {v4.2d}, [x9], #16\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "eor v1.16b, v1.16b, v11.16b\n\t" + /* Round Done */ + "subs w8, w8, #2\n\t" + "b.ne L_AES_GCMSIV_ctr_neon_loop_nr_2_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v10.16b, v4.16b, #1\n\t" + "shl v11.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "shl v10.4s, v0.4s, #8\n\t" + "shl v11.4s, v1.4s, #8\n\t" + "sri v10.4s, v0.4s, #24\n\t" + "sri v11.4s, v1.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + 
"rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x9], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" "eor v4.16b, v4.16b, v0.16b\n\t" "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" /* Round Done */ - "movi v12.16b, #0x40\n\t" - "movi v13.16b, #0x80\n\t" - "movi v14.16b, #0xc0\n\t" "eor v8.16b, v4.16b, v12.16b\n\t" "eor v9.16b, v5.16b, v12.16b\n\t" "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" @@ -50935,119 +53579,37 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" "orr v0.16b, v0.16b, v8.16b\n\t" "orr v1.16b, v1.16b, v9.16b\n\t" - "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" "tbl v0.16b, {v0.16b}, v4.16b\n\t" "tbl v1.16b, {v1.16b}, v4.16b\n\t" /* XOR in Key Schedule */ - "ld1 {v4.2d}, [x25], #16\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" "eor v0.16b, v0.16b, v4.16b\n\t" "eor v1.16b, v1.16b, v4.16b\n\t" /* Round Done */ "rev32 v0.16b, v0.16b\n\t" "rev32 v1.16b, v1.16b\n\t" - "eor v0.16b, v0.16b, v2.16b\n\t" - "eor v1.16b, v1.16b, v3.16b\n\t" + "ld1 {v4.16b, v5.16b}, [%x[in]], #32\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" "st1 {v0.16b, v1.16b}, [%x[out]], #32\n\t" - "and x16, x17, x11, asr 63\n\t" - "extr x9, x11, x10, #63\n\t" - "eor x8, x16, x10, lsl 1\n\t" - "sub %w[sz], %w[sz], #32\n\t" + "sub %x[length], %x[length], #32\n\t" + "cmp %x[length], #0\n\t" + "b.eq L_AES_GCMSIV_ctr_neon_data_done_%=\n\t" "\n" - "L_AES_XTS_decrypt_NEON_start_1_%=:\n\t" - "ld1 {v3.2d}, [%[invshuffle]]\n\t" - "mov v2.d[0], x8\n\t" - "mov v2.d[1], x9\n\t" - "cmp %w[sz], #16\n\t" - "b.lt L_AES_XTS_decrypt_NEON_start_partial_%=\n\t" - "mov x25, %x[key]\n\t" - "ld1 {v0.16b}, [%x[in]], #16\n\t" - "ld1 {v4.2d}, [x25], #16\n\t" 
- "eor v0.16b, v0.16b, v2.16b\n\t" + "L_AES_GCMSIV_ctr_neon_start_1_%=:\n\t" + "ld1 {v3.2d}, [%[shuffle]]\n\t" + "mov x9, %x[KS]\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" + /* Round: 0 - build counter and XOR in key schedule */ + "ld1 {v0.2d}, [%x[ctr]]\n\t" + "mov v0.s[0], w10\n\t" "rev32 v0.16b, v0.16b\n\t" "eor v0.16b, v0.16b, v4.16b\n\t" - "sub w24, %w[nr], #2\n\t" + "add w10, w10, #1\n\t" + "sub w8, %w[nr], #2\n\t" "\n" - "L_AES_XTS_decrypt_NEON_loop_nr_1_%=:\n\t" - "eor v8.16b, v0.16b, v12.16b\n\t" - "eor v9.16b, v0.16b, v13.16b\n\t" - "eor v10.16b, v0.16b, v14.16b\n\t" - "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" - "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" - "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" - "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" - "orr v4.16b, v4.16b, v8.16b\n\t" - "orr v9.16b, v9.16b, v10.16b\n\t" - "orr v4.16b, v4.16b, v9.16b\n\t" - "tbl v4.16b, {v4.16b}, v3.16b\n\t" - "sshr v10.16b, v4.16b, #7\n\t" - "ushr v11.16b, v4.16b, #6\n\t" - "ushr v8.16b, v4.16b, #5\n\t" - "and v10.16b, v10.16b, v15.16b\n\t" - "pmul v11.16b, v11.16b, v15.16b\n\t" - "pmul v8.16b, v8.16b, v15.16b\n\t" - "shl v9.16b, v4.16b, #1\n\t" - "eor v10.16b, v10.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #3\n\t" - "eor v8.16b, v8.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #2\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v10.16b, v8.16b\n\t" - "eor v8.16b, v8.16b, v4.16b\n\t" - "eor v10.16b, v11.16b, v8.16b\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v9.16b, v4.16b\n\t" - "shl v4.4s, v9.4s, #8\n\t" - "rev32 v10.8h, v10.8h\n\t" - "sri v4.4s, v9.4s, #24\n\t" - "eor v4.16b, v4.16b, v11.16b\n\t" - "shl v9.4s, v8.4s, #24\n\t" - "eor v4.16b, v4.16b, v10.16b\n\t" - "sri v9.4s, v8.4s, #8\n\t" - "eor v4.16b, v4.16b, v9.16b\n\t" - "ld1 {v0.2d}, [x25], #16\n\t" - /* XOR in Key Schedule */ - "eor v4.16b, v4.16b, v0.16b\n\t" - "eor v8.16b, v4.16b, v12.16b\n\t" - "eor v9.16b, v4.16b, v13.16b\n\t" - 
"eor v10.16b, v4.16b, v14.16b\n\t" - "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" - "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" - "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" - "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" - "orr v0.16b, v0.16b, v8.16b\n\t" - "orr v9.16b, v9.16b, v10.16b\n\t" - "orr v0.16b, v0.16b, v9.16b\n\t" - "tbl v0.16b, {v0.16b}, v3.16b\n\t" - "sshr v10.16b, v0.16b, #7\n\t" - "ushr v11.16b, v0.16b, #6\n\t" - "ushr v8.16b, v0.16b, #5\n\t" - "and v10.16b, v10.16b, v15.16b\n\t" - "pmul v11.16b, v11.16b, v15.16b\n\t" - "pmul v8.16b, v8.16b, v15.16b\n\t" - "shl v9.16b, v0.16b, #1\n\t" - "eor v10.16b, v10.16b, v9.16b\n\t" - "shl v9.16b, v0.16b, #3\n\t" - "eor v8.16b, v8.16b, v9.16b\n\t" - "shl v9.16b, v0.16b, #2\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v10.16b, v8.16b\n\t" - "eor v8.16b, v8.16b, v0.16b\n\t" - "eor v10.16b, v11.16b, v8.16b\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v9.16b, v0.16b\n\t" - "shl v0.4s, v9.4s, #8\n\t" - "rev32 v10.8h, v10.8h\n\t" - "sri v0.4s, v9.4s, #24\n\t" - "eor v0.16b, v0.16b, v11.16b\n\t" - "shl v9.4s, v8.4s, #24\n\t" - "eor v0.16b, v0.16b, v10.16b\n\t" - "sri v9.4s, v8.4s, #8\n\t" - "eor v0.16b, v0.16b, v9.16b\n\t" - "ld1 {v4.2d}, [x25], #16\n\t" - /* XOR in Key Schedule */ - "eor v0.16b, v0.16b, v4.16b\n\t" - "subs w24, w24, #2\n\t" - "b.ne L_AES_XTS_decrypt_NEON_loop_nr_1_%=\n\t" + "L_AES_GCMSIV_ctr_neon_loop_nr_1_%=:\n\t" "eor v8.16b, v0.16b, v12.16b\n\t" "eor v9.16b, v0.16b, v13.16b\n\t" "eor v10.16b, v0.16b, v14.16b\n\t" @@ -51059,115 +53621,22 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, "orr v9.16b, v9.16b, v10.16b\n\t" "orr v4.16b, v4.16b, v9.16b\n\t" "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v0.2d}, [x9], #16\n\t" "sshr v10.16b, v4.16b, #7\n\t" - "ushr v11.16b, v4.16b, #6\n\t" - "ushr v8.16b, v4.16b, #5\n\t" - "and v10.16b, v10.16b, v15.16b\n\t" - "pmul v11.16b, 
v11.16b, v15.16b\n\t" - "pmul v8.16b, v8.16b, v15.16b\n\t" "shl v9.16b, v4.16b, #1\n\t" - "eor v10.16b, v10.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #3\n\t" - "eor v8.16b, v8.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #2\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v10.16b, v8.16b\n\t" - "eor v8.16b, v8.16b, v4.16b\n\t" - "eor v10.16b, v11.16b, v8.16b\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v9.16b, v4.16b\n\t" - "shl v4.4s, v9.4s, #8\n\t" - "rev32 v10.8h, v10.8h\n\t" - "sri v4.4s, v9.4s, #24\n\t" - "eor v4.16b, v4.16b, v11.16b\n\t" - "shl v9.4s, v8.4s, #24\n\t" - "eor v4.16b, v4.16b, v10.16b\n\t" - "sri v9.4s, v8.4s, #8\n\t" - "eor v4.16b, v4.16b, v9.16b\n\t" - "ld1 {v0.2d}, [x25], #16\n\t" - /* XOR in Key Schedule */ - "eor v4.16b, v4.16b, v0.16b\n\t" - "eor v8.16b, v4.16b, v12.16b\n\t" - "eor v9.16b, v4.16b, v13.16b\n\t" - "eor v10.16b, v4.16b, v14.16b\n\t" - "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" - "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" - "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" - "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" - "orr v0.16b, v0.16b, v8.16b\n\t" - "orr v9.16b, v9.16b, v10.16b\n\t" - "orr v0.16b, v0.16b, v9.16b\n\t" - "tbl v0.16b, {v0.16b}, v3.16b\n\t" - "ld1 {v4.2d}, [x25], #16\n\t" - /* XOR in Key Schedule */ - "eor v0.16b, v0.16b, v4.16b\n\t" - "rev32 v0.16b, v0.16b\n\t" - "eor v0.16b, v0.16b, v2.16b\n\t" - "st1 {v0.16b}, [%x[out]], #16\n\t" - "sub %w[sz], %w[sz], #16\n\t" - "cbz w19, L_AES_XTS_decrypt_NEON_data_done_%=\n\t" - "and x16, x17, x9, asr 63\n\t" - "extr x9, x9, x8, #63\n\t" - "eor x8, x16, x8, lsl 1\n\t" - "\n" - "L_AES_XTS_decrypt_NEON_start_partial_%=:\n\t" - "mov %w[sz], w19\n\t" - "cbz %w[sz], L_AES_XTS_decrypt_NEON_data_done_%=\n\t" - "mov v2.d[0], x8\n\t" - "mov v2.d[1], x9\n\t" - "and x16, x17, x9, asr 63\n\t" - "extr x11, x9, x8, #63\n\t" - "eor x10, x16, x8, lsl 1\n\t" - "mov v1.d[0], x10\n\t" - "mov v1.d[1], 
x11\n\t" - "mov x25, %x[key]\n\t" - "ld1 {v0.16b}, [%x[in]], #16\n\t" - "ld1 {v4.2d}, [x25], #16\n\t" - "eor v0.16b, v0.16b, v1.16b\n\t" - "rev32 v0.16b, v0.16b\n\t" - "eor v0.16b, v0.16b, v4.16b\n\t" - "sub w24, %w[nr], #2\n\t" - "\n" - "L_AES_XTS_decrypt_NEON_loop_nr_partial_1_%=:\n\t" - "eor v8.16b, v0.16b, v12.16b\n\t" - "eor v9.16b, v0.16b, v13.16b\n\t" - "eor v10.16b, v0.16b, v14.16b\n\t" - "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" - "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" - "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" - "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" - "orr v4.16b, v4.16b, v8.16b\n\t" - "orr v9.16b, v9.16b, v10.16b\n\t" - "orr v4.16b, v4.16b, v9.16b\n\t" - "tbl v4.16b, {v4.16b}, v3.16b\n\t" - "sshr v10.16b, v4.16b, #7\n\t" - "ushr v11.16b, v4.16b, #6\n\t" - "ushr v8.16b, v4.16b, #5\n\t" "and v10.16b, v10.16b, v15.16b\n\t" - "pmul v11.16b, v11.16b, v15.16b\n\t" - "pmul v8.16b, v8.16b, v15.16b\n\t" - "shl v9.16b, v4.16b, #1\n\t" "eor v10.16b, v10.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #3\n\t" - "eor v8.16b, v8.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #2\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v10.16b, v8.16b\n\t" - "eor v8.16b, v8.16b, v4.16b\n\t" - "eor v10.16b, v11.16b, v8.16b\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v9.16b, v4.16b\n\t" - "shl v4.4s, v9.4s, #8\n\t" - "rev32 v10.8h, v10.8h\n\t" - "sri v4.4s, v9.4s, #24\n\t" - "eor v4.16b, v4.16b, v11.16b\n\t" - "shl v9.4s, v8.4s, #24\n\t" - "eor v4.16b, v4.16b, v10.16b\n\t" - "sri v9.4s, v8.4s, #8\n\t" - "eor v4.16b, v4.16b, v9.16b\n\t" - "ld1 {v0.2d}, [x25], #16\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" /* XOR in Key Schedule */ - "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + 
"eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" "eor v8.16b, v4.16b, v12.16b\n\t" "eor v9.16b, v4.16b, v13.16b\n\t" "eor v10.16b, v4.16b, v14.16b\n\t" @@ -51179,114 +53648,24 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, "orr v9.16b, v9.16b, v10.16b\n\t" "orr v0.16b, v0.16b, v9.16b\n\t" "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" "sshr v10.16b, v0.16b, #7\n\t" - "ushr v11.16b, v0.16b, #6\n\t" - "ushr v8.16b, v0.16b, #5\n\t" - "and v10.16b, v10.16b, v15.16b\n\t" - "pmul v11.16b, v11.16b, v15.16b\n\t" - "pmul v8.16b, v8.16b, v15.16b\n\t" "shl v9.16b, v0.16b, #1\n\t" - "eor v10.16b, v10.16b, v9.16b\n\t" - "shl v9.16b, v0.16b, #3\n\t" - "eor v8.16b, v8.16b, v9.16b\n\t" - "shl v9.16b, v0.16b, #2\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v10.16b, v8.16b\n\t" - "eor v8.16b, v8.16b, v0.16b\n\t" - "eor v10.16b, v11.16b, v8.16b\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v9.16b, v0.16b\n\t" - "shl v0.4s, v9.4s, #8\n\t" - "rev32 v10.8h, v10.8h\n\t" - "sri v0.4s, v9.4s, #24\n\t" - "eor v0.16b, v0.16b, v11.16b\n\t" - "shl v9.4s, v8.4s, #24\n\t" - "eor v0.16b, v0.16b, v10.16b\n\t" - "sri v9.4s, v8.4s, #8\n\t" - "eor v0.16b, v0.16b, v9.16b\n\t" - "ld1 {v4.2d}, [x25], #16\n\t" - /* XOR in Key Schedule */ - "eor v0.16b, v0.16b, v4.16b\n\t" - "subs w24, w24, #2\n\t" - "b.ne L_AES_XTS_decrypt_NEON_loop_nr_partial_1_%=\n\t" - "eor v8.16b, v0.16b, v12.16b\n\t" - "eor v9.16b, v0.16b, v13.16b\n\t" - "eor v10.16b, v0.16b, v14.16b\n\t" - "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" - "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" - "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" - "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" - "orr v4.16b, v4.16b, v8.16b\n\t" - "orr v9.16b, v9.16b, v10.16b\n\t" - "orr v4.16b, v4.16b, v9.16b\n\t" - "tbl v4.16b, {v4.16b}, v3.16b\n\t" - "sshr v10.16b, v4.16b, #7\n\t" - "ushr v11.16b, 
v4.16b, #6\n\t" - "ushr v8.16b, v4.16b, #5\n\t" "and v10.16b, v10.16b, v15.16b\n\t" - "pmul v11.16b, v11.16b, v15.16b\n\t" - "pmul v8.16b, v8.16b, v15.16b\n\t" - "shl v9.16b, v4.16b, #1\n\t" "eor v10.16b, v10.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #3\n\t" - "eor v8.16b, v8.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #2\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v10.16b, v8.16b\n\t" - "eor v8.16b, v8.16b, v4.16b\n\t" - "eor v10.16b, v11.16b, v8.16b\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v9.16b, v4.16b\n\t" - "shl v4.4s, v9.4s, #8\n\t" - "rev32 v10.8h, v10.8h\n\t" - "sri v4.4s, v9.4s, #24\n\t" - "eor v4.16b, v4.16b, v11.16b\n\t" - "shl v9.4s, v8.4s, #24\n\t" - "eor v4.16b, v4.16b, v10.16b\n\t" - "sri v9.4s, v8.4s, #8\n\t" - "eor v4.16b, v4.16b, v9.16b\n\t" - "ld1 {v0.2d}, [x25], #16\n\t" - /* XOR in Key Schedule */ - "eor v4.16b, v4.16b, v0.16b\n\t" - "eor v8.16b, v4.16b, v12.16b\n\t" - "eor v9.16b, v4.16b, v13.16b\n\t" - "eor v10.16b, v4.16b, v14.16b\n\t" - "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" - "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" - "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" - "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" - "orr v0.16b, v0.16b, v8.16b\n\t" - "orr v9.16b, v9.16b, v10.16b\n\t" - "orr v0.16b, v0.16b, v9.16b\n\t" - "tbl v0.16b, {v0.16b}, v3.16b\n\t" - "ld1 {v4.2d}, [x25], #16\n\t" + "rev32 v8.8h, v0.8h\n\t" + "eor v11.16b, v10.16b, v0.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v0.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" /* XOR in Key Schedule */ - "eor v0.16b, v0.16b, v4.16b\n\t" - "rev32 v0.16b, v0.16b\n\t" - "eor v0.16b, v0.16b, v1.16b\n\t" - "st1 {v0.2d}, [%x[tmp]]\n\t" - "add %x[out], %x[out], #16\n\t" - "mov w16, %w[sz]\n\t" - "\n" - "L_AES_XTS_decrypt_NEON_start_byte_%=:\n\t" - "ldrb w10, [%x[tmp]]\n\t" - "ldrb w11, [%x[in]], #1\n\t" - "strb w10, [%x[out]], #1\n\t" - "strb w11, [%x[tmp]], #1\n\t" - "subs w16, 
w16, #1\n\t" - "b.gt L_AES_XTS_decrypt_NEON_start_byte_%=\n\t" - "sub %x[out], %x[out], %x[sz]\n\t" - "sub %x[tmp], %x[tmp], %x[sz]\n\t" - "sub %x[out], %x[out], #16\n\t" - "mov x25, %x[key]\n\t" - "ld1 {v0.2d}, [%x[tmp]]\n\t" - "ld1 {v4.2d}, [x25], #16\n\t" - "eor v0.16b, v0.16b, v2.16b\n\t" - "rev32 v0.16b, v0.16b\n\t" - "eor v0.16b, v0.16b, v4.16b\n\t" - "sub w24, %w[nr], #2\n\t" - "\n" - "L_AES_XTS_decrypt_NEON_loop_nr_partial_2_%=:\n\t" + "eor v10.16b, v10.16b, v4.16b\n\t" + "sri v9.4s, v0.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v0.16b, v10.16b, v9.16b\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "subs w8, w8, #2\n\t" + "b.ne L_AES_GCMSIV_ctr_neon_loop_nr_1_%=\n\t" "eor v8.16b, v0.16b, v12.16b\n\t" "eor v9.16b, v0.16b, v13.16b\n\t" "eor v10.16b, v0.16b, v14.16b\n\t" @@ -51298,114 +53677,22 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, "orr v9.16b, v9.16b, v10.16b\n\t" "orr v4.16b, v4.16b, v9.16b\n\t" "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v0.2d}, [x9], #16\n\t" "sshr v10.16b, v4.16b, #7\n\t" - "ushr v11.16b, v4.16b, #6\n\t" - "ushr v8.16b, v4.16b, #5\n\t" - "and v10.16b, v10.16b, v15.16b\n\t" - "pmul v11.16b, v11.16b, v15.16b\n\t" - "pmul v8.16b, v8.16b, v15.16b\n\t" "shl v9.16b, v4.16b, #1\n\t" - "eor v10.16b, v10.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #3\n\t" - "eor v8.16b, v8.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #2\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v10.16b, v8.16b\n\t" - "eor v8.16b, v8.16b, v4.16b\n\t" - "eor v10.16b, v11.16b, v8.16b\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v9.16b, v4.16b\n\t" - "shl v4.4s, v9.4s, #8\n\t" - "rev32 v10.8h, v10.8h\n\t" - "sri v4.4s, v9.4s, #24\n\t" - "eor v4.16b, v4.16b, v11.16b\n\t" - "shl v9.4s, v8.4s, #24\n\t" - "eor v4.16b, v4.16b, v10.16b\n\t" - "sri v9.4s, v8.4s, #8\n\t" - "eor v4.16b, v4.16b, v9.16b\n\t" - "ld1 {v0.2d}, [x25], #16\n\t" - /* XOR in Key Schedule */ - "eor v4.16b, v4.16b, v0.16b\n\t" - "eor v8.16b, v4.16b, 
v12.16b\n\t" - "eor v9.16b, v4.16b, v13.16b\n\t" - "eor v10.16b, v4.16b, v14.16b\n\t" - "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" - "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" - "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" - "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" - "orr v0.16b, v0.16b, v8.16b\n\t" - "orr v9.16b, v9.16b, v10.16b\n\t" - "orr v0.16b, v0.16b, v9.16b\n\t" - "tbl v0.16b, {v0.16b}, v3.16b\n\t" - "sshr v10.16b, v0.16b, #7\n\t" - "ushr v11.16b, v0.16b, #6\n\t" - "ushr v8.16b, v0.16b, #5\n\t" - "and v10.16b, v10.16b, v15.16b\n\t" - "pmul v11.16b, v11.16b, v15.16b\n\t" - "pmul v8.16b, v8.16b, v15.16b\n\t" - "shl v9.16b, v0.16b, #1\n\t" - "eor v10.16b, v10.16b, v9.16b\n\t" - "shl v9.16b, v0.16b, #3\n\t" - "eor v8.16b, v8.16b, v9.16b\n\t" - "shl v9.16b, v0.16b, #2\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v10.16b, v8.16b\n\t" - "eor v8.16b, v8.16b, v0.16b\n\t" - "eor v10.16b, v11.16b, v8.16b\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v9.16b, v0.16b\n\t" - "shl v0.4s, v9.4s, #8\n\t" - "rev32 v10.8h, v10.8h\n\t" - "sri v0.4s, v9.4s, #24\n\t" - "eor v0.16b, v0.16b, v11.16b\n\t" - "shl v9.4s, v8.4s, #24\n\t" - "eor v0.16b, v0.16b, v10.16b\n\t" - "sri v9.4s, v8.4s, #8\n\t" - "eor v0.16b, v0.16b, v9.16b\n\t" - "ld1 {v4.2d}, [x25], #16\n\t" - /* XOR in Key Schedule */ - "eor v0.16b, v0.16b, v4.16b\n\t" - "subs w24, w24, #2\n\t" - "b.ne L_AES_XTS_decrypt_NEON_loop_nr_partial_2_%=\n\t" - "eor v8.16b, v0.16b, v12.16b\n\t" - "eor v9.16b, v0.16b, v13.16b\n\t" - "eor v10.16b, v0.16b, v14.16b\n\t" - "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" - "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" - "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" - "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" - "orr v4.16b, v4.16b, v8.16b\n\t" - "orr v9.16b, v9.16b, v10.16b\n\t" - "orr v4.16b, v4.16b, v9.16b\n\t" - 
"tbl v4.16b, {v4.16b}, v3.16b\n\t" - "sshr v10.16b, v4.16b, #7\n\t" - "ushr v11.16b, v4.16b, #6\n\t" - "ushr v8.16b, v4.16b, #5\n\t" "and v10.16b, v10.16b, v15.16b\n\t" - "pmul v11.16b, v11.16b, v15.16b\n\t" - "pmul v8.16b, v8.16b, v15.16b\n\t" - "shl v9.16b, v4.16b, #1\n\t" "eor v10.16b, v10.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #3\n\t" - "eor v8.16b, v8.16b, v9.16b\n\t" - "shl v9.16b, v4.16b, #2\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v10.16b, v8.16b\n\t" - "eor v8.16b, v8.16b, v4.16b\n\t" - "eor v10.16b, v11.16b, v8.16b\n\t" - "eor v11.16b, v11.16b, v9.16b\n\t" - "eor v9.16b, v9.16b, v4.16b\n\t" - "shl v4.4s, v9.4s, #8\n\t" - "rev32 v10.8h, v10.8h\n\t" - "sri v4.4s, v9.4s, #24\n\t" - "eor v4.16b, v4.16b, v11.16b\n\t" - "shl v9.4s, v8.4s, #24\n\t" - "eor v4.16b, v4.16b, v10.16b\n\t" - "sri v9.4s, v8.4s, #8\n\t" - "eor v4.16b, v4.16b, v9.16b\n\t" - "ld1 {v0.2d}, [x25], #16\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" /* XOR in Key Schedule */ - "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" "eor v8.16b, v4.16b, v12.16b\n\t" "eor v9.16b, v4.16b, v13.16b\n\t" "eor v10.16b, v4.16b, v14.16b\n\t" @@ -51417,28 +53704,27 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, "orr v9.16b, v9.16b, v10.16b\n\t" "orr v0.16b, v0.16b, v9.16b\n\t" "tbl v0.16b, {v0.16b}, v3.16b\n\t" - "ld1 {v4.2d}, [x25], #16\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" /* XOR in Key Schedule */ "eor v0.16b, v0.16b, v4.16b\n\t" "rev32 v0.16b, v0.16b\n\t" - "eor v0.16b, v0.16b, v2.16b\n\t" - "st1 {v0.16b}, [%x[out]]\n\t" + "ld1 {v4.16b}, [%x[in]], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "st1 {v0.16b}, [%x[out]], #16\n\t" "\n" - "L_AES_XTS_decrypt_NEON_data_done_%=:\n\t" - : [out] "+r" 
(out), [sz] "+r" (sz), [key] "+r" (key), - [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr) - : [in] "r" (in), [i] "r" (i), [te] "r" (te), [td] "r" (td), - [shuffle] "r" (shuffle), [invshuffle] "r" (invshuffle) - : "memory", "cc", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", - "x16", "x17", "x19", "x24", "x25", "v0", "v1", "v2", "v3", "v4", - "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + "L_AES_GCMSIV_ctr_neon_data_done_%=:\n\t" + "str w10, [%x[ctr]]\n\t" + : [out] "+r" (out), [length] "+r" (length), [nr] "+r" (nr), + [ctr] "+r" (ctr) + : [in] "r" (in), [KS] "r" (KS), [te] "r" (te), [shuffle] "r" (shuffle) + : "memory", "cc", "x8", "x9", "x10", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); } -#endif /* HAVE_AES_DECRYPT */ -#endif /* WOLFSSL_AES_XTS */ +#endif /* WOLFSSL_AESGCM_SIV */ #endif /* !WOLFSSL_ARMASM_NO_NEON */ #ifndef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP #ifdef HAVE_AES_DECRYPT @@ -54610,6 +56896,8 @@ void AES_XTS_encrypt(const byte* in, byte* out, word32 sz, const byte* i, { const word32* te = L_AES_ARM64_te; __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "mov x9, #0x87\n\t" "mov x26, %x[key2]\n\t" "ldp x21, x22, [%x[i]]\n\t" @@ -55534,6 +57822,7 @@ void AES_XTS_encrypt(const byte* in, byte* out, word32 sz, const byte* i, "stp x10, x11, [%x[out]]\n\t" "\n" "L_AES_XTS_encrypt_done_data_%=:\n\t" + "ldp x29, x30, [sp], #32\n\t" : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key), [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr) : [in] "r" (in), [i] "r" (i), [te] "r" (te) @@ -55550,6 +57839,8 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i, const word8* td4 = 
L_AES_ARM64_td4; const word32* te = L_AES_ARM64_te; __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" "ands w11, %w[sz], #15\n\t" "cset w11, ne\n\t" "lsl w11, w11, #4\n\t" @@ -56699,6 +58990,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i, "stp x12, x13, [%x[out]]\n\t" "\n" "L_AES_XTS_decrypt_done_data_%=:\n\t" + "ldp x29, x30, [sp], #32\n\t" : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key), [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr) : [in] "r" (in), [i] "r" (i), [td] "r" (td), [td4] "r" (td4), @@ -56711,6 +59003,791 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i, #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AES_XTS */ +#ifdef WOLFSSL_AESGCM_SIV +XALIGNED(16) static const word64 L_AES_GCMSIV_polyval_base_r[] = { + 0x0000000000000000UL, 0x1c20000000000000UL, + 0x3840000000000000UL, 0x2460000000000000UL, + 0x7080000000000000UL, 0x6ca0000000000000UL, + 0x48c0000000000000UL, 0x54e0000000000000UL, + 0xe100000000000000UL, 0xfd20000000000000UL, + 0xd940000000000000UL, 0xc560000000000000UL, + 0x9180000000000000UL, 0x8da0000000000000UL, + 0xa9c0000000000000UL, 0xb5e0000000000000UL, +}; + +void AES_GCMSIV_polyval_base(unsigned char* s, const unsigned char* m, + const unsigned char* data, unsigned int blocks) +{ + const word64* r = L_AES_GCMSIV_polyval_base_r; + __asm__ __volatile__ ( + "cbz %w[blocks], L_AES_GCMSIV_polyval_base_done_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_base_loop_%=:\n\t" + "ldp x6, x7, [%x[data]]\n\t" + "ldp x4, x5, [%x[s]]\n\t" + "rev x6, x6\n\t" + "rev x7, x7\n\t" + "eor x4, x4, x7\n\t" + "eor x5, x5, x6\n\t" + "eor x8, x8, x8\n\t" + "eor x9, x9, x9\n\t" + "ubfx x12, x5, #56, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, 
x10\n\t" + "ubfx x12, x5, #60, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #48, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #52, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #40, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #44, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #32, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #36, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr 
x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #24, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #28, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #16, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #20, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #8, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #12, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #0, #4\n\t" + "add x14, %x[m], x12, 
lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x5, #4, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #56, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #60, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #48, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #52, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #40, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, 
#4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #44, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #32, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #36, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #24, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #28, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #16, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #20, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, 
x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #8, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #12, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #0, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "and x13, x9, #15\n\t" + "lsr x9, x9, #4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "lsr x8, x8, #4\n\t" + "ldr x10, [%[r], x13, LSL 3]\n\t" + "eor x8, x8, x10\n\t" + "ubfx x12, x4, #4, #4\n\t" + "add x14, %x[m], x12, lsl 4\n\t" + "ldp x10, x11, [x14]\n\t" + "eor x8, x8, x10\n\t" + "eor x9, x9, x11\n\t" + "rev x8, x8\n\t" + "rev x9, x9\n\t" + "stp x8, x9, [%x[s]]\n\t" + "subs %w[blocks], %w[blocks], #1\n\t" + "add %x[data], %x[data], #16\n\t" + "b.ne L_AES_GCMSIV_polyval_base_loop_%=\n\t" + "\n" + "L_AES_GCMSIV_polyval_base_done_%=:\n\t" + : [s] "+r" (s), [blocks] "+r" (blocks) + : [m] "r" (m), [data] "r" (data), [r] "r" (r) + : "memory", "cc", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", + "x12", "x13", "x14" + ); +} + +XALIGNED(8) static const word32 L_AES_GCMSIV_ctr_base_te[] = { + 0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b, + 0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5, + 0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b, + 0x19e7fefe, 0x62b5d7d7, 0xe64dabab, 0x9aec7676, + 0x458fcaca, 0x9d1f8282, 0x4089c9c9, 0x87fa7d7d, + 0x15effafa, 
0xebb25959, 0xc98e4747, 0x0bfbf0f0, + 0xec41adad, 0x67b3d4d4, 0xfd5fa2a2, 0xea45afaf, + 0xbf239c9c, 0xf753a4a4, 0x96e47272, 0x5b9bc0c0, + 0xc275b7b7, 0x1ce1fdfd, 0xae3d9393, 0x6a4c2626, + 0x5a6c3636, 0x417e3f3f, 0x02f5f7f7, 0x4f83cccc, + 0x5c683434, 0xf451a5a5, 0x34d1e5e5, 0x08f9f1f1, + 0x93e27171, 0x73abd8d8, 0x53623131, 0x3f2a1515, + 0x0c080404, 0x5295c7c7, 0x65462323, 0x5e9dc3c3, + 0x28301818, 0xa1379696, 0x0f0a0505, 0xb52f9a9a, + 0x090e0707, 0x36241212, 0x9b1b8080, 0x3ddfe2e2, + 0x26cdebeb, 0x694e2727, 0xcd7fb2b2, 0x9fea7575, + 0x1b120909, 0x9e1d8383, 0x74582c2c, 0x2e341a1a, + 0x2d361b1b, 0xb2dc6e6e, 0xeeb45a5a, 0xfb5ba0a0, + 0xf6a45252, 0x4d763b3b, 0x61b7d6d6, 0xce7db3b3, + 0x7b522929, 0x3edde3e3, 0x715e2f2f, 0x97138484, + 0xf5a65353, 0x68b9d1d1, 0x00000000, 0x2cc1eded, + 0x60402020, 0x1fe3fcfc, 0xc879b1b1, 0xedb65b5b, + 0xbed46a6a, 0x468dcbcb, 0xd967bebe, 0x4b723939, + 0xde944a4a, 0xd4984c4c, 0xe8b05858, 0x4a85cfcf, + 0x6bbbd0d0, 0x2ac5efef, 0xe54faaaa, 0x16edfbfb, + 0xc5864343, 0xd79a4d4d, 0x55663333, 0x94118585, + 0xcf8a4545, 0x10e9f9f9, 0x06040202, 0x81fe7f7f, + 0xf0a05050, 0x44783c3c, 0xba259f9f, 0xe34ba8a8, + 0xf3a25151, 0xfe5da3a3, 0xc0804040, 0x8a058f8f, + 0xad3f9292, 0xbc219d9d, 0x48703838, 0x04f1f5f5, + 0xdf63bcbc, 0xc177b6b6, 0x75afdada, 0x63422121, + 0x30201010, 0x1ae5ffff, 0x0efdf3f3, 0x6dbfd2d2, + 0x4c81cdcd, 0x14180c0c, 0x35261313, 0x2fc3ecec, + 0xe1be5f5f, 0xa2359797, 0xcc884444, 0x392e1717, + 0x5793c4c4, 0xf255a7a7, 0x82fc7e7e, 0x477a3d3d, + 0xacc86464, 0xe7ba5d5d, 0x2b321919, 0x95e67373, + 0xa0c06060, 0x98198181, 0xd19e4f4f, 0x7fa3dcdc, + 0x66442222, 0x7e542a2a, 0xab3b9090, 0x830b8888, + 0xca8c4646, 0x29c7eeee, 0xd36bb8b8, 0x3c281414, + 0x79a7dede, 0xe2bc5e5e, 0x1d160b0b, 0x76addbdb, + 0x3bdbe0e0, 0x56643232, 0x4e743a3a, 0x1e140a0a, + 0xdb924949, 0x0a0c0606, 0x6c482424, 0xe4b85c5c, + 0x5d9fc2c2, 0x6ebdd3d3, 0xef43acac, 0xa6c46262, + 0xa8399191, 0xa4319595, 0x37d3e4e4, 0x8bf27979, + 0x32d5e7e7, 0x438bc8c8, 0x596e3737, 0xb7da6d6d, + 0x8c018d8d, 
0x64b1d5d5, 0xd29c4e4e, 0xe049a9a9, + 0xb4d86c6c, 0xfaac5656, 0x07f3f4f4, 0x25cfeaea, + 0xafca6565, 0x8ef47a7a, 0xe947aeae, 0x18100808, + 0xd56fbaba, 0x88f07878, 0x6f4a2525, 0x725c2e2e, + 0x24381c1c, 0xf157a6a6, 0xc773b4b4, 0x5197c6c6, + 0x23cbe8e8, 0x7ca1dddd, 0x9ce87474, 0x213e1f1f, + 0xdd964b4b, 0xdc61bdbd, 0x860d8b8b, 0x850f8a8a, + 0x90e07070, 0x427c3e3e, 0xc471b5b5, 0xaacc6666, + 0xd8904848, 0x05060303, 0x01f7f6f6, 0x121c0e0e, + 0xa3c26161, 0x5f6a3535, 0xf9ae5757, 0xd069b9b9, + 0x91178686, 0x5899c1c1, 0x273a1d1d, 0xb9279e9e, + 0x38d9e1e1, 0x13ebf8f8, 0xb32b9898, 0x33221111, + 0xbbd26969, 0x70a9d9d9, 0x89078e8e, 0xa7339494, + 0xb62d9b9b, 0x223c1e1e, 0x92158787, 0x20c9e9e9, + 0x4987cece, 0xffaa5555, 0x78502828, 0x7aa5dfdf, + 0x8f038c8c, 0xf859a1a1, 0x80098989, 0x171a0d0d, + 0xda65bfbf, 0x31d7e6e6, 0xc6844242, 0xb8d06868, + 0xc3824141, 0xb0299999, 0x775a2d2d, 0x111e0f0f, + 0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616, +}; + +void AES_GCMSIV_ctr_base(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, unsigned char* ctr) +{ + const word32* te = L_AES_GCMSIV_ctr_base_te; + __asm__ __volatile__ ( + "ldp x15, x16, [%x[ctr]]\n\t" + "mov w17, w15\n\t" + "cbz %x[length], L_AES_GCMSIV_ctr_base_done_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_base_loop_block_%=:\n\t" + "mov x22, %x[KS]\n\t" + "ldp x11, x12, [x22], #16\n\t" + /* Round: 0 - set counter, XOR in key schedule */ + "bfi x15, x17, #0, #32\n\t" + "rev32 x7, x15\n\t" + "rev32 x8, x16\n\t" + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "sub w21, %w[nr], #2\n\t" + "\n" + "L_AES_GCMSIV_ctr_base_loop_nr_%=:\n\t" + "ubfx x11, x7, #48, #8\n\t" + "ubfx x14, x7, #24, #8\n\t" + "ubfx x19, x8, #8, #8\n\t" + "ubfx x20, x8, #32, #8\n\t" + "ldr x9, [%[te]]\n\t" + "ldr x9, [%[te], #64]\n\t" + "ldr x9, [%[te], #128]\n\t" + "ldr x9, [%[te], #192]\n\t" + "ldr x9, [%[te], #256]\n\t" + "ldr x9, [%[te], #320]\n\t" + "ldr x9, [%[te], #384]\n\t" + "ldr x9, [%[te], #448]\n\t" + "ldr x9, 
[%[te], #512]\n\t" + "ldr x9, [%[te], #576]\n\t" + "ldr x9, [%[te], #640]\n\t" + "ldr x9, [%[te], #704]\n\t" + "ldr x9, [%[te], #768]\n\t" + "ldr x9, [%[te], #832]\n\t" + "ldr x9, [%[te], #896]\n\t" + "ldr x9, [%[te], #960]\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x12, x8, #16, #8\n\t" + "eor w11, w11, w14, ror 24\n\t" + "ubfx x14, x7, #56, #8\n\t" + "eor w11, w11, w19, ror 8\n\t" + "ubfx x19, x8, #40, #8\n\t" + "eor w11, w11, w20, ror 16\n\t" + "ubfx x20, x7, #0, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x13, x8, #48, #8\n\t" + "eor w12, w12, w14, ror 24\n\t" + "ubfx x14, x8, #24, #8\n\t" + "eor w12, w12, w19, ror 8\n\t" + "ubfx x19, x7, #8, #8\n\t" + "eor w12, w12, w20, ror 16\n\t" + "ubfx x20, x7, #32, #8\n\t" + "bfi x11, x12, #32, #32\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x9, x8, #0, #8\n\t" + "eor w13, w13, w14, ror 24\n\t" + "ubfx x14, x7, #16, #8\n\t" + "eor w13, w13, w19, ror 8\n\t" + "ubfx x19, x8, #56, #8\n\t" + "eor w12, w13, w20, ror 16\n\t" + "ubfx x20, x7, #40, #8\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w9, ror 24\n\t" + "ldp x7, x8, [x22], #16\n\t" + "eor w14, w14, w19, ror 24\n\t" + "eor w14, w14, w20, ror 8\n\t" + "bfi x12, x14, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x11, x11, x7\n\t" + "eor x12, x12, x8\n\t" + "ubfx x7, x11, #48, #8\n\t" + "ubfx x10, x11, #24, #8\n\t" + "ubfx x19, x12, #8, #8\n\t" + "ubfx x20, x12, #32, #8\n\t" + "ldr x13, [%[te]]\n\t" + "ldr x13, [%[te], #64]\n\t" + "ldr x13, [%[te], #128]\n\t" + "ldr x13, [%[te], #192]\n\t" + "ldr x13, 
[%[te], #256]\n\t" + "ldr x13, [%[te], #320]\n\t" + "ldr x13, [%[te], #384]\n\t" + "ldr x13, [%[te], #448]\n\t" + "ldr x13, [%[te], #512]\n\t" + "ldr x13, [%[te], #576]\n\t" + "ldr x13, [%[te], #640]\n\t" + "ldr x13, [%[te], #704]\n\t" + "ldr x13, [%[te], #768]\n\t" + "ldr x13, [%[te], #832]\n\t" + "ldr x13, [%[te], #896]\n\t" + "ldr x13, [%[te], #960]\n\t" + "ldr w7, [%[te], x7, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x8, x12, #16, #8\n\t" + "eor w7, w7, w10, ror 24\n\t" + "ubfx x10, x11, #56, #8\n\t" + "eor w7, w7, w19, ror 8\n\t" + "ubfx x19, x12, #40, #8\n\t" + "eor w7, w7, w20, ror 16\n\t" + "ubfx x20, x11, #0, #8\n\t" + "ldr w8, [%[te], x8, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x9, x12, #48, #8\n\t" + "eor w8, w8, w10, ror 24\n\t" + "ubfx x10, x12, #24, #8\n\t" + "eor w8, w8, w19, ror 8\n\t" + "ubfx x19, x11, #8, #8\n\t" + "eor w8, w8, w20, ror 16\n\t" + "ubfx x20, x11, #32, #8\n\t" + "bfi x7, x8, #32, #32\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x13, x12, #0, #8\n\t" + "eor w9, w9, w10, ror 24\n\t" + "ubfx x10, x11, #16, #8\n\t" + "eor w9, w9, w19, ror 8\n\t" + "ubfx x19, x12, #56, #8\n\t" + "eor w8, w9, w20, ror 16\n\t" + "ubfx x20, x11, #40, #8\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w13, ror 24\n\t" + "ldp x11, x12, [x22], #16\n\t" + "eor w10, w10, w19, ror 24\n\t" + "eor w10, w10, w20, ror 8\n\t" + "bfi x8, x10, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "subs w21, w21, #2\n\t" + "b.ne L_AES_GCMSIV_ctr_base_loop_nr_%=\n\t" + "ubfx x11, x7, #48, #8\n\t" + "ubfx x14, x7, #24, #8\n\t" 
+ "ubfx x19, x8, #8, #8\n\t" + "ubfx x20, x8, #32, #8\n\t" + "ldr x9, [%[te]]\n\t" + "ldr x9, [%[te], #64]\n\t" + "ldr x9, [%[te], #128]\n\t" + "ldr x9, [%[te], #192]\n\t" + "ldr x9, [%[te], #256]\n\t" + "ldr x9, [%[te], #320]\n\t" + "ldr x9, [%[te], #384]\n\t" + "ldr x9, [%[te], #448]\n\t" + "ldr x9, [%[te], #512]\n\t" + "ldr x9, [%[te], #576]\n\t" + "ldr x9, [%[te], #640]\n\t" + "ldr x9, [%[te], #704]\n\t" + "ldr x9, [%[te], #768]\n\t" + "ldr x9, [%[te], #832]\n\t" + "ldr x9, [%[te], #896]\n\t" + "ldr x9, [%[te], #960]\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x12, x8, #16, #8\n\t" + "eor w11, w11, w14, ror 24\n\t" + "ubfx x14, x7, #56, #8\n\t" + "eor w11, w11, w19, ror 8\n\t" + "ubfx x19, x8, #40, #8\n\t" + "eor w11, w11, w20, ror 16\n\t" + "ubfx x20, x7, #0, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x13, x8, #48, #8\n\t" + "eor w12, w12, w14, ror 24\n\t" + "ubfx x14, x8, #24, #8\n\t" + "eor w12, w12, w19, ror 8\n\t" + "ubfx x19, x7, #8, #8\n\t" + "eor w12, w12, w20, ror 16\n\t" + "ubfx x20, x7, #32, #8\n\t" + "bfi x11, x12, #32, #32\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x9, x8, #0, #8\n\t" + "eor w13, w13, w14, ror 24\n\t" + "ubfx x14, x7, #16, #8\n\t" + "eor w13, w13, w19, ror 8\n\t" + "ubfx x19, x8, #56, #8\n\t" + "eor w12, w13, w20, ror 16\n\t" + "ubfx x20, x7, #40, #8\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w9, ror 24\n\t" + "ldp x7, x8, [x22], #16\n\t" + "eor w14, w14, w19, ror 24\n\t" + "eor w14, w14, w20, ror 8\n\t" + "bfi x12, x14, #32, #32\n\t" + /* XOR in Key Schedule 
*/ + "eor x11, x11, x7\n\t" + "eor x12, x12, x8\n\t" + "ubfx x7, x12, #32, #8\n\t" + "ubfx x10, x12, #8, #8\n\t" + "ubfx x19, x11, #48, #8\n\t" + "ubfx x20, x11, #24, #8\n\t" + "lsl w7, w7, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldr x14, [%[te]]\n\t" + "ldr x14, [%[te], #64]\n\t" + "ldr x14, [%[te], #128]\n\t" + "ldr x14, [%[te], #192]\n\t" + "ldr x14, [%[te], #256]\n\t" + "ldr x14, [%[te], #320]\n\t" + "ldr x14, [%[te], #384]\n\t" + "ldr x14, [%[te], #448]\n\t" + "ldr x14, [%[te], #512]\n\t" + "ldr x14, [%[te], #576]\n\t" + "ldr x14, [%[te], #640]\n\t" + "ldr x14, [%[te], #704]\n\t" + "ldr x14, [%[te], #768]\n\t" + "ldr x14, [%[te], #832]\n\t" + "ldr x14, [%[te], #896]\n\t" + "ldr x14, [%[te], #960]\n\t" + "ldrb w7, [%[te], x7, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x8, x11, #0, #8\n\t" + "eor w7, w7, w10, lsl 8\n\t" + "ubfx x10, x12, #40, #8\n\t" + "eor w7, w7, w19, lsl 16\n\t" + "ubfx x19, x12, #16, #8\n\t" + "eor w7, w7, w20, lsl 24\n\t" + "ubfx x20, x11, #56, #8\n\t" + "lsl w8, w8, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x9, x11, #32, #8\n\t" + "eor w8, w8, w10, lsl 8\n\t" + "ubfx x10, x11, #8, #8\n\t" + "eor w8, w8, w19, lsl 16\n\t" + "ubfx x19, x12, #48, #8\n\t" + "eor w8, w8, w20, lsl 24\n\t" + "ubfx x20, x12, #24, #8\n\t" + "bfi x7, x8, #32, #32\n\t" + "lsl w9, w9, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x14, x12, #56, #8\n\t" + "eor w9, w9, w10, lsl 8\n\t" + "ubfx x10, x12, #0, #8\n\t" + "eor w9, w9, w19, lsl 16\n\t" + 
"ubfx x19, x11, #40, #8\n\t" + "eor w8, w9, w20, lsl 24\n\t" + "ubfx x20, x11, #16, #8\n\t" + "lsl w14, w14, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w14, [%[te], x14, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "eor w19, w19, w14, lsl 16\n\t" + "ldp x11, x12, [x22]\n\t" + "eor w10, w10, w19, lsl 8\n\t" + "eor w10, w10, w20, lsl 16\n\t" + "bfi x8, x10, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "rev32 x7, x7\n\t" + "rev32 x8, x8\n\t" + "ldr x11, [%x[in]]\n\t" + "ldr x12, [%x[in], #8]\n\t" + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "str x7, [%x[out]]\n\t" + "str x8, [%x[out], #8]\n\t" + "add w17, w17, #1\n\t" + "subs %x[length], %x[length], #16\n\t" + "add %x[in], %x[in], #16\n\t" + "add %x[out], %x[out], #16\n\t" + "b.ne L_AES_GCMSIV_ctr_base_loop_block_%=\n\t" + "\n" + "L_AES_GCMSIV_ctr_base_done_%=:\n\t" + "bfi x15, x17, #0, #32\n\t" + "stp x15, x16, [%x[ctr]]\n\t" + : [out] "+r" (out), [length] "+r" (length), [nr] "+r" (nr), + [ctr] "+r" (ctr), [in] "+r" (in) /* in is advanced each block */ + : [KS] "r" (KS), [te] "r" (te) + : "memory", "cc", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", + "x15", "x16", "x17", "x19", "x20", "x21", "x22" + ); +} + +#endif /* WOLFSSL_AESGCM_SIV */ #endif /* !WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP */ #endif /* !defined(NO_AES) && defined(WOLFSSL_ARMASM) */ #endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm.S b/wolfcrypt/src/port/arm/thumb2-aes-asm.S index 2275959d2a..17b40c9a20 100644 --- a/wolfcrypt/src/port/arm/thumb2-aes-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm.S @@ -7262,6 +7262,1629 @@ L_AES_GCM_encrypt_end: /* Cycle Count = 1118 */ .size AES_GCM_encrypt,.-AES_GCM_encrypt #endif /* HAVE_AESGCM */ +#ifdef WOLFSSL_AESGCM_SIV +#ifndef __APPLE__ + .text + .type L_AES_GCMSIV_polyval_thumb2_r, %object + .size L_AES_GCMSIV_polyval_thumb2_r, 
64 +#else + .section __DATA,__data +#endif /* __APPLE__ */ + /* 8-byte aligned, 64-bit aligned */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_AES_GCMSIV_polyval_thumb2_r: + .long 0x00000000,0x1c200000,0x38400000,0x24600000 + .long 0x70800000,0x6ca00000,0x48c00000,0x54e00000 + .long 0xe1000000,0xfd200000,0xd9400000,0xc5600000 + .long 0x91800000,0x8da00000,0xa9c00000,0xb5e00000 + .text + .align 4 + .globl AES_GCMSIV_polyval_thumb2 + .type AES_GCMSIV_polyval_thumb2, %function +AES_GCMSIV_polyval_thumb2: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + ADR r8, L_AES_GCMSIV_polyval_thumb2_r + CMP r3, #0x0 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_GCMSIV_polyval_thumb2_done +#else + BEQ.W L_AES_GCMSIV_polyval_thumb2_done +#endif +L_AES_GCMSIV_polyval_thumb2_loop: + LDR r12, [r2, #12] + REV r12, r12 + LDR r10, [r0] + EOR r10, r10, r12 + STR r10, [r0] + LDR r12, [r2, #8] + REV r12, r12 + LDR r10, [r0, #4] + EOR r10, r10, r12 + STR r10, [r0, #4] + LDR r12, [r2, #4] + REV r12, r12 + LDR r10, [r0, #8] + EOR r10, r10, r12 + STR r10, [r0, #8] + LDR r12, [r2] + REV r12, r12 + LDR r10, [r0, #12] + EOR r10, r10, r12 + STR r10, [r0, #12] + MOV r4, #0x0 + MOV r5, #0x0 + MOV r6, #0x0 + MOV r7, #0x0 + LDR r9, [r0, #12] + UBFX r10, r9, #24, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #28, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 
+ ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #16, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #20, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #8, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #12, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #0, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, 
r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #4, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + LDR r9, [r0, #8] + UBFX r10, r9, #24, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #28, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #16, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #20, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, 
[r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #8, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #12, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #0, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #4, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + LDR r9, [r0, 
#4] + UBFX r10, r9, #24, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #28, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #16, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #20, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #8, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + 
ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #12, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #0, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #4, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + LDR r9, [r0] + UBFX r10, r9, #24, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #28, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR 
r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #16, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #20, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #8, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #12, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #0, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] 
+ EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + AND r10, r6, #0xf + LSR r6, r6, #4 + ORR r6, r6, r7, LSL #28 + LSR r7, r7, #4 + ORR r7, r7, r4, LSL #28 + LSR r4, r4, #4 + ORR r4, r4, r5, LSL #28 + LSR r5, r5, #4 + LDR r12, [r8, r10, LSL #2] + EOR r5, r5, r12 + UBFX r10, r9, #4, #4 + ADD r11, r1, r10, LSL #4 + LDR r12, [r11] + EOR r4, r4, r12 + LDR r12, [r11, #4] + EOR r5, r5, r12 + LDR r12, [r11, #8] + EOR r6, r6, r12 + LDR r12, [r11, #12] + EOR r7, r7, r12 + REV r5, r5 + REV r4, r4 + REV r7, r7 + REV r6, r6 + STR r5, [r0] + STR r4, [r0, #4] + STR r7, [r0, #8] + STR r6, [r0, #12] + SUBS r3, r3, #0x1 + ADD r2, r2, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_GCMSIV_polyval_thumb2_loop +#else + BNE.W L_AES_GCMSIV_polyval_thumb2_loop +#endif +L_AES_GCMSIV_polyval_thumb2_done: + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 877 */ + .size AES_GCMSIV_polyval_thumb2,.-AES_GCMSIV_polyval_thumb2 +#ifndef __APPLE__ + .text + .type L_AES_GCMSIV_ctr_thumb2_te_data, %object + .size L_AES_GCMSIV_ctr_thumb2_te_data, 1024 +#else + .section __DATA,__data +#endif /* __APPLE__ */ + /* 8-byte aligned, 64-bit aligned */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_AES_GCMSIV_ctr_thumb2_te_data: + .long 0xa5c66363,0x84f87c7c,0x99ee7777,0x8df67b7b + .long 0x0dfff2f2,0xbdd66b6b,0xb1de6f6f,0x5491c5c5 + .long 0x50603030,0x03020101,0xa9ce6767,0x7d562b2b + .long 0x19e7fefe,0x62b5d7d7,0xe64dabab,0x9aec7676 + .long 0x458fcaca,0x9d1f8282,0x4089c9c9,0x87fa7d7d + .long 0x15effafa,0xebb25959,0xc98e4747,0x0bfbf0f0 + .long 0xec41adad,0x67b3d4d4,0xfd5fa2a2,0xea45afaf + .long 0xbf239c9c,0xf753a4a4,0x96e47272,0x5b9bc0c0 + .long 0xc275b7b7,0x1ce1fdfd,0xae3d9393,0x6a4c2626 + .long 0x5a6c3636,0x417e3f3f,0x02f5f7f7,0x4f83cccc + .long 0x5c683434,0xf451a5a5,0x34d1e5e5,0x08f9f1f1 + .long 0x93e27171,0x73abd8d8,0x53623131,0x3f2a1515 + .long 0x0c080404,0x5295c7c7,0x65462323,0x5e9dc3c3 + .long 
0x28301818,0xa1379696,0x0f0a0505,0xb52f9a9a + .long 0x090e0707,0x36241212,0x9b1b8080,0x3ddfe2e2 + .long 0x26cdebeb,0x694e2727,0xcd7fb2b2,0x9fea7575 + .long 0x1b120909,0x9e1d8383,0x74582c2c,0x2e341a1a + .long 0x2d361b1b,0xb2dc6e6e,0xeeb45a5a,0xfb5ba0a0 + .long 0xf6a45252,0x4d763b3b,0x61b7d6d6,0xce7db3b3 + .long 0x7b522929,0x3edde3e3,0x715e2f2f,0x97138484 + .long 0xf5a65353,0x68b9d1d1,0x00000000,0x2cc1eded + .long 0x60402020,0x1fe3fcfc,0xc879b1b1,0xedb65b5b + .long 0xbed46a6a,0x468dcbcb,0xd967bebe,0x4b723939 + .long 0xde944a4a,0xd4984c4c,0xe8b05858,0x4a85cfcf + .long 0x6bbbd0d0,0x2ac5efef,0xe54faaaa,0x16edfbfb + .long 0xc5864343,0xd79a4d4d,0x55663333,0x94118585 + .long 0xcf8a4545,0x10e9f9f9,0x06040202,0x81fe7f7f + .long 0xf0a05050,0x44783c3c,0xba259f9f,0xe34ba8a8 + .long 0xf3a25151,0xfe5da3a3,0xc0804040,0x8a058f8f + .long 0xad3f9292,0xbc219d9d,0x48703838,0x04f1f5f5 + .long 0xdf63bcbc,0xc177b6b6,0x75afdada,0x63422121 + .long 0x30201010,0x1ae5ffff,0x0efdf3f3,0x6dbfd2d2 + .long 0x4c81cdcd,0x14180c0c,0x35261313,0x2fc3ecec + .long 0xe1be5f5f,0xa2359797,0xcc884444,0x392e1717 + .long 0x5793c4c4,0xf255a7a7,0x82fc7e7e,0x477a3d3d + .long 0xacc86464,0xe7ba5d5d,0x2b321919,0x95e67373 + .long 0xa0c06060,0x98198181,0xd19e4f4f,0x7fa3dcdc + .long 0x66442222,0x7e542a2a,0xab3b9090,0x830b8888 + .long 0xca8c4646,0x29c7eeee,0xd36bb8b8,0x3c281414 + .long 0x79a7dede,0xe2bc5e5e,0x1d160b0b,0x76addbdb + .long 0x3bdbe0e0,0x56643232,0x4e743a3a,0x1e140a0a + .long 0xdb924949,0x0a0c0606,0x6c482424,0xe4b85c5c + .long 0x5d9fc2c2,0x6ebdd3d3,0xef43acac,0xa6c46262 + .long 0xa8399191,0xa4319595,0x37d3e4e4,0x8bf27979 + .long 0x32d5e7e7,0x438bc8c8,0x596e3737,0xb7da6d6d + .long 0x8c018d8d,0x64b1d5d5,0xd29c4e4e,0xe049a9a9 + .long 0xb4d86c6c,0xfaac5656,0x07f3f4f4,0x25cfeaea + .long 0xafca6565,0x8ef47a7a,0xe947aeae,0x18100808 + .long 0xd56fbaba,0x88f07878,0x6f4a2525,0x725c2e2e + .long 0x24381c1c,0xf157a6a6,0xc773b4b4,0x5197c6c6 + .long 0x23cbe8e8,0x7ca1dddd,0x9ce87474,0x213e1f1f + .long 
0xdd964b4b,0xdc61bdbd,0x860d8b8b,0x850f8a8a + .long 0x90e07070,0x427c3e3e,0xc471b5b5,0xaacc6666 + .long 0xd8904848,0x05060303,0x01f7f6f6,0x121c0e0e + .long 0xa3c26161,0x5f6a3535,0xf9ae5757,0xd069b9b9 + .long 0x91178686,0x5899c1c1,0x273a1d1d,0xb9279e9e + .long 0x38d9e1e1,0x13ebf8f8,0xb32b9898,0x33221111 + .long 0xbbd26969,0x70a9d9d9,0x89078e8e,0xa7339494 + .long 0xb62d9b9b,0x223c1e1e,0x92158787,0x20c9e9e9 + .long 0x4987cece,0xffaa5555,0x78502828,0x7aa5dfdf + .long 0x8f038c8c,0xf859a1a1,0x80098989,0x171a0d0d + .long 0xda65bfbf,0x31d7e6e6,0xc6844242,0xb8d06868 + .long 0xc3824141,0xb0299999,0x775a2d2d,0x111e0f0f + .long 0xcb7bb0b0,0xfca85454,0xd66dbbbb,0x3a2c1616 +#ifndef __APPLE__ + .text + .type L_AES_GCMSIV_ctr_thumb2_te, %object + .size L_AES_GCMSIV_ctr_thumb2_te, 12 +#else + .section __DATA,__data +#endif /* __APPLE__ */ + /* 8-byte aligned, 64-bit aligned */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_AES_GCMSIV_ctr_thumb2_te: + .long L_AES_GCMSIV_ctr_thumb2_te_data + .text + .align 4 + .globl AES_GCMSIV_ctr_thumb2 + .type AES_GCMSIV_ctr_thumb2, %function +AES_GCMSIV_ctr_thumb2: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + LDR r12, [sp, #36] + LDR r8, [sp, #40] + MOV lr, r0 + LDR r0, L_AES_GCMSIV_ctr_thumb2_te + LDM r8, {r4, r5, r6, r7} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STM r8, {r4, r5, r6, r7} + PUSH {r3, r8} + CMP r12, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_GCMSIV_ctr_thumb2_start_block_128 +#else + BEQ.W L_AES_GCMSIV_ctr_thumb2_start_block_128 +#endif + CMP r12, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_GCMSIV_ctr_thumb2_start_block_192 +#else + BEQ.W L_AES_GCMSIV_ctr_thumb2_start_block_192 +#endif +L_AES_GCMSIV_ctr_thumb2_loop_block_256: + PUSH {r1, r2, lr} + LDR lr, [sp, #16] + REV r8, r4 + ADD r8, r8, #0x1 + REV r8, r8 + MOV r9, r5 + MOV r10, r6 + MOV r11, r7 + STM lr, {r8, r9, r10, r11} + LDM 
r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x6 +#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + BL AES_encrypt_block +#else +L_AES_GCMSIV_ctr_thumb2_block_nr_256: + UBFX r8, r5, #16, #8 + LSR r11, r4, #24 + UBFX lr, r6, #8, #8 + UBFX r2, r7, #0, #8 + LDR r8, [r0, r8, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r9, r6, #16, #8 + EOR r8, r8, r11, ROR #24 + LSR r11, r5, #24 + EOR r8, r8, lr, ROR #8 + UBFX lr, r7, #8, #8 + EOR r8, r8, r2, ROR #16 + UBFX r2, r4, #0, #8 + LDR r9, [r0, r9, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r7, #16, #8 + EOR r9, r9, r11, ROR #24 + LSR r11, r6, #24 + EOR r9, r9, lr, ROR #8 + UBFX lr, r4, #8, #8 + EOR r9, r9, r2, ROR #16 + UBFX r2, r5, #0, #8 + LDR r10, [r0, r10, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r6, #0, #8 + EOR r10, r10, r11, ROR #24 + UBFX r11, r4, #16, #8 + EOR r10, r10, lr, ROR #8 + LSR lr, r7, #24 + EOR r10, r10, r2, ROR #16 + UBFX r2, r5, #8, #8 + LDR r6, [r0, r6, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r6, ROR #24 + LDM r3!, {r4, r5, r6, r7} + EOR r11, r11, lr, ROR #24 + EOR r11, r11, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + UBFX r4, r9, #16, #8 + LSR r7, r8, #24 + UBFX lr, r10, #8, #8 + UBFX r2, r11, #0, #8 + LDR r4, [r0, r4, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r5, r10, #16, #8 + EOR r4, r4, r7, ROR #24 + LSR r7, r9, #24 + EOR r4, r4, lr, ROR #8 + UBFX lr, r11, #8, #8 + EOR r4, r4, r2, ROR #16 + UBFX r2, r8, #0, #8 + LDR r5, [r0, r5, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r11, #16, #8 + EOR r5, r5, 
r7, ROR #24 + LSR r7, r10, #24 + EOR r5, r5, lr, ROR #8 + UBFX lr, r8, #8, #8 + EOR r5, r5, r2, ROR #16 + UBFX r2, r9, #0, #8 + LDR r6, [r0, r6, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r10, #0, #8 + EOR r6, r6, r7, ROR #24 + UBFX r7, r8, #16, #8 + EOR r6, r6, lr, ROR #8 + LSR lr, r11, #24 + EOR r6, r6, r2, ROR #16 + UBFX r2, r9, #8, #8 + LDR r10, [r0, r10, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r10, ROR #24 + LDM r3!, {r8, r9, r10, r11} + EOR r7, r7, lr, ROR #24 + EOR r7, r7, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + SUBS r1, r1, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_GCMSIV_ctr_thumb2_block_nr_256 +#else + BNE.W L_AES_GCMSIV_ctr_thumb2_block_nr_256 +#endif + UBFX r8, r5, #16, #8 + LSR r11, r4, #24 + UBFX lr, r6, #8, #8 + UBFX r2, r7, #0, #8 + LDR r8, [r0, r8, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r9, r6, #16, #8 + EOR r8, r8, r11, ROR #24 + LSR r11, r5, #24 + EOR r8, r8, lr, ROR #8 + UBFX lr, r7, #8, #8 + EOR r8, r8, r2, ROR #16 + UBFX r2, r4, #0, #8 + LDR r9, [r0, r9, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r7, #16, #8 + EOR r9, r9, r11, ROR #24 + LSR r11, r6, #24 + EOR r9, r9, lr, ROR #8 + UBFX lr, r4, #8, #8 + EOR r9, r9, r2, ROR #16 + UBFX r2, r5, #0, #8 + LDR r10, [r0, r10, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r6, #0, #8 + EOR r10, r10, r11, ROR #24 + UBFX r11, r4, #16, #8 + EOR r10, r10, lr, ROR #8 + LSR lr, r7, #24 + EOR r10, r10, r2, ROR #16 + UBFX r2, r5, #8, #8 + LDR r6, [r0, r6, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r6, ROR #24 + LDM r3!, {r4, r5, r6, r7} + 
EOR r11, r11, lr, ROR #24 + EOR r11, r11, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + UBFX r4, r11, #0, #8 + UBFX r7, r10, #8, #8 + UBFX lr, r9, #16, #8 + LSR r2, r8, #24 + LDRB r4, [r0, r4, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + UBFX r5, r8, #0, #8 + EOR r4, r4, r7, LSL #8 + UBFX r7, r11, #8, #8 + EOR r4, r4, lr, LSL #16 + UBFX lr, r10, #16, #8 + EOR r4, r4, r2, LSL #24 + LSR r2, r9, #24 + LDRB r5, [r0, r5, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + UBFX r6, r9, #0, #8 + EOR r5, r5, r7, LSL #8 + UBFX r7, r8, #8, #8 + EOR r5, r5, lr, LSL #16 + UBFX lr, r11, #16, #8 + EOR r5, r5, r2, LSL #24 + LSR r2, r10, #24 + LDRB r6, [r0, r6, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + LSR r11, r11, #24 + EOR r6, r6, r7, LSL #8 + UBFX r7, r10, #0, #8 + EOR r6, r6, lr, LSL #16 + UBFX lr, r9, #8, #8 + EOR r6, r6, r2, LSL #24 + UBFX r2, r8, #16, #8 + LDRB r11, [r0, r11, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + EOR lr, lr, r11, LSL #16 + LDM r3, {r8, r9, r10, r11} + EOR r7, r7, lr, LSL #8 + EOR r7, r7, r2, LSL #16 + /* XOR in Key Schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 +#endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDR r8, [lr] + LDR r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDR r8, [sp, #4] + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + LDM r8, {r4, r5, r6, r7} + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_GCMSIV_ctr_thumb2_loop_block_256 +#else + BNE.W 
L_AES_GCMSIV_ctr_thumb2_loop_block_256 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_GCMSIV_ctr_thumb2_end +#else + B.W L_AES_GCMSIV_ctr_thumb2_end +#endif +L_AES_GCMSIV_ctr_thumb2_start_block_192: +L_AES_GCMSIV_ctr_thumb2_loop_block_192: + PUSH {r1, r2, lr} + LDR lr, [sp, #16] + REV r8, r4 + ADD r8, r8, #0x1 + REV r8, r8 + MOV r9, r5 + MOV r10, r6 + MOV r11, r7 + STM lr, {r8, r9, r10, r11} + LDM r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x5 +#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + BL AES_encrypt_block +#else +L_AES_GCMSIV_ctr_thumb2_block_nr_192: + UBFX r8, r5, #16, #8 + LSR r11, r4, #24 + UBFX lr, r6, #8, #8 + UBFX r2, r7, #0, #8 + LDR r8, [r0, r8, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r9, r6, #16, #8 + EOR r8, r8, r11, ROR #24 + LSR r11, r5, #24 + EOR r8, r8, lr, ROR #8 + UBFX lr, r7, #8, #8 + EOR r8, r8, r2, ROR #16 + UBFX r2, r4, #0, #8 + LDR r9, [r0, r9, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r7, #16, #8 + EOR r9, r9, r11, ROR #24 + LSR r11, r6, #24 + EOR r9, r9, lr, ROR #8 + UBFX lr, r4, #8, #8 + EOR r9, r9, r2, ROR #16 + UBFX r2, r5, #0, #8 + LDR r10, [r0, r10, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r6, #0, #8 + EOR r10, r10, r11, ROR #24 + UBFX r11, r4, #16, #8 + EOR r10, r10, lr, ROR #8 + LSR lr, r7, #24 + EOR r10, r10, r2, ROR #16 + UBFX r2, r5, #8, #8 + LDR r6, [r0, r6, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r6, ROR #24 + LDM r3!, {r4, r5, r6, r7} + EOR r11, r11, lr, ROR #24 + EOR r11, r11, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + UBFX r4, r9, #16, #8 + LSR r7, r8, #24 + UBFX lr, 
r10, #8, #8 + UBFX r2, r11, #0, #8 + LDR r4, [r0, r4, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r5, r10, #16, #8 + EOR r4, r4, r7, ROR #24 + LSR r7, r9, #24 + EOR r4, r4, lr, ROR #8 + UBFX lr, r11, #8, #8 + EOR r4, r4, r2, ROR #16 + UBFX r2, r8, #0, #8 + LDR r5, [r0, r5, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r11, #16, #8 + EOR r5, r5, r7, ROR #24 + LSR r7, r10, #24 + EOR r5, r5, lr, ROR #8 + UBFX lr, r8, #8, #8 + EOR r5, r5, r2, ROR #16 + UBFX r2, r9, #0, #8 + LDR r6, [r0, r6, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r10, #0, #8 + EOR r6, r6, r7, ROR #24 + UBFX r7, r8, #16, #8 + EOR r6, r6, lr, ROR #8 + LSR lr, r11, #24 + EOR r6, r6, r2, ROR #16 + UBFX r2, r9, #8, #8 + LDR r10, [r0, r10, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r10, ROR #24 + LDM r3!, {r8, r9, r10, r11} + EOR r7, r7, lr, ROR #24 + EOR r7, r7, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + SUBS r1, r1, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_GCMSIV_ctr_thumb2_block_nr_192 +#else + BNE.W L_AES_GCMSIV_ctr_thumb2_block_nr_192 +#endif + UBFX r8, r5, #16, #8 + LSR r11, r4, #24 + UBFX lr, r6, #8, #8 + UBFX r2, r7, #0, #8 + LDR r8, [r0, r8, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r9, r6, #16, #8 + EOR r8, r8, r11, ROR #24 + LSR r11, r5, #24 + EOR r8, r8, lr, ROR #8 + UBFX lr, r7, #8, #8 + EOR r8, r8, r2, ROR #16 + UBFX r2, r4, #0, #8 + LDR r9, [r0, r9, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r7, #16, #8 + EOR r9, r9, r11, ROR #24 + LSR r11, r6, #24 + EOR r9, r9, lr, ROR #8 + UBFX lr, r4, #8, #8 + EOR r9, r9, r2, ROR #16 + UBFX r2, r5, 
#0, #8 + LDR r10, [r0, r10, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r6, #0, #8 + EOR r10, r10, r11, ROR #24 + UBFX r11, r4, #16, #8 + EOR r10, r10, lr, ROR #8 + LSR lr, r7, #24 + EOR r10, r10, r2, ROR #16 + UBFX r2, r5, #8, #8 + LDR r6, [r0, r6, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r6, ROR #24 + LDM r3!, {r4, r5, r6, r7} + EOR r11, r11, lr, ROR #24 + EOR r11, r11, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + UBFX r4, r11, #0, #8 + UBFX r7, r10, #8, #8 + UBFX lr, r9, #16, #8 + LSR r2, r8, #24 + LDRB r4, [r0, r4, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + UBFX r5, r8, #0, #8 + EOR r4, r4, r7, LSL #8 + UBFX r7, r11, #8, #8 + EOR r4, r4, lr, LSL #16 + UBFX lr, r10, #16, #8 + EOR r4, r4, r2, LSL #24 + LSR r2, r9, #24 + LDRB r5, [r0, r5, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + UBFX r6, r9, #0, #8 + EOR r5, r5, r7, LSL #8 + UBFX r7, r8, #8, #8 + EOR r5, r5, lr, LSL #16 + UBFX lr, r11, #16, #8 + EOR r5, r5, r2, LSL #24 + LSR r2, r10, #24 + LDRB r6, [r0, r6, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + LSR r11, r11, #24 + EOR r6, r6, r7, LSL #8 + UBFX r7, r10, #0, #8 + EOR r6, r6, lr, LSL #16 + UBFX lr, r9, #8, #8 + EOR r6, r6, r2, LSL #24 + UBFX r2, r8, #16, #8 + LDRB r11, [r0, r11, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + EOR lr, lr, r11, LSL #16 + LDM r3, {r8, r9, r10, r11} + EOR r7, r7, lr, LSL #8 + EOR r7, r7, r2, LSL #16 + /* XOR in Key Schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 +#endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDR r8, [lr] + LDR 
r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDR r8, [sp, #4] + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + LDM r8, {r4, r5, r6, r7} + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_GCMSIV_ctr_thumb2_loop_block_192 +#else + BNE.W L_AES_GCMSIV_ctr_thumb2_loop_block_192 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_GCMSIV_ctr_thumb2_end +#else + B.W L_AES_GCMSIV_ctr_thumb2_end +#endif +L_AES_GCMSIV_ctr_thumb2_start_block_128: +L_AES_GCMSIV_ctr_thumb2_loop_block_128: + PUSH {r1, r2, lr} + LDR lr, [sp, #16] + REV r8, r4 + ADD r8, r8, #0x1 + REV r8, r8 + MOV r9, r5 + MOV r10, r6 + MOV r11, r7 + STM lr, {r8, r9, r10, r11} + LDM r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x4 +#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + BL AES_encrypt_block +#else +L_AES_GCMSIV_ctr_thumb2_block_nr_128: + UBFX r8, r5, #16, #8 + LSR r11, r4, #24 + UBFX lr, r6, #8, #8 + UBFX r2, r7, #0, #8 + LDR r8, [r0, r8, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r9, r6, #16, #8 + EOR r8, r8, r11, ROR #24 + LSR r11, r5, #24 + EOR r8, r8, lr, ROR #8 + UBFX lr, r7, #8, #8 + EOR r8, r8, r2, ROR #16 + UBFX r2, r4, #0, #8 + LDR r9, [r0, r9, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r7, #16, #8 + EOR r9, r9, r11, ROR #24 + LSR r11, r6, #24 + EOR r9, r9, lr, ROR #8 + UBFX lr, r4, #8, #8 + EOR r9, r9, r2, ROR #16 + UBFX r2, r5, #0, #8 + LDR r10, [r0, r10, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r6, #0, #8 + EOR r10, r10, r11, ROR #24 + UBFX r11, r4, #16, #8 + EOR r10, r10, lr, ROR #8 
+ LSR lr, r7, #24 + EOR r10, r10, r2, ROR #16 + UBFX r2, r5, #8, #8 + LDR r6, [r0, r6, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r6, ROR #24 + LDM r3!, {r4, r5, r6, r7} + EOR r11, r11, lr, ROR #24 + EOR r11, r11, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + UBFX r4, r9, #16, #8 + LSR r7, r8, #24 + UBFX lr, r10, #8, #8 + UBFX r2, r11, #0, #8 + LDR r4, [r0, r4, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r5, r10, #16, #8 + EOR r4, r4, r7, ROR #24 + LSR r7, r9, #24 + EOR r4, r4, lr, ROR #8 + UBFX lr, r11, #8, #8 + EOR r4, r4, r2, ROR #16 + UBFX r2, r8, #0, #8 + LDR r5, [r0, r5, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r11, #16, #8 + EOR r5, r5, r7, ROR #24 + LSR r7, r10, #24 + EOR r5, r5, lr, ROR #8 + UBFX lr, r8, #8, #8 + EOR r5, r5, r2, ROR #16 + UBFX r2, r9, #0, #8 + LDR r6, [r0, r6, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r10, #0, #8 + EOR r6, r6, r7, ROR #24 + UBFX r7, r8, #16, #8 + EOR r6, r6, lr, ROR #8 + LSR lr, r11, #24 + EOR r6, r6, r2, ROR #16 + UBFX r2, r9, #8, #8 + LDR r10, [r0, r10, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r10, ROR #24 + LDM r3!, {r8, r9, r10, r11} + EOR r7, r7, lr, ROR #24 + EOR r7, r7, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + SUBS r1, r1, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_GCMSIV_ctr_thumb2_block_nr_128 +#else + BNE.W L_AES_GCMSIV_ctr_thumb2_block_nr_128 +#endif + UBFX r8, r5, #16, #8 + LSR r11, r4, #24 + UBFX lr, r6, #8, #8 + UBFX r2, r7, #0, #8 + LDR r8, [r0, r8, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL 
#2] + UBFX r9, r6, #16, #8 + EOR r8, r8, r11, ROR #24 + LSR r11, r5, #24 + EOR r8, r8, lr, ROR #8 + UBFX lr, r7, #8, #8 + EOR r8, r8, r2, ROR #16 + UBFX r2, r4, #0, #8 + LDR r9, [r0, r9, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r7, #16, #8 + EOR r9, r9, r11, ROR #24 + LSR r11, r6, #24 + EOR r9, r9, lr, ROR #8 + UBFX lr, r4, #8, #8 + EOR r9, r9, r2, ROR #16 + UBFX r2, r5, #0, #8 + LDR r10, [r0, r10, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r6, #0, #8 + EOR r10, r10, r11, ROR #24 + UBFX r11, r4, #16, #8 + EOR r10, r10, lr, ROR #8 + LSR lr, r7, #24 + EOR r10, r10, r2, ROR #16 + UBFX r2, r5, #8, #8 + LDR r6, [r0, r6, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r6, ROR #24 + LDM r3!, {r4, r5, r6, r7} + EOR r11, r11, lr, ROR #24 + EOR r11, r11, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + UBFX r4, r11, #0, #8 + UBFX r7, r10, #8, #8 + UBFX lr, r9, #16, #8 + LSR r2, r8, #24 + LDRB r4, [r0, r4, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + UBFX r5, r8, #0, #8 + EOR r4, r4, r7, LSL #8 + UBFX r7, r11, #8, #8 + EOR r4, r4, lr, LSL #16 + UBFX lr, r10, #16, #8 + EOR r4, r4, r2, LSL #24 + LSR r2, r9, #24 + LDRB r5, [r0, r5, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + UBFX r6, r9, #0, #8 + EOR r5, r5, r7, LSL #8 + UBFX r7, r8, #8, #8 + EOR r5, r5, lr, LSL #16 + UBFX lr, r11, #16, #8 + EOR r5, r5, r2, LSL #24 + LSR r2, r10, #24 + LDRB r6, [r0, r6, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + LSR r11, r11, #24 + EOR r6, r6, r7, LSL #8 + UBFX r7, r10, #0, #8 + EOR r6, r6, lr, LSL #16 + UBFX lr, r9, #8, #8 + EOR r6, r6, r2, LSL #24 + UBFX r2, r8, #16, #8 + LDRB r11, [r0, r11, LSL #2] + LDRB 
r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + EOR lr, lr, r11, LSL #16 + LDM r3, {r8, r9, r10, r11} + EOR r7, r7, lr, LSL #8 + EOR r7, r7, r2, LSL #16 + /* XOR in Key Schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 +#endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDR r8, [lr] + LDR r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDR r8, [sp, #4] + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + LDM r8, {r4, r5, r6, r7} + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_GCMSIV_ctr_thumb2_loop_block_128 +#else + BNE.W L_AES_GCMSIV_ctr_thumb2_loop_block_128 +#endif +L_AES_GCMSIV_ctr_thumb2_end: + POP {r3, r8} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STM r8, {r4, r5, r6, r7} + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 1142 */ + .size AES_GCMSIV_ctr_thumb2,.-AES_GCMSIV_ctr_thumb2 +#endif /* WOLFSSL_AESGCM_SIV */ #endif /* !NO_AES */ #endif /* WOLFSSL_ARMASM_THUMB2 */ #endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c index 6d332507a9..03dd06cee2 100644 --- a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c @@ -8009,6 +8009,1746 @@ WC_OMIT_FRAME_POINTER void AES_GCM_encrypt(const unsigned char* in, } #endif /* HAVE_AESGCM */ +#ifdef WOLFSSL_AESGCM_SIV +XALIGNED(8) static const word32 L_AES_GCMSIV_polyval_thumb2_r[] = { + 0x00000000, 0x1c200000, 0x38400000, 0x24600000, + 0x70800000, 0x6ca00000, 0x48c00000, 0x54e00000, + 0xe1000000, 0xfd200000, 0xd9400000, 0xc5600000, + 0x91800000, 0x8da00000, 0xa9c00000, 0xb5e00000, +}; + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG 
+WC_OMIT_FRAME_POINTER void AES_GCMSIV_polyval_thumb2(unsigned char* s_p, + const unsigned char* m_p, const unsigned char* data_p, + unsigned int blocks_p) +#else +WC_OMIT_FRAME_POINTER void AES_GCMSIV_polyval_thumb2(unsigned char* s, + const unsigned char* m, const unsigned char* data, unsigned int blocks) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register unsigned char* s __asm__ ("r0") = (unsigned char*)s_p; + register const unsigned char* m __asm__ ("r1") = (const unsigned char*)m_p; + register const unsigned char* data __asm__ ("r2") = + (const unsigned char*)data_p; + register unsigned int blocks __asm__ ("r3") = (unsigned int)blocks_p; + register word32* L_AES_Thumb2_te_gcm_c __asm__ ("r4") = + (word32*)L_AES_Thumb2_te_gcm; + register word32* L_AES_GCMSIV_polyval_thumb2_r_c __asm__ ("r5") = + (word32*)&L_AES_GCMSIV_polyval_thumb2_r; +#else + register word32* L_AES_Thumb2_te_gcm_c = (word32*)L_AES_Thumb2_te_gcm; + register word32* L_AES_GCMSIV_polyval_thumb2_r_c = + (word32*)&L_AES_GCMSIV_polyval_thumb2_r; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "MOV r8, %[L_AES_GCMSIV_polyval_thumb2_r]\n\t" + "CMP %[blocks], #0x0\n\t" +#if defined(__GNUC__) + "BEQ L_AES_GCMSIV_polyval_thumb2_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_GCMSIV_polyval_thumb2_done\n\t" +#else + "BEQ.W L_AES_GCMSIV_polyval_thumb2_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCMSIV_polyval_thumb2_loop:\n\t" +#else + "L_AES_GCMSIV_polyval_thumb2_loop_%=:\n\t" +#endif + "LDR r12, [%[data], #12]\n\t" + "REV r12, r12\n\t" + "LDR r10, [%[s]]\n\t" + "EOR r10, r10, r12\n\t" + "STR r10, [%[s]]\n\t" + "LDR r12, [%[data], #8]\n\t" + "REV r12, r12\n\t" + "LDR r10, [%[s], #4]\n\t" + "EOR r10, r10, r12\n\t" + "STR r10, [%[s], #4]\n\t" + "LDR r12, [%[data], #4]\n\t" + "REV r12, r12\n\t" + "LDR r10, [%[s], #8]\n\t" + "EOR r10, r10, r12\n\t" + "STR r10, 
[%[s], #8]\n\t" + "LDR r12, [%[data]]\n\t" + "REV r12, r12\n\t" + "LDR r10, [%[s], #12]\n\t" + "EOR r10, r10, r12\n\t" + "STR r10, [%[s], #12]\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "LDR r9, [%[s], #12]\n\t" + "UBFX r10, r9, #24, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #28, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #16, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #20, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR 
r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #8, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #12, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #0, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, 
LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #4, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r9, [%[s], #8]\n\t" + "UBFX r10, r9, #24, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #28, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #16, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + 
"ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #20, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #8, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #12, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #0, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" 
+ "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #4, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r9, [%[s], #4]\n\t" + "UBFX r10, r9, #24, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #28, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + 
"EOR r5, r5, r12\n\t" + "UBFX r10, r9, #16, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #20, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #8, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #12, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, 
#4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #0, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #4, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r9, [%[s]]\n\t" + "UBFX r10, r9, #24, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #28, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, 
#8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #16, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #20, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #8, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #12, 
#4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #0, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "AND r10, r6, #0xf\n\t" + "LSR r6, r6, #4\n\t" + "ORR r6, r6, r7, LSL #28\n\t" + "LSR r7, r7, #4\n\t" + "ORR r7, r7, r4, LSL #28\n\t" + "LSR r4, r4, #4\n\t" + "ORR r4, r4, r5, LSL #28\n\t" + "LSR r5, r5, #4\n\t" + "LDR r12, [r8, r10, LSL #2]\n\t" + "EOR r5, r5, r12\n\t" + "UBFX r10, r9, #4, #4\n\t" + "ADD r11, %[m], r10, LSL #4\n\t" + "LDR r12, [r11]\n\t" + "EOR r4, r4, r12\n\t" + "LDR r12, [r11, #4]\n\t" + "EOR r5, r5, r12\n\t" + "LDR r12, [r11, #8]\n\t" + "EOR r6, r6, r12\n\t" + "LDR r12, [r11, #12]\n\t" + "EOR r7, r7, r12\n\t" + "REV r5, r5\n\t" + "REV r4, r4\n\t" + "REV r7, r7\n\t" + "REV r6, r6\n\t" + "STR r5, [%[s]]\n\t" + "STR r4, [%[s], #4]\n\t" + "STR r7, [%[s], #8]\n\t" + "STR r6, [%[s], #12]\n\t" + "SUBS %[blocks], %[blocks], #0x1\n\t" + "ADD %[data], %[data], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_GCMSIV_polyval_thumb2_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_GCMSIV_polyval_thumb2_loop\n\t" +#else + "BNE.W L_AES_GCMSIV_polyval_thumb2_loop_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCMSIV_polyval_thumb2_done:\n\t" +#else + "L_AES_GCMSIV_polyval_thumb2_done_%=:\n\t" +#endif +#ifndef 
WOLFSSL_NO_VAR_ASSIGN_REG + : [s] "+r" (s), [m] "+r" (m), [data] "+r" (data), + [blocks] "+r" (blocks), + [L_AES_Thumb2_te_gcm] "+r" (L_AES_Thumb2_te_gcm_c), + [L_AES_GCMSIV_polyval_thumb2_r] "+r" (L_AES_GCMSIV_polyval_thumb2_r_c) + : +#else + : + : [s] "r" (s), [m] "r" (m), [data] "r" (data), [blocks] "r" (blocks), + [L_AES_Thumb2_te_gcm] "r" (L_AES_Thumb2_te_gcm_c), + [L_AES_GCMSIV_polyval_thumb2_r] "r" (L_AES_GCMSIV_polyval_thumb2_r_c) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + : "memory", "cc", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + ); +} + +XALIGNED(8) static const word32 L_AES_GCMSIV_ctr_thumb2_te_data[] = { + 0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b, + 0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5, + 0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b, + 0x19e7fefe, 0x62b5d7d7, 0xe64dabab, 0x9aec7676, + 0x458fcaca, 0x9d1f8282, 0x4089c9c9, 0x87fa7d7d, + 0x15effafa, 0xebb25959, 0xc98e4747, 0x0bfbf0f0, + 0xec41adad, 0x67b3d4d4, 0xfd5fa2a2, 0xea45afaf, + 0xbf239c9c, 0xf753a4a4, 0x96e47272, 0x5b9bc0c0, + 0xc275b7b7, 0x1ce1fdfd, 0xae3d9393, 0x6a4c2626, + 0x5a6c3636, 0x417e3f3f, 0x02f5f7f7, 0x4f83cccc, + 0x5c683434, 0xf451a5a5, 0x34d1e5e5, 0x08f9f1f1, + 0x93e27171, 0x73abd8d8, 0x53623131, 0x3f2a1515, + 0x0c080404, 0x5295c7c7, 0x65462323, 0x5e9dc3c3, + 0x28301818, 0xa1379696, 0x0f0a0505, 0xb52f9a9a, + 0x090e0707, 0x36241212, 0x9b1b8080, 0x3ddfe2e2, + 0x26cdebeb, 0x694e2727, 0xcd7fb2b2, 0x9fea7575, + 0x1b120909, 0x9e1d8383, 0x74582c2c, 0x2e341a1a, + 0x2d361b1b, 0xb2dc6e6e, 0xeeb45a5a, 0xfb5ba0a0, + 0xf6a45252, 0x4d763b3b, 0x61b7d6d6, 0xce7db3b3, + 0x7b522929, 0x3edde3e3, 0x715e2f2f, 0x97138484, + 0xf5a65353, 0x68b9d1d1, 0x00000000, 0x2cc1eded, + 0x60402020, 0x1fe3fcfc, 0xc879b1b1, 0xedb65b5b, + 0xbed46a6a, 0x468dcbcb, 0xd967bebe, 0x4b723939, + 0xde944a4a, 0xd4984c4c, 0xe8b05858, 0x4a85cfcf, + 0x6bbbd0d0, 0x2ac5efef, 0xe54faaaa, 0x16edfbfb, + 0xc5864343, 0xd79a4d4d, 0x55663333, 0x94118585, + 0xcf8a4545, 0x10e9f9f9, 0x06040202, 0x81fe7f7f, + 0xf0a05050, 
0x44783c3c, 0xba259f9f, 0xe34ba8a8, + 0xf3a25151, 0xfe5da3a3, 0xc0804040, 0x8a058f8f, + 0xad3f9292, 0xbc219d9d, 0x48703838, 0x04f1f5f5, + 0xdf63bcbc, 0xc177b6b6, 0x75afdada, 0x63422121, + 0x30201010, 0x1ae5ffff, 0x0efdf3f3, 0x6dbfd2d2, + 0x4c81cdcd, 0x14180c0c, 0x35261313, 0x2fc3ecec, + 0xe1be5f5f, 0xa2359797, 0xcc884444, 0x392e1717, + 0x5793c4c4, 0xf255a7a7, 0x82fc7e7e, 0x477a3d3d, + 0xacc86464, 0xe7ba5d5d, 0x2b321919, 0x95e67373, + 0xa0c06060, 0x98198181, 0xd19e4f4f, 0x7fa3dcdc, + 0x66442222, 0x7e542a2a, 0xab3b9090, 0x830b8888, + 0xca8c4646, 0x29c7eeee, 0xd36bb8b8, 0x3c281414, + 0x79a7dede, 0xe2bc5e5e, 0x1d160b0b, 0x76addbdb, + 0x3bdbe0e0, 0x56643232, 0x4e743a3a, 0x1e140a0a, + 0xdb924949, 0x0a0c0606, 0x6c482424, 0xe4b85c5c, + 0x5d9fc2c2, 0x6ebdd3d3, 0xef43acac, 0xa6c46262, + 0xa8399191, 0xa4319595, 0x37d3e4e4, 0x8bf27979, + 0x32d5e7e7, 0x438bc8c8, 0x596e3737, 0xb7da6d6d, + 0x8c018d8d, 0x64b1d5d5, 0xd29c4e4e, 0xe049a9a9, + 0xb4d86c6c, 0xfaac5656, 0x07f3f4f4, 0x25cfeaea, + 0xafca6565, 0x8ef47a7a, 0xe947aeae, 0x18100808, + 0xd56fbaba, 0x88f07878, 0x6f4a2525, 0x725c2e2e, + 0x24381c1c, 0xf157a6a6, 0xc773b4b4, 0x5197c6c6, + 0x23cbe8e8, 0x7ca1dddd, 0x9ce87474, 0x213e1f1f, + 0xdd964b4b, 0xdc61bdbd, 0x860d8b8b, 0x850f8a8a, + 0x90e07070, 0x427c3e3e, 0xc471b5b5, 0xaacc6666, + 0xd8904848, 0x05060303, 0x01f7f6f6, 0x121c0e0e, + 0xa3c26161, 0x5f6a3535, 0xf9ae5757, 0xd069b9b9, + 0x91178686, 0x5899c1c1, 0x273a1d1d, 0xb9279e9e, + 0x38d9e1e1, 0x13ebf8f8, 0xb32b9898, 0x33221111, + 0xbbd26969, 0x70a9d9d9, 0x89078e8e, 0xa7339494, + 0xb62d9b9b, 0x223c1e1e, 0x92158787, 0x20c9e9e9, + 0x4987cece, 0xffaa5555, 0x78502828, 0x7aa5dfdf, + 0x8f038c8c, 0xf859a1a1, 0x80098989, 0x171a0d0d, + 0xda65bfbf, 0x31d7e6e6, 0xc6844242, 0xb8d06868, + 0xc3824141, 0xb0299999, 0x775a2d2d, 0x111e0f0f, + 0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616, +}; + +static const word32* L_AES_GCMSIV_ctr_thumb2_te = L_AES_GCMSIV_ctr_thumb2_te_data; +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +WC_OMIT_FRAME_POINTER void 
AES_GCMSIV_ctr_thumb2(const unsigned char* in_p, + unsigned char* out_p, unsigned long length_p, const unsigned char* KS_p, + int nr_p, unsigned char* ctr_p) +#else +WC_OMIT_FRAME_POINTER void AES_GCMSIV_ctr_thumb2(const unsigned char* in, + unsigned char* out, unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const unsigned char* in __asm__ ("r0") = + (const unsigned char*)in_p; + register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p; + register unsigned long length __asm__ ("r2") = (unsigned long)length_p; + register const unsigned char* KS __asm__ ("r3") = + (const unsigned char*)KS_p; + register int nr __asm__ ("r4") = (int)nr_p; + register unsigned char* ctr __asm__ ("r5") = (unsigned char*)ctr_p; + register word32* L_AES_GCMSIV_ctr_thumb2_te_c __asm__ ("r6") = + (word32*)L_AES_GCMSIV_ctr_thumb2_te; +#else + register word32* L_AES_GCMSIV_ctr_thumb2_te_c = + (word32*)L_AES_GCMSIV_ctr_thumb2_te; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r12, r4\n\t" +#else + "MOV r12, %[nr]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r8, r5\n\t" +#else + "MOV r8, %[ctr]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + "MOV lr, %[in]\n\t" + "MOV r0, %[L_AES_GCMSIV_ctr_thumb2_te]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STM r8, {r4, r5, r6, r7}\n\t" + "PUSH {%[KS], r8}\n\t" + "CMP r12, #0xa\n\t" +#if defined(__GNUC__) + "BEQ L_AES_GCMSIV_ctr_thumb2_start_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_GCMSIV_ctr_thumb2_start_block_128\n\t" +#else + "BEQ.W L_AES_GCMSIV_ctr_thumb2_start_block_128_%=\n\t" +#endif + "CMP r12, #0xc\n\t" +#if defined(__GNUC__) + "BEQ L_AES_GCMSIV_ctr_thumb2_start_block_192_%=\n\t" +#elif 
defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_GCMSIV_ctr_thumb2_start_block_192\n\t" +#else + "BEQ.W L_AES_GCMSIV_ctr_thumb2_start_block_192_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCMSIV_ctr_thumb2_loop_block_256:\n\t" +#else + "L_AES_GCMSIV_ctr_thumb2_loop_block_256_%=:\n\t" +#endif + "PUSH {r1, %[length], lr}\n\t" + "LDR lr, [sp, #16]\n\t" + "REV r8, r4\n\t" + "ADD r8, r8, #0x1\n\t" + "REV r8, r8\n\t" + "MOV r9, r5\n\t" + "MOV r10, r6\n\t" + "MOV r11, r7\n\t" + "STM lr, {r8, r9, r10, r11}\n\t" + "LDM %[KS]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x6\n\t" +#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + "BL AES_encrypt_block\n\t" +#else + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCMSIV_ctr_thumb2_block_nr_256:\n\t" +#else + "L_AES_GCMSIV_ctr_thumb2_block_nr_256_%=:\n\t" +#endif + "UBFX r8, r5, #16, #8\n\t" + "LSR r11, r4, #24\n\t" + "UBFX lr, r6, #8, #8\n\t" + "UBFX r2, r7, #0, #8\n\t" + "LDR r8, [r0, r8, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r9, r6, #16, #8\n\t" + "EOR r8, r8, r11, ROR #24\n\t" + "LSR r11, r5, #24\n\t" + "EOR r8, r8, lr, ROR #8\n\t" + "UBFX lr, r7, #8, #8\n\t" + "EOR r8, r8, r2, ROR #16\n\t" + "UBFX r2, r4, #0, #8\n\t" + "LDR r9, [r0, r9, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r10, r7, #16, #8\n\t" + "EOR r9, r9, r11, ROR #24\n\t" + "LSR r11, r6, #24\n\t" + "EOR r9, r9, lr, ROR #8\n\t" + "UBFX lr, r4, #8, #8\n\t" + "EOR r9, r9, r2, ROR #16\n\t" + "UBFX r2, r5, #0, #8\n\t" + "LDR r10, [r0, r10, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r6, #0, #8\n\t" + "EOR r10, r10, r11, ROR 
#24\n\t" + "UBFX r11, r4, #16, #8\n\t" + "EOR r10, r10, lr, ROR #8\n\t" + "LSR lr, r7, #24\n\t" + "EOR r10, r10, r2, ROR #16\n\t" + "UBFX r2, r5, #8, #8\n\t" + "LDR r6, [r0, r6, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r6, ROR #24\n\t" + "LDM %[KS]!, {r4, r5, r6, r7}\n\t" + "EOR r11, r11, lr, ROR #24\n\t" + "EOR r11, r11, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "UBFX r4, r9, #16, #8\n\t" + "LSR r7, r8, #24\n\t" + "UBFX lr, r10, #8, #8\n\t" + "UBFX r2, r11, #0, #8\n\t" + "LDR r4, [r0, r4, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r5, r10, #16, #8\n\t" + "EOR r4, r4, r7, ROR #24\n\t" + "LSR r7, r9, #24\n\t" + "EOR r4, r4, lr, ROR #8\n\t" + "UBFX lr, r11, #8, #8\n\t" + "EOR r4, r4, r2, ROR #16\n\t" + "UBFX r2, r8, #0, #8\n\t" + "LDR r5, [r0, r5, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r11, #16, #8\n\t" + "EOR r5, r5, r7, ROR #24\n\t" + "LSR r7, r10, #24\n\t" + "EOR r5, r5, lr, ROR #8\n\t" + "UBFX lr, r8, #8, #8\n\t" + "EOR r5, r5, r2, ROR #16\n\t" + "UBFX r2, r9, #0, #8\n\t" + "LDR r6, [r0, r6, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r10, r10, #0, #8\n\t" + "EOR r6, r6, r7, ROR #24\n\t" + "UBFX r7, r8, #16, #8\n\t" + "EOR r6, r6, lr, ROR #8\n\t" + "LSR lr, r11, #24\n\t" + "EOR r6, r6, r2, ROR #16\n\t" + "UBFX r2, r9, #8, #8\n\t" + "LDR r10, [r0, r10, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r10, ROR #24\n\t" + "LDM %[KS]!, {r8, r9, r10, r11}\n\t" + "EOR r7, r7, lr, ROR #24\n\t" + "EOR r7, r7, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r4, r4, r8\n\t" + 
"EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "SUBS r1, r1, #0x1\n\t" +#if defined(__GNUC__) + "BNE L_AES_GCMSIV_ctr_thumb2_block_nr_256_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_GCMSIV_ctr_thumb2_block_nr_256\n\t" +#else + "BNE.W L_AES_GCMSIV_ctr_thumb2_block_nr_256_%=\n\t" +#endif + "UBFX r8, r5, #16, #8\n\t" + "LSR r11, r4, #24\n\t" + "UBFX lr, r6, #8, #8\n\t" + "UBFX r2, r7, #0, #8\n\t" + "LDR r8, [r0, r8, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r9, r6, #16, #8\n\t" + "EOR r8, r8, r11, ROR #24\n\t" + "LSR r11, r5, #24\n\t" + "EOR r8, r8, lr, ROR #8\n\t" + "UBFX lr, r7, #8, #8\n\t" + "EOR r8, r8, r2, ROR #16\n\t" + "UBFX r2, r4, #0, #8\n\t" + "LDR r9, [r0, r9, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r10, r7, #16, #8\n\t" + "EOR r9, r9, r11, ROR #24\n\t" + "LSR r11, r6, #24\n\t" + "EOR r9, r9, lr, ROR #8\n\t" + "UBFX lr, r4, #8, #8\n\t" + "EOR r9, r9, r2, ROR #16\n\t" + "UBFX r2, r5, #0, #8\n\t" + "LDR r10, [r0, r10, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r6, #0, #8\n\t" + "EOR r10, r10, r11, ROR #24\n\t" + "UBFX r11, r4, #16, #8\n\t" + "EOR r10, r10, lr, ROR #8\n\t" + "LSR lr, r7, #24\n\t" + "EOR r10, r10, r2, ROR #16\n\t" + "UBFX r2, r5, #8, #8\n\t" + "LDR r6, [r0, r6, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r6, ROR #24\n\t" + "LDM %[KS]!, {r4, r5, r6, r7}\n\t" + "EOR r11, r11, lr, ROR #24\n\t" + "EOR r11, r11, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "UBFX r4, r11, #0, #8\n\t" + "UBFX r7, r10, #8, #8\n\t" + "UBFX lr, r9, #16, #8\n\t" + "LSR r2, r8, #24\n\t" + 
"LDRB r4, [r0, r4, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "UBFX r5, r8, #0, #8\n\t" + "EOR r4, r4, r7, LSL #8\n\t" + "UBFX r7, r11, #8, #8\n\t" + "EOR r4, r4, lr, LSL #16\n\t" + "UBFX lr, r10, #16, #8\n\t" + "EOR r4, r4, r2, LSL #24\n\t" + "LSR r2, r9, #24\n\t" + "LDRB r5, [r0, r5, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r9, #0, #8\n\t" + "EOR r5, r5, r7, LSL #8\n\t" + "UBFX r7, r8, #8, #8\n\t" + "EOR r5, r5, lr, LSL #16\n\t" + "UBFX lr, r11, #16, #8\n\t" + "EOR r5, r5, r2, LSL #24\n\t" + "LSR r2, r10, #24\n\t" + "LDRB r6, [r0, r6, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "LSR r11, r11, #24\n\t" + "EOR r6, r6, r7, LSL #8\n\t" + "UBFX r7, r10, #0, #8\n\t" + "EOR r6, r6, lr, LSL #16\n\t" + "UBFX lr, r9, #8, #8\n\t" + "EOR r6, r6, r2, LSL #24\n\t" + "UBFX r2, r8, #16, #8\n\t" + "LDRB r11, [r0, r11, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r11, LSL #16\n\t" + "LDM %[KS], {r8, r9, r10, r11}\n\t" + "EOR r7, r7, lr, LSL #8\n\t" + "EOR r7, r7, r2, LSL #16\n\t" + /* XOR in Key Schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" +#endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + "POP {r1, %[length], lr}\n\t" + "LDR %[KS], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDR r8, [sp, #4]\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "SUBS %[length], %[length], 
#0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_GCMSIV_ctr_thumb2_loop_block_256_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_GCMSIV_ctr_thumb2_loop_block_256\n\t" +#else + "BNE.W L_AES_GCMSIV_ctr_thumb2_loop_block_256_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_GCMSIV_ctr_thumb2_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.W L_AES_GCMSIV_ctr_thumb2_end\n\t" +#else + "B.W L_AES_GCMSIV_ctr_thumb2_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCMSIV_ctr_thumb2_start_block_192:\n\t" +#else + "L_AES_GCMSIV_ctr_thumb2_start_block_192_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCMSIV_ctr_thumb2_loop_block_192:\n\t" +#else + "L_AES_GCMSIV_ctr_thumb2_loop_block_192_%=:\n\t" +#endif + "PUSH {r1, %[length], lr}\n\t" + "LDR lr, [sp, #16]\n\t" + "REV r8, r4\n\t" + "ADD r8, r8, #0x1\n\t" + "REV r8, r8\n\t" + "MOV r9, r5\n\t" + "MOV r10, r6\n\t" + "MOV r11, r7\n\t" + "STM lr, {r8, r9, r10, r11}\n\t" + "LDM %[KS]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x5\n\t" +#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + "BL AES_encrypt_block\n\t" +#else + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCMSIV_ctr_thumb2_block_nr_192:\n\t" +#else + "L_AES_GCMSIV_ctr_thumb2_block_nr_192_%=:\n\t" +#endif + "UBFX r8, r5, #16, #8\n\t" + "LSR r11, r4, #24\n\t" + "UBFX lr, r6, #8, #8\n\t" + "UBFX r2, r7, #0, #8\n\t" + "LDR r8, [r0, r8, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r9, r6, #16, #8\n\t" + "EOR r8, r8, r11, ROR #24\n\t" + "LSR r11, r5, #24\n\t" + "EOR r8, r8, lr, ROR #8\n\t" + "UBFX lr, r7, #8, #8\n\t" + "EOR r8, r8, r2, ROR #16\n\t" + "UBFX 
r2, r4, #0, #8\n\t" + "LDR r9, [r0, r9, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r10, r7, #16, #8\n\t" + "EOR r9, r9, r11, ROR #24\n\t" + "LSR r11, r6, #24\n\t" + "EOR r9, r9, lr, ROR #8\n\t" + "UBFX lr, r4, #8, #8\n\t" + "EOR r9, r9, r2, ROR #16\n\t" + "UBFX r2, r5, #0, #8\n\t" + "LDR r10, [r0, r10, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r6, #0, #8\n\t" + "EOR r10, r10, r11, ROR #24\n\t" + "UBFX r11, r4, #16, #8\n\t" + "EOR r10, r10, lr, ROR #8\n\t" + "LSR lr, r7, #24\n\t" + "EOR r10, r10, r2, ROR #16\n\t" + "UBFX r2, r5, #8, #8\n\t" + "LDR r6, [r0, r6, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r6, ROR #24\n\t" + "LDM %[KS]!, {r4, r5, r6, r7}\n\t" + "EOR r11, r11, lr, ROR #24\n\t" + "EOR r11, r11, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "UBFX r4, r9, #16, #8\n\t" + "LSR r7, r8, #24\n\t" + "UBFX lr, r10, #8, #8\n\t" + "UBFX r2, r11, #0, #8\n\t" + "LDR r4, [r0, r4, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r5, r10, #16, #8\n\t" + "EOR r4, r4, r7, ROR #24\n\t" + "LSR r7, r9, #24\n\t" + "EOR r4, r4, lr, ROR #8\n\t" + "UBFX lr, r11, #8, #8\n\t" + "EOR r4, r4, r2, ROR #16\n\t" + "UBFX r2, r8, #0, #8\n\t" + "LDR r5, [r0, r5, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r11, #16, #8\n\t" + "EOR r5, r5, r7, ROR #24\n\t" + "LSR r7, r10, #24\n\t" + "EOR r5, r5, lr, ROR #8\n\t" + "UBFX lr, r8, #8, #8\n\t" + "EOR r5, r5, r2, ROR #16\n\t" + "UBFX r2, r9, #0, #8\n\t" + "LDR r6, [r0, r6, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, 
[r0, r2, LSL #2]\n\t" + "UBFX r10, r10, #0, #8\n\t" + "EOR r6, r6, r7, ROR #24\n\t" + "UBFX r7, r8, #16, #8\n\t" + "EOR r6, r6, lr, ROR #8\n\t" + "LSR lr, r11, #24\n\t" + "EOR r6, r6, r2, ROR #16\n\t" + "UBFX r2, r9, #8, #8\n\t" + "LDR r10, [r0, r10, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r10, ROR #24\n\t" + "LDM %[KS]!, {r8, r9, r10, r11}\n\t" + "EOR r7, r7, lr, ROR #24\n\t" + "EOR r7, r7, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "SUBS r1, r1, #0x1\n\t" +#if defined(__GNUC__) + "BNE L_AES_GCMSIV_ctr_thumb2_block_nr_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_GCMSIV_ctr_thumb2_block_nr_192\n\t" +#else + "BNE.W L_AES_GCMSIV_ctr_thumb2_block_nr_192_%=\n\t" +#endif + "UBFX r8, r5, #16, #8\n\t" + "LSR r11, r4, #24\n\t" + "UBFX lr, r6, #8, #8\n\t" + "UBFX r2, r7, #0, #8\n\t" + "LDR r8, [r0, r8, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r9, r6, #16, #8\n\t" + "EOR r8, r8, r11, ROR #24\n\t" + "LSR r11, r5, #24\n\t" + "EOR r8, r8, lr, ROR #8\n\t" + "UBFX lr, r7, #8, #8\n\t" + "EOR r8, r8, r2, ROR #16\n\t" + "UBFX r2, r4, #0, #8\n\t" + "LDR r9, [r0, r9, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r10, r7, #16, #8\n\t" + "EOR r9, r9, r11, ROR #24\n\t" + "LSR r11, r6, #24\n\t" + "EOR r9, r9, lr, ROR #8\n\t" + "UBFX lr, r4, #8, #8\n\t" + "EOR r9, r9, r2, ROR #16\n\t" + "UBFX r2, r5, #0, #8\n\t" + "LDR r10, [r0, r10, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r6, #0, #8\n\t" + "EOR r10, r10, r11, ROR #24\n\t" + "UBFX r11, r4, #16, #8\n\t" + "EOR r10, r10, lr, ROR #8\n\t" + "LSR lr, r7, #24\n\t" + "EOR r10, r10, r2, 
ROR #16\n\t" + "UBFX r2, r5, #8, #8\n\t" + "LDR r6, [r0, r6, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r6, ROR #24\n\t" + "LDM %[KS]!, {r4, r5, r6, r7}\n\t" + "EOR r11, r11, lr, ROR #24\n\t" + "EOR r11, r11, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "UBFX r4, r11, #0, #8\n\t" + "UBFX r7, r10, #8, #8\n\t" + "UBFX lr, r9, #16, #8\n\t" + "LSR r2, r8, #24\n\t" + "LDRB r4, [r0, r4, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "UBFX r5, r8, #0, #8\n\t" + "EOR r4, r4, r7, LSL #8\n\t" + "UBFX r7, r11, #8, #8\n\t" + "EOR r4, r4, lr, LSL #16\n\t" + "UBFX lr, r10, #16, #8\n\t" + "EOR r4, r4, r2, LSL #24\n\t" + "LSR r2, r9, #24\n\t" + "LDRB r5, [r0, r5, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r9, #0, #8\n\t" + "EOR r5, r5, r7, LSL #8\n\t" + "UBFX r7, r8, #8, #8\n\t" + "EOR r5, r5, lr, LSL #16\n\t" + "UBFX lr, r11, #16, #8\n\t" + "EOR r5, r5, r2, LSL #24\n\t" + "LSR r2, r10, #24\n\t" + "LDRB r6, [r0, r6, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "LSR r11, r11, #24\n\t" + "EOR r6, r6, r7, LSL #8\n\t" + "UBFX r7, r10, #0, #8\n\t" + "EOR r6, r6, lr, LSL #16\n\t" + "UBFX lr, r9, #8, #8\n\t" + "EOR r6, r6, r2, LSL #24\n\t" + "UBFX r2, r8, #16, #8\n\t" + "LDRB r11, [r0, r11, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r11, LSL #16\n\t" + "LDM %[KS], {r8, r9, r10, r11}\n\t" + "EOR r7, r7, lr, LSL #8\n\t" + "EOR r7, r7, r2, LSL #16\n\t" + /* XOR in Key Schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" +#endif /* 
!WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + "POP {r1, %[length], lr}\n\t" + "LDR %[KS], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDR r8, [sp, #4]\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "SUBS %[length], %[length], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_GCMSIV_ctr_thumb2_loop_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_GCMSIV_ctr_thumb2_loop_block_192\n\t" +#else + "BNE.W L_AES_GCMSIV_ctr_thumb2_loop_block_192_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_GCMSIV_ctr_thumb2_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.W L_AES_GCMSIV_ctr_thumb2_end\n\t" +#else + "B.W L_AES_GCMSIV_ctr_thumb2_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCMSIV_ctr_thumb2_start_block_128:\n\t" +#else + "L_AES_GCMSIV_ctr_thumb2_start_block_128_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCMSIV_ctr_thumb2_loop_block_128:\n\t" +#else + "L_AES_GCMSIV_ctr_thumb2_loop_block_128_%=:\n\t" +#endif + "PUSH {r1, %[length], lr}\n\t" + "LDR lr, [sp, #16]\n\t" + "REV r8, r4\n\t" + "ADD r8, r8, #0x1\n\t" + "REV r8, r8\n\t" + "MOV r9, r5\n\t" + "MOV r10, r6\n\t" + "MOV r11, r7\n\t" + "STM lr, {r8, r9, r10, r11}\n\t" + "LDM %[KS]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x4\n\t" +#ifndef WOLFSSL_ARMASM_AES_BLOCK_INLINE + "BL AES_encrypt_block\n\t" +#else + "\n" +#if defined(__IAR_SYSTEMS_ICC__) 
&& (__VER__ < 9000000) + "L_AES_GCMSIV_ctr_thumb2_block_nr_128:\n\t" +#else + "L_AES_GCMSIV_ctr_thumb2_block_nr_128_%=:\n\t" +#endif + "UBFX r8, r5, #16, #8\n\t" + "LSR r11, r4, #24\n\t" + "UBFX lr, r6, #8, #8\n\t" + "UBFX r2, r7, #0, #8\n\t" + "LDR r8, [r0, r8, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r9, r6, #16, #8\n\t" + "EOR r8, r8, r11, ROR #24\n\t" + "LSR r11, r5, #24\n\t" + "EOR r8, r8, lr, ROR #8\n\t" + "UBFX lr, r7, #8, #8\n\t" + "EOR r8, r8, r2, ROR #16\n\t" + "UBFX r2, r4, #0, #8\n\t" + "LDR r9, [r0, r9, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r10, r7, #16, #8\n\t" + "EOR r9, r9, r11, ROR #24\n\t" + "LSR r11, r6, #24\n\t" + "EOR r9, r9, lr, ROR #8\n\t" + "UBFX lr, r4, #8, #8\n\t" + "EOR r9, r9, r2, ROR #16\n\t" + "UBFX r2, r5, #0, #8\n\t" + "LDR r10, [r0, r10, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r6, #0, #8\n\t" + "EOR r10, r10, r11, ROR #24\n\t" + "UBFX r11, r4, #16, #8\n\t" + "EOR r10, r10, lr, ROR #8\n\t" + "LSR lr, r7, #24\n\t" + "EOR r10, r10, r2, ROR #16\n\t" + "UBFX r2, r5, #8, #8\n\t" + "LDR r6, [r0, r6, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r6, ROR #24\n\t" + "LDM %[KS]!, {r4, r5, r6, r7}\n\t" + "EOR r11, r11, lr, ROR #24\n\t" + "EOR r11, r11, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "UBFX r4, r9, #16, #8\n\t" + "LSR r7, r8, #24\n\t" + "UBFX lr, r10, #8, #8\n\t" + "UBFX r2, r11, #0, #8\n\t" + "LDR r4, [r0, r4, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r5, r10, #16, #8\n\t" + "EOR r4, r4, r7, ROR #24\n\t" + "LSR r7, r9, #24\n\t" + 
"EOR r4, r4, lr, ROR #8\n\t" + "UBFX lr, r11, #8, #8\n\t" + "EOR r4, r4, r2, ROR #16\n\t" + "UBFX r2, r8, #0, #8\n\t" + "LDR r5, [r0, r5, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r11, #16, #8\n\t" + "EOR r5, r5, r7, ROR #24\n\t" + "LSR r7, r10, #24\n\t" + "EOR r5, r5, lr, ROR #8\n\t" + "UBFX lr, r8, #8, #8\n\t" + "EOR r5, r5, r2, ROR #16\n\t" + "UBFX r2, r9, #0, #8\n\t" + "LDR r6, [r0, r6, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r10, r10, #0, #8\n\t" + "EOR r6, r6, r7, ROR #24\n\t" + "UBFX r7, r8, #16, #8\n\t" + "EOR r6, r6, lr, ROR #8\n\t" + "LSR lr, r11, #24\n\t" + "EOR r6, r6, r2, ROR #16\n\t" + "UBFX r2, r9, #8, #8\n\t" + "LDR r10, [r0, r10, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r7, [r0, r7, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r10, ROR #24\n\t" + "LDM %[KS]!, {r8, r9, r10, r11}\n\t" + "EOR r7, r7, lr, ROR #24\n\t" + "EOR r7, r7, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "SUBS r1, r1, #0x1\n\t" +#if defined(__GNUC__) + "BNE L_AES_GCMSIV_ctr_thumb2_block_nr_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_GCMSIV_ctr_thumb2_block_nr_128\n\t" +#else + "BNE.W L_AES_GCMSIV_ctr_thumb2_block_nr_128_%=\n\t" +#endif + "UBFX r8, r5, #16, #8\n\t" + "LSR r11, r4, #24\n\t" + "UBFX lr, r6, #8, #8\n\t" + "UBFX r2, r7, #0, #8\n\t" + "LDR r8, [r0, r8, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r9, r6, #16, #8\n\t" + "EOR r8, r8, r11, ROR #24\n\t" + "LSR r11, r5, #24\n\t" + "EOR r8, r8, lr, ROR #8\n\t" + "UBFX lr, r7, #8, #8\n\t" + "EOR r8, r8, r2, ROR #16\n\t" + "UBFX r2, r4, #0, #8\n\t" + "LDR r9, [r0, r9, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, 
lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r10, r7, #16, #8\n\t" + "EOR r9, r9, r11, ROR #24\n\t" + "LSR r11, r6, #24\n\t" + "EOR r9, r9, lr, ROR #8\n\t" + "UBFX lr, r4, #8, #8\n\t" + "EOR r9, r9, r2, ROR #16\n\t" + "UBFX r2, r5, #0, #8\n\t" + "LDR r10, [r0, r10, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r6, #0, #8\n\t" + "EOR r10, r10, r11, ROR #24\n\t" + "UBFX r11, r4, #16, #8\n\t" + "EOR r10, r10, lr, ROR #8\n\t" + "LSR lr, r7, #24\n\t" + "EOR r10, r10, r2, ROR #16\n\t" + "UBFX r2, r5, #8, #8\n\t" + "LDR r6, [r0, r6, LSL #2]\n\t" + "LDR lr, [r0, lr, LSL #2]\n\t" + "LDR r11, [r0, r11, LSL #2]\n\t" + "LDR r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r6, ROR #24\n\t" + "LDM %[KS]!, {r4, r5, r6, r7}\n\t" + "EOR r11, r11, lr, ROR #24\n\t" + "EOR r11, r11, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "UBFX r4, r11, #0, #8\n\t" + "UBFX r7, r10, #8, #8\n\t" + "UBFX lr, r9, #16, #8\n\t" + "LSR r2, r8, #24\n\t" + "LDRB r4, [r0, r4, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "UBFX r5, r8, #0, #8\n\t" + "EOR r4, r4, r7, LSL #8\n\t" + "UBFX r7, r11, #8, #8\n\t" + "EOR r4, r4, lr, LSL #16\n\t" + "UBFX lr, r10, #16, #8\n\t" + "EOR r4, r4, r2, LSL #24\n\t" + "LSR r2, r9, #24\n\t" + "LDRB r5, [r0, r5, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "UBFX r6, r9, #0, #8\n\t" + "EOR r5, r5, r7, LSL #8\n\t" + "UBFX r7, r8, #8, #8\n\t" + "EOR r5, r5, lr, LSL #16\n\t" + "UBFX lr, r11, #16, #8\n\t" + "EOR r5, r5, r2, LSL #24\n\t" + "LSR r2, r10, #24\n\t" + "LDRB r6, [r0, r6, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "LSR r11, r11, #24\n\t" + "EOR r6, r6, r7, LSL #8\n\t" + "UBFX r7, 
r10, #0, #8\n\t" + "EOR r6, r6, lr, LSL #16\n\t" + "UBFX lr, r9, #8, #8\n\t" + "EOR r6, r6, r2, LSL #24\n\t" + "UBFX r2, r8, #16, #8\n\t" + "LDRB r11, [r0, r11, LSL #2]\n\t" + "LDRB r7, [r0, r7, LSL #2]\n\t" + "LDRB lr, [r0, lr, LSL #2]\n\t" + "LDRB r2, [r0, r2, LSL #2]\n\t" + "EOR lr, lr, r11, LSL #16\n\t" + "LDM %[KS], {r8, r9, r10, r11}\n\t" + "EOR r7, r7, lr, LSL #8\n\t" + "EOR r7, r7, r2, LSL #16\n\t" + /* XOR in Key Schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" +#endif /* !WOLFSSL_ARMASM_AES_BLOCK_INLINE */ + "POP {r1, %[length], lr}\n\t" + "LDR %[KS], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDR r8, [sp, #4]\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "SUBS %[length], %[length], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_GCMSIV_ctr_thumb2_loop_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_GCMSIV_ctr_thumb2_loop_block_128\n\t" +#else + "BNE.W L_AES_GCMSIV_ctr_thumb2_loop_block_128_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCMSIV_ctr_thumb2_end:\n\t" +#else + "L_AES_GCMSIV_ctr_thumb2_end_%=:\n\t" +#endif + "POP {%[KS], r8}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STM r8, {r4, r5, r6, r7}\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [length] "+r" (length), + [KS] "+r" (KS), [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_GCMSIV_ctr_thumb2_te] "+r" (L_AES_GCMSIV_ctr_thumb2_te_c) + : +#else + : + : [in] "r" (in), [out] "r" (out), 
[length] "r" (length), [KS] "r" (KS), + [nr] "r" (nr), [ctr] "r" (ctr), + [L_AES_GCMSIV_ctr_thumb2_te] "r" (L_AES_GCMSIV_ctr_thumb2_te_c) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + : "memory", "cc", "r12", "lr", "r7", "r8", "r9", "r10", "r11" + ); +} + +#endif /* WOLFSSL_AESGCM_SIV */ #endif /* !NO_AES */ #endif /* WOLFSSL_ARMASM_THUMB2 */ #endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 49deeaf0ce..c5eb562551 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -877,6 +877,9 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t siphash_test(void); WOLFSSL_TEST_SUBROUTINE wc_test_ret_t poly1305_test(void); WOLFSSL_TEST_SUBROUTINE wc_test_ret_t aesgcm_test(void); WOLFSSL_TEST_SUBROUTINE wc_test_ret_t aesgcm_default_test(void); +#ifdef WOLFSSL_AESGCM_SIV +WOLFSSL_TEST_SUBROUTINE wc_test_ret_t aesgcm_siv_test(void); +#endif WOLFSSL_TEST_SUBROUTINE wc_test_ret_t gmac_test(void); WOLFSSL_TEST_SUBROUTINE wc_test_ret_t aesccm_test(void); WOLFSSL_TEST_SUBROUTINE wc_test_ret_t aeskeywrap_test(void); @@ -2857,6 +2860,12 @@ options: [-s max_relative_stack_bytes] [-m max_relative_heap_memory_bytes]\n\ if (ret == 0) { TEST_PASS("AES-GCM test passed!\n"); } +#ifdef WOLFSSL_AESGCM_SIV + if ((ret = aesgcm_siv_test()) != 0) + TEST_FAIL("AES-GCM-SIV test failed!\n", ret); + else + TEST_PASS("AES-GCM-SIV test passed!\n"); +#endif #endif #if defined(HAVE_AESCCM) && defined(WOLFSSL_AES_128) @@ -20250,6 +20259,108 @@ static wc_test_ret_t aesgcm_aes256_large_test(Aes* enc, Aes* dec) } #endif /* WOLFSSL_AES_256 */ +#ifdef WOLFSSL_AESGCM_SIV +/* AES-GCM-SIV (RFC 8452) self test: known-answer vectors from RFC 8452 + * Appendix C (AES-128 C.1-2 and AES-256 C.2-2), an encrypt/decrypt round trip, + * authentication-failure handling and invalid-parameter spot checks. The + * exhaustive coverage lives in tests/api/test_aes.c. 
*/ +WOLFSSL_TEST_SUBROUTINE wc_test_ret_t aesgcm_siv_test(void) +{ + wc_test_ret_t ret = 0; /* returned at out; presumably set by ERROR_OUT */ + WOLFSSL_SMALL_STACK_STATIC const byte key128[16] = { /* AES-128 key */ + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + }; + WOLFSSL_SMALL_STACK_STATIC const byte nonce[12] = { /* shared nonce */ + 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + }; + WOLFSSL_SMALL_STACK_STATIC const byte pt8[8] = { /* shared plaintext */ + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + }; + WOLFSSL_SMALL_STACK_STATIC const byte exp128[8] = { /* expected ct */ + 0xb5,0xd8,0x39,0x33,0x0a,0xc7,0xb7,0x86 + }; + WOLFSSL_SMALL_STACK_STATIC const byte tag128[16] = { /* expected tag */ + 0x57,0x87,0x82,0xff,0xf6,0x01,0x3b,0x81, + 0x5b,0x28,0x7c,0x22,0x49,0x3a,0x36,0x4c + }; +#ifdef WOLFSSL_AES_256 + WOLFSSL_SMALL_STACK_STATIC const byte key256[32] = { /* AES-256 key */ + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + }; + WOLFSSL_SMALL_STACK_STATIC const byte exp256[8] = { /* expected ct */ + 0xc2,0xef,0x32,0x8e,0x5c,0x71,0xc8,0x3b + }; + WOLFSSL_SMALL_STACK_STATIC const byte tag256[16] = { /* expected tag */ + 0x84,0x31,0x22,0x13,0x0f,0x73,0x64,0xb7, + 0x61,0xe0,0xb9,0x74,0x27,0xe3,0xdf,0x28 + }; +#endif + byte ct[8]; /* encrypt output, reused per key size */ + byte tag[16]; /* computed tag, fed back to decrypt */ + byte dec[8]; /* decrypt output */ + + /* AES-128 known-answer (encrypt, no AAD) against exp128/tag128. */ + ret = wc_AesGcmSivEncrypt(key128, sizeof(key128), nonce, sizeof(nonce), + NULL, 0, pt8, sizeof(pt8), ct, tag, sizeof(tag)); + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + if (XMEMCMP(ct, exp128, sizeof(exp128)) != 0) + ERROR_OUT(WC_TEST_RET_ENC_NC, out); + if (XMEMCMP(tag, tag128, sizeof(tag128)) != 0) + ERROR_OUT(WC_TEST_RET_ENC_NC, out); + /* Decrypt round trip: ct and tag from above must yield pt8. */ + ret = wc_AesGcmSivDecrypt(key128, sizeof(key128), nonce, sizeof(nonce), + NULL, 0, ct, sizeof(ct), dec, tag, sizeof(tag)); + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + if (XMEMCMP(dec, pt8, sizeof(pt8)) != 0) + ERROR_OUT(WC_TEST_RET_ENC_NC, out); + /* Tampered tag must fail authentication. 
*/ + tag[0] ^= 0xff; /* corrupt the first tag byte */ + if (wc_AesGcmSivDecrypt(key128, sizeof(key128), nonce, sizeof(nonce), + NULL, 0, ct, sizeof(ct), dec, tag, sizeof(tag)) != + WC_NO_ERR_TRACE(AES_GCM_AUTH_E)) + ERROR_OUT(WC_TEST_RET_ENC_NC, out); + tag[0] ^= 0xff; /* undo the corruption */ + +#ifdef WOLFSSL_AES_256 + /* AES-256 known-answer + round trip. */ + ret = wc_AesGcmSivEncrypt(key256, sizeof(key256), nonce, sizeof(nonce), + NULL, 0, pt8, sizeof(pt8), ct, tag, sizeof(tag)); + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + if (XMEMCMP(ct, exp256, sizeof(exp256)) != 0) + ERROR_OUT(WC_TEST_RET_ENC_NC, out); + if (XMEMCMP(tag, tag256, sizeof(tag256)) != 0) + ERROR_OUT(WC_TEST_RET_ENC_NC, out); + ret = wc_AesGcmSivDecrypt(key256, sizeof(key256), nonce, sizeof(nonce), + NULL, 0, ct, sizeof(ct), dec, tag, sizeof(tag)); + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + if (XMEMCMP(dec, pt8, sizeof(pt8)) != 0) + ERROR_OUT(WC_TEST_RET_ENC_NC, out); +#endif /* WOLFSSL_AES_256 */ + + /* Invalid-parameter spot checks (NULL key; AES-192 key size rejected). 
*/ + if (wc_AesGcmSivEncrypt(NULL, sizeof(key128), nonce, sizeof(nonce), + NULL, 0, pt8, sizeof(pt8), ct, tag, sizeof(tag)) != + WC_NO_ERR_TRACE(BAD_FUNC_ARG)) + ERROR_OUT(WC_TEST_RET_ENC_NC, out); + if (wc_AesGcmSivEncrypt(key128, 24, nonce, sizeof(nonce), + NULL, 0, pt8, sizeof(pt8), ct, tag, sizeof(tag)) != + WC_NO_ERR_TRACE(BAD_FUNC_ARG)) + ERROR_OUT(WC_TEST_RET_ENC_NC, out); + + ret = 0; /* every check above passed */ +out: + return ret; +} +#endif /* WOLFSSL_AESGCM_SIV */ + WOLFSSL_TEST_SUBROUTINE wc_test_ret_t aesgcm_test(void) { #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC) diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h index e3d7637470..45b628c5d9 100644 --- a/wolfssl/wolfcrypt/aes.h +++ b/wolfssl/wolfcrypt/aes.h @@ -822,6 +822,30 @@ int wc_AesSivDecrypt_ex(const byte* key, word32 keySz, const AesSivAssoc* assoc, const byte* in, word32 inSz, byte* siv, byte* out); #endif +#ifdef WOLFSSL_AESGCM_SIV +/* AES-GCM-SIV (RFC 8452): nonce-misuse resistant AEAD. + * key : key-generating-key, 16 (AES-128) or 32 (AES-256) bytes. + * nonce : 12 bytes. + * tag : 16 bytes (the RFC 8452 authentication tag). + * The encrypted output is the same length as the plaintext; the tag is + * returned separately. + * + * The POLYVAL hash is constant-time wherever the CPU provides carry-less + * multiply (x86 PCLMUL, Arm PMULL/VMULL) - the runtime default on such CPUs. + * Software-only builds fall back to a key-dependent 4-bit table (a cache-timing + * trade-off matching GCM_TABLE GHASH); GCM_SMALL avoids the table entirely. 
*/ +WOLFSSL_API /* self-test: BAD_FUNC_ARG for a NULL key or a 24-byte key */ +int wc_AesGcmSivEncrypt(const byte* key, word32 keySz, const byte* nonce, + word32 nonceSz, const byte* aad, word32 aadSz, + const byte* in, word32 inSz, byte* out, + byte* tag, word32 tagSz); +WOLFSSL_API WARN_UNUSED_RESULT /* self-test: AES_GCM_AUTH_E on bad tag */ +int wc_AesGcmSivDecrypt(const byte* key, word32 keySz, const byte* nonce, + word32 nonceSz, const byte* aad, word32 aadSz, + const byte* in, word32 inSz, byte* out, + const byte* tag, word32 tagSz); +#endif /* WOLFSSL_AESGCM_SIV */ + #ifdef WOLFSSL_CMAC /* forward declaration, in case aes.h is being included by cmac.h */ struct Cmac;