From e29c76511bffeda0f70b54ed9ce08d6365cf812f Mon Sep 17 00:00:00 2001 From: David Garske Date: Mon, 29 Jun 2026 10:59:13 -0700 Subject: [PATCH 1/7] Fix TLS 1.3 hybrid PQC server key share dropped under async crypt --- src/tls.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/tls.c b/src/tls.c index ace0f7f979a..80cbe448e39 100644 --- a/src/tls.c +++ b/src/tls.c @@ -11020,11 +11020,10 @@ int TLSX_KeyShare_HandlePqcHybridKeyServer(WOLFSSL* ssl, #ifdef WOLFSSL_ASYNC_CRYPT if (ret == 0) { - /* Check if the provided kse already contains ECC data and the - * last error was WC_PENDING_E. In this case, we already tried to - * process ECC kse data. Hence, we have to restore it. */ - if (keyShareEntry->key != NULL && keyShareEntry->keyLen > 0 && - keyShareEntry->lastRet == WC_NO_ERR_TRACE(WC_PENDING_E)) { + /* Restore ECC state from a prior suspended pass. Not gated on + * lastRet == WC_PENDING_E: the async layer clears lastRet to 0 on + * completion, which would skip the restore and regenerate the key. */ + if (keyShareEntry->key != NULL && keyShareEntry->keyLen > 0) { ecc_kse->key = keyShareEntry->key; ecc_kse->keyLen = keyShareEntry->keyLen; ecc_kse->pubKey = keyShareEntry->pubKey; @@ -11143,6 +11142,7 @@ int TLSX_KeyShare_HandlePqcHybridKeyServer(WOLFSSL* ssl, else if (ret == WC_NO_ERR_TRACE(WC_PENDING_E)) { keyShareEntry->lastRet = WC_PENDING_E; keyShareEntry->key = ecc_kse->key; + keyShareEntry->keyLen = ecc_kse->keyLen; keyShareEntry->pubKey = ecc_kse->pubKey; keyShareEntry->pubKeyLen = ecc_kse->pubKeyLen; ecc_kse->key = NULL; @@ -11178,6 +11178,12 @@ int TLSX_KeyShare_HandlePqcHybridKeyServer(WOLFSSL* ssl, ssl->arrays->preMasterSz += ssSzPqc; keyShareEntry->ke = NULL; keyShareEntry->keLen = 0; + #ifdef WOLFSSL_ASYNC_CRYPT + /* Hybrid encapsulation is fully complete here. Clear the pending + * state so the TLS_ASYNC_VERIFY re-drive is skipped and does not + * re-enter this handler with the now-freed ke. 
*/ + keyShareEntry->lastRet = 0; + #endif /* Concatenate the ECDH public key and the PQC KEM ciphertext. Based on * the pqc_first flag, the ECDH public key goes before or after the KEM @@ -11964,6 +11970,18 @@ int TLSX_KeyShare_Setup(WOLFSSL *ssl, KeyShareEntry* clientKSE) if (extension != NULL && extension->resp == 1) { serverKSE = (KeyShareEntry*)extension->data; if (serverKSE != NULL) { +#if defined(WOLFSSL_HAVE_MLKEM) && !defined(WOLFSSL_MLKEM_NO_ENCAPSULATE) + /* Re-drive server hybrid encapsulation on resume. GenKey + * routes a hybrid group to the client generator, and the + * lastRet == 0 path treats the share as done after only the + * ECDH part completed, dropping the KEM ciphertext. ke holds + * the client share until the handler completes and clears it. */ + if (serverKSE->ke != NULL && + WOLFSSL_NAMED_GROUP_IS_PQC_HYBRID(serverKSE->group)) { + return TLSX_KeyShare_HandlePqcHybridKeyServer((WOLFSSL*)ssl, + serverKSE, serverKSE->ke, serverKSE->keLen); + } +#endif /* in async case make sure key generation is finalized */ if (serverKSE->lastRet == WC_NO_ERR_TRACE(WC_PENDING_E)) return TLSX_KeyShare_GenKey((WOLFSSL*)ssl, serverKSE); From 99c8ce2231a42f2dd40b3cab002d7273627c42ab Mon Sep 17 00:00:00 2001 From: David Garske Date: Mon, 29 Jun 2026 10:59:13 -0700 Subject: [PATCH 2/7] Fix Cavium async event queue req_count buffer overflow --- wolfcrypt/src/async.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wolfcrypt/src/async.c b/wolfcrypt/src/async.c index 5048ddc439a..7ae35b39012 100644 --- a/wolfcrypt/src/async.c +++ b/wolfcrypt/src/async.c @@ -686,6 +686,9 @@ int wolfAsync_EventQueuePoll(WOLF_EVENT_QUEUE* queue, void* context_filter, if (ret != 0) { break; } + /* buffer flushed: restart indexing to avoid writing + * past multi_req.req[CAVIUM_MAX_POLL] */ + req_count = 0; } #else #if defined(HAVE_INTEL_QA) From 60194562f6a400c5a8a637f947f9312363ef1387 Mon Sep 17 00:00:00 2001 From: David Garske Date: Mon, 29 Jun 2026 10:59:35 -0700 Subject: [PATCH 
3/7] Intel QAT: fall back to regular memory when crypto service not started --- wolfcrypt/src/port/intel/quickassist.c | 7 +++++++ wolfcrypt/src/port/intel/quickassist_mem.c | 14 ++++++++++++-- wolfssl/wolfcrypt/port/intel/quickassist_mem.h | 4 ++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/wolfcrypt/src/port/intel/quickassist.c b/wolfcrypt/src/port/intel/quickassist.c index bd3c84b81c9..cff5381b160 100644 --- a/wolfcrypt/src/port/intel/quickassist.c +++ b/wolfcrypt/src/port/intel/quickassist.c @@ -323,6 +323,13 @@ void IntelQaHardwareStop(void) printf("IntelQA: Stop\n"); } +/* Returns nonzero when the QAT crypto service is running. Lets the memory + * layer fall back to regular memory when the service is not started. */ +int IntelQaIsStarted(void) +{ + return (g_cyServiceStarted == CPA_TRUE) ? 1 : 0; +} + int IntelQaHardwareStart(const char* process_name, int limitDevAccess) { int ret = 0, i; diff --git a/wolfcrypt/src/port/intel/quickassist_mem.c b/wolfcrypt/src/port/intel/quickassist_mem.c index 7d44572a930..f58f6d3fa5c 100644 --- a/wolfcrypt/src/port/intel/quickassist_mem.c +++ b/wolfcrypt/src/port/intel/quickassist_mem.c @@ -410,6 +410,14 @@ static void* _qaeMemAlloc(size_t size, void* heap, int type ptr = qaeMemAllocNUMA((Cpa32U)(size + sizeof(qaeMemHeader)), 0, alignment, &page_offset); #endif + /* Service not up (e.g. "Running without async"): fall back to regular + * memory so software crypto can proceed. A NULL while it IS up means + * real NUMA exhaustion and stays NULL so the QAT op fails cleanly. 
*/ + if (ptr == NULL && !IntelQaIsStarted()) { + isNuma = 0; + page_offset = QAE_NOT_NUMA_PAGE; + ptr = malloc(size + sizeof(qaeMemHeader)); + } } else { isNuma = 0; @@ -611,8 +619,10 @@ void* IntelQaRealloc(void *ptr, size_t size, void* heap, int type copySize = size; XMEMCPY(newPtr, ptr, copySize); - if (newIsNuma == 0 && ptrIsNuma == 0) { - /* for non-NUMA, treat as normal REALLOC and free old pointer */ + if (ptrIsNuma == 0) { + /* old pointer is a qae-headed non-NUMA buffer (software + * fallback) -- free it. Caller-owned (ptrIsNuma == -1) and + * NUMA (== 1) buffers are handled by their own paths. */ _qaeMemFree(ptr, heap, type #ifdef WOLFSSL_DEBUG_MEMORY , func, line diff --git a/wolfssl/wolfcrypt/port/intel/quickassist_mem.h b/wolfssl/wolfcrypt/port/intel/quickassist_mem.h index d5b569548d9..a52ddb4b611 100644 --- a/wolfssl/wolfcrypt/port/intel/quickassist_mem.h +++ b/wolfssl/wolfcrypt/port/intel/quickassist_mem.h @@ -59,6 +59,10 @@ WOLFSSL_API void* IntelQaRealloc(void *ptr, size_t size, void* heap, int type #endif ); +/* Nonzero when the QAT crypto service is running (defined in quickassist.c). + * Lets the memory layer fall back to regular memory when QAT is not up. 
*/ +WOLFSSL_LOCAL int IntelQaIsStarted(void); + #endif /* HAVE_INTEL_QA */ #endif /* _QUICKASSIST_MEM_H_ */ From 7965e09386b63181a047871963168902a90b0f26 Mon Sep 17 00:00:00 2001 From: David Garske Date: Mon, 29 Jun 2026 10:59:55 -0700 Subject: [PATCH 4/7] Intel QAT: fix heap argument in RSA public free --- wolfcrypt/src/port/intel/quickassist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wolfcrypt/src/port/intel/quickassist.c b/wolfcrypt/src/port/intel/quickassist.c index cff5381b160..4832f023d24 100644 --- a/wolfcrypt/src/port/intel/quickassist.c +++ b/wolfcrypt/src/port/intel/quickassist.c @@ -1737,7 +1737,7 @@ static void IntelQaRsaPublicFree(WC_ASYNC_DEV* dev) } if (outBuf) { if (outBuf->pData) { - XFREE(outBuf->pData, dev, DYNAMIC_TYPE_ASYNC_NUMA64); + XFREE(outBuf->pData, dev->heap, DYNAMIC_TYPE_ASYNC_NUMA64); outBuf->pData = NULL; } XMEMSET(outBuf, 0, sizeof(CpaFlatBuffer)); From 90b0e03e822afcedde7d4914a5c9508e4d534004 Mon Sep 17 00:00:00 2001 From: David Garske Date: Mon, 29 Jun 2026 10:59:55 -0700 Subject: [PATCH 5/7] Intel QAT: interleave instances across devices for multi-device utilization --- wolfcrypt/src/port/intel/quickassist.c | 89 +++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 2 deletions(-) diff --git a/wolfcrypt/src/port/intel/quickassist.c b/wolfcrypt/src/port/intel/quickassist.c index 4832f023d24..585d6808781 100644 --- a/wolfcrypt/src/port/intel/quickassist.c +++ b/wolfcrypt/src/port/intel/quickassist.c @@ -323,13 +323,93 @@ void IntelQaHardwareStop(void) printf("IntelQA: Stop\n"); } -/* Returns nonzero when the QAT crypto service is running. Lets the memory - * layer fall back to regular memory when the service is not started. */ +/* Returns nonzero when the QAT crypto service is running. 
The memory layer + * uses this to decide whether a failed NUMA allocation should fall back to + * regular memory (service not started -> software mode) or remain NULL (real + * NUMA exhaustion while the device is in use). */ int IntelQaIsStarted(void) { return (g_cyServiceStarted == CPA_TRUE) ? 1 : 0; } +#ifndef QAT_NO_DEV_INTERLEAVE +/* Reorder the instance handle list so instances interleave across QAT + * devices (packages). cpaCyGetInstances() returns them grouped by device + * (all of dev0, then dev1, ...). The per-thread round-robin assignment in + * IntelQaInit() (devId = g_instCounter % g_numInstances) would then pile a + * thread count lower than the instance count onto the first device(s), + * leaving later devices idle. Interleaving makes consecutive devIds land on + * different devices, so low/mid thread counts spread over every device. + * Order is unchanged when only one device is present, and at thread counts + * >= g_numInstances every instance is used either way. Non-fatal: on any + * failure the original order is kept. 
*/ +static void IntelQaInterleaveInstances(CpaInstanceHandle* instances, + Cpa16U count) +{ + CpaInstanceHandle* sorted; + CpaInstanceInfo2* info; + Cpa16U* pkg; + CpaStatus status; + int i, p, idx, seen, round, maxPkg, out; + + if (instances == NULL || count <= 1) { + return; + } + + sorted = (CpaInstanceHandle*)XMALLOC(sizeof(CpaInstanceHandle) * count, + NULL, DYNAMIC_TYPE_ASYNC); + pkg = (Cpa16U*)XMALLOC(sizeof(Cpa16U) * count, NULL, DYNAMIC_TYPE_ASYNC); + info = (CpaInstanceInfo2*)XMALLOC(sizeof(CpaInstanceInfo2), NULL, + DYNAMIC_TYPE_ASYNC); + if (sorted == NULL || pkg == NULL || info == NULL) { + XFREE(sorted, NULL, DYNAMIC_TYPE_ASYNC); + XFREE(pkg, NULL, DYNAMIC_TYPE_ASYNC); + XFREE(info, NULL, DYNAMIC_TYPE_ASYNC); + return; + } + + /* record each instance's device (package) id */ + maxPkg = 0; + for (i = 0; i < (int)count; i++) { + status = cpaCyInstanceGetInfo2(instances[i], info); + pkg[i] = (status == CPA_STATUS_SUCCESS) ? + info->physInstId.packageId : 0; + if ((int)pkg[i] > maxPkg) { + maxPkg = (int)pkg[i]; + } + } + + /* emit one instance per device per round: dev0#0, dev1#0, dev2#0, + * dev0#1, dev1#1, ... 
*/ + out = 0; + for (round = 0; round < (int)count && out < (int)count; round++) { + for (p = 0; p <= maxPkg && out < (int)count; p++) { + seen = 0; + for (idx = 0; idx < (int)count; idx++) { + if ((int)pkg[idx] == p) { + if (seen == round) { + sorted[out++] = instances[idx]; + break; + } + seen++; + } + } + } + } + + /* only apply if every instance was placed exactly once */ + if (out == (int)count) { + for (i = 0; i < (int)count; i++) { + instances[i] = sorted[i]; + } + } + + XFREE(sorted, NULL, DYNAMIC_TYPE_ASYNC); + XFREE(pkg, NULL, DYNAMIC_TYPE_ASYNC); + XFREE(info, NULL, DYNAMIC_TYPE_ASYNC); +} +#endif /* QAT_NO_DEV_INTERLEAVE */ + int IntelQaHardwareStart(const char* process_name, int limitDevAccess) { int ret = 0, i; @@ -414,6 +494,11 @@ int IntelQaHardwareStart(const char* process_name, int limitDevAccess) ret = INVALID_DEVID; goto error; } +#ifndef QAT_NO_DEV_INTERLEAVE + /* spread instances across devices for better multi-device utilization */ + IntelQaInterleaveInstances(g_cyInstances, g_numInstances); +#endif + /* start all instances */ g_cyServiceStarted = CPA_TRUE; for (i=0; i Date: Mon, 29 Jun 2026 11:00:01 -0700 Subject: [PATCH 6/7] Intel QAT: serialize build/check under --with-intelqa; update port README --- Makefile.am | 12 + wolfcrypt/src/port/intel/README.md | 367 +++++++---------------------- 2 files changed, 103 insertions(+), 276 deletions(-) diff --git a/Makefile.am b/Makefile.am index 4f3f8fce0c9..9144c0c717c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -25,6 +25,18 @@ ignore_files = SUBDIRS_OPT = DIST_SUBDIRS_OPT = +# Serialize the build when Intel QuickAssist is enabled. Concurrent QAT user +# processes (the parallel test binaries that `make -j check` launches) exhaust +# the device's crypto instances and usdm contiguous memory, so the test phase +# must run serially. With the non-recursive build this disables -j for the +# whole invocation, which also matches the QAT driver's build guidance. 
+if BUILD_INTEL_QA +.NOTPARALLEL: +endif +if BUILD_INTEL_QA_SYNC +.NOTPARALLEL: +endif + # allow supplementary or override flags to be passed at make time: AM_CPPFLAGS += $(EXTRA_CPPFLAGS) AM_CFLAGS += $(EXTRA_CFLAGS) diff --git a/wolfcrypt/src/port/intel/README.md b/wolfcrypt/src/port/intel/README.md index 4d8aefd526b..0ce6dfcf310 100644 --- a/wolfcrypt/src/port/intel/README.md +++ b/wolfcrypt/src/port/intel/README.md @@ -64,9 +64,70 @@ make ## Usage -Running wolfCrypt test and benchmark must be done with `sudo` to allow hardware access. By default the QuickAssist code uses the "SSL" process name via `QAT_PROCESS_NAME` in quickassist.h to match up to the hardware configuration. +Running wolfCrypt test and benchmark requires access to the QAT hardware. By default the QuickAssist code uses the "SSL" process name via `QAT_PROCESS_NAME` in quickassist.h to match up to the hardware configuration. The device configuration file is per device and named for the device type, for example `/etc/c6xx_dev0.conf` (older docs reference `dh895xcc_qa_dev0.conf`). -Note: `sudo make check` will fail since default QAT configuration doesn't allow multiple concurrent processes to use hardware. You can run each of the make check scripts individually with sudo. The hardware configuration can be customized by editing the `QAT/build/dh895xcc_qa_dev0.conf` file to allow multiple processes. +### Running without sudo + +Recent QAT driver installs ship a udev rule (`/etc/udev/rules.d/00-qat.rules`) that assigns the `qat` group to `/dev/usdm_drv`, `/dev/qat_dev_processes`, `/dev/qat_adf_ctl` and `/dev/hugepages/qat`. 
To use the hardware as a normal user, add yourself to that group and start a fresh login shell: + +```sh +sudo usermod -aG qat $USER +# log out/in (or 'newgrp qat'), then verify: +ls -l /dev/usdm_drv /dev/qat_dev_processes # should be group 'qat', mode 0660 +./wolfcrypt/test/testwolfcrypt # no sudo; prints 'IntelQA: Instances N' +``` + +If `testwolfcrypt` prints `Could not start qae mem for user space (status -2)` and `Running without async`, the usdm memory driver is not accessible (group not applied yet, or the `usdm_drv` module is not loaded). Crypto then runs in software (the QAT NUMA allocator falls back to regular memory when the device is not up); bring the driver up to get hardware acceleration. + +### make check + +When configured with `--with-intelqa`, the build (and so `make check`) is serialized automatically (`.NOTPARALLEL` in `Makefile.am`), because the QAT driver cannot serve the many concurrent user processes a `make -j` test run would launch. A single test still runs a server and a client, so raise `NumProcesses` in the `[SSL]` section of each `/etc/<device>_dev<N>.conf` (default `1`, e.g. to 3) and restart the driver, then run `make check` normally: + +```sh +sudo systemctl restart qat # clean usdm state +make check +``` + +Note: against a healthy QAT (boot-time hugepages reserved; see the diagnostics +section below) `testsuite/testsuite.test` passes with all instances up and no +`-173`. The remaining `resume.test` / `unit.test` flakiness is QAT +contiguous-memory exhaustion, not a code defect: each test launches a fresh +server+client, and once the usdm pool fragments those processes fail SAL init +(`Lac_MemPoolCreate ... contiguous chunk`) and fall back to software or time +out. Runtime `vm.nr_hugepages` cannot reliably reserve a large pool once memory +is fragmented, so boot-time hugepages are required for back-to-back QAT tests. 
+ +Two real code bugs were found and fixed in this change: the software-fallback +NUMA allocation bug (`-142`/`-140`, crypto failed instead of running in +software when the device could not be opened), and a TLS 1.3 hybrid PQC server +key-share async bug that produced `SSL_connect -173`. The latter only triggers +when the ECDH key generation completes synchronously and only the shared-secret +offloads/suspends ("B-first" ordering, which is what QAT does): the server then +dropped the ML-KEM ciphertext and sent only the 65-byte ECDH public key. To +reproduce deterministically without QAT hardware, build with +`--enable-asynccrypt --enable-asynccrypt-sw` and make `wc_AsyncSwInit()` return +0 for `ASYNC_SW_ECC_MAKE` (keygen synchronous, shared-secret still suspends), +then run a hybrid-group (e.g. P256+ML-KEM-768) TLS 1.3 handshake; the default +simulator does A-then-B ordering and does not exercise this path. + +### Diagnosing / probing QAT health + +If wolfCrypt prints `Could not start sal for user space`, `SalCtrl_ServiceInit Failed to initialise all service instances`, or falls back to `Running without async`, the user-space SAL could not bring up the device. Useful probes: + +```sh +sudo adf_ctl status # per-device kernel state (look for 'state: down') +sudo dmesg | grep -iE 'c6xx|qat|heartbeat|ras|reset|orphan' # device events +lspci -s 09:00.0 -vv | grep -iE 'UESta|CESta' # PCIe Advanced Error Reporting (a '+' = real bus error) +cat /sys/module/usdm_drv/parameters/max_huge_pages # usdm hugepage mode +grep HugePages_Total /proc/meminfo # actually-reserved hugepages +ls /sys/kernel/debug/qat_c6xx_<BDF>/ # heartbeat, fw_counters, etc. +``` + +Common causes seen in practice: + +- Hugepage mismatch: `usdm_drv` loaded with `max_huge_pages` > 0 but `HugePages_Total` is smaller (or 0) -- every QAT process then fails its SAL memory init. 
Either reserve enough hugepages at boot (GRUB `default_hugepagesz=2M hugepagesz=2M hugepages=N`) or reload `usdm_drv` with `max_huge_pages=0` (non-hugepage contiguous memory). Runtime `sysctl vm.nr_hugepages=N` is unreliable once memory is fragmented. +- Orphan rings: `dmesg` shows `Process ... exit with orphan rings`. A QAT process exited (often killed by a test timeout) without an orderly `wolfAsync_HardwareStop`/`icp_sal_userStop`, leaking instance rings. These accumulate (especially under `make -j` test runs) until the device's instance pool is exhausted and new processes can no longer init. Recover with a full driver-stack reload (`systemctl stop qat; rmmod usdm_drv qat_c62x intel_qat; modprobe qat_c62x; modprobe usdm_drv; systemctl start qat`) or a reboot. Always shut down QAT-using processes cleanly; avoid `make -j` (see make check above). +- PCIe Advanced Error Reporting errors (`UESta`/`CESta` flags set) or `dmesg` heartbeat/RAS/reset messages indicate a real firmware hang or bus error; with `AutoResetOnError = 0` (default in the device conf) the device stays down until a manual `adf_ctl reset`. Set `AutoResetOnError = 1` in `/etc/<device>_dev*.conf` for self-healing. Here are some build options for tuning your use: @@ -80,6 +141,7 @@ Here are some build options for tuning your use: 8. `WC_NO_ASYNC_THREADING`: Disables the thread affinity code for optionally linking a thread to a specific QAT instance. To use this feature you must also define `WC_ASYNC_THREAD_BIND`. 9. `WC_ASYNC_BENCH_THREAD_COUNT`: Use specific number of threads for benchmarking. 10. `QAT_HASH_ENABLE_PARTIAL`: Enables partial hashing support, which allows sending blocks to hardware prior to final. Otherwise all hash updates are cached. +11. `QAT_NO_DEV_INTERLEAVE`: Disables interleaving crypto instances across devices. 
By default the instance list is reordered so consecutive threads land on different QAT devices, so thread counts below the total instance count still exercise every device instead of filling the first one. The QuickAssist v1.6 driver uses its own memory management system in `quickassist_mem.c`. This can be tuned using the following defines: @@ -114,285 +176,38 @@ IntelQA: Instances 2 RSA test passed! ``` -### wolfCrypt Benchmark with QAT 8970 (multi-threaded) +### wolfCrypt Benchmark with QAT -Multiple concurrent threads will be started based on the number of CPU's available. If you want to exclude the software benchmarks use `./configure CFLAGS="-DNO_SW_BENCH"`. +Multiple concurrent threads are started based on the number of CPUs available. To exclude the software benchmarks use `./configure CFLAGS="-DNO_SW_BENCH"`. -``` -Intel QuickAssist DH8950 on Intel(R) Xeon(R) CPU E5-2678 v3 @ 2.50GHz: - -Recommended wolfSSL build options when benchmarking. -$ ./configure --enable-sp --enable-sp-asm --enable-aesni --enable-intelasm --enable-intelrand --enable-keygen --enable-sha3 --enable-asynccrypt --with-intelqa=../QAT CFLAGS="-DWC_ASYNC_THRESH_NONE -DQAT_MAX_PENDING=40 -DWC_ASYNC_BENCH_THREAD_COUNT=2" -$ make - -$ sudo ./wolfcrypt/benchmark/benchmark -rsa_sign -base10 -threads 2 -print ------------------------------------------------------------------------------- - wolfSSL version 4.5.0 ------------------------------------------------------------------------------- -IntelQA: Instances 18 -wolfCrypt Benchmark (block bytes 1048576, min 1.0 sec each) -CPUs: 2 -RNG SW 79 mB took 1.030 seconds, 76.388 mB/s Cycles per byte = 32.65 -RNG SW 79 mB took 1.042 seconds, 75.456 mB/s Cycles per byte = 33.05 -AES-128-CBC-enc SW 729 mB took 1.006 seconds, 724.266 mB/s Cycles per byte = 3.44 -AES-128-CBC-enc SW 729 mB took 1.007 seconds, 723.825 mB/s Cycles per byte = 3.45 -AES-128-CBC-dec SW 5185 mB took 1.000 seconds, 5184.260 mB/s Cycles per byte = 0.48 -AES-128-CBC-dec SW 5190 mB 
took 1.000 seconds, 5189.351 mB/s Cycles per byte = 0.48 -AES-192-CBC-enc SW 608 mB took 1.003 seconds, 606.175 mB/s Cycles per byte = 4.11 -AES-192-CBC-enc SW 608 mB took 1.004 seconds, 605.855 mB/s Cycles per byte = 4.12 -AES-192-CBC-dec SW 4325 mB took 1.000 seconds, 4325.333 mB/s Cycles per byte = 0.58 -AES-192-CBC-dec SW 4331 mB took 1.001 seconds, 4325.809 mB/s Cycles per byte = 0.58 -AES-256-CBC-enc SW 524 mB took 1.005 seconds, 521.465 mB/s Cycles per byte = 4.78 -AES-256-CBC-enc SW 524 mB took 1.006 seconds, 521.190 mB/s Cycles per byte = 4.79 -AES-256-CBC-dec SW 3707 mB took 1.000 seconds, 3705.767 mB/s Cycles per byte = 0.67 -AES-256-CBC-dec SW 3707 mB took 1.001 seconds, 3703.024 mB/s Cycles per byte = 0.67 -AES-128-CBC-enc HW 2443 mB took 1.000 seconds, 2442.819 mB/s Cycles per byte = 1.02 -AES-128-CBC-enc HW 2443 mB took 1.000 seconds, 2442.770 mB/s Cycles per byte = 1.02 -AES-128-CBC-dec HW 2380 mB took 1.001 seconds, 2378.716 mB/s Cycles per byte = 1.05 -AES-128-CBC-dec HW 2380 mB took 1.001 seconds, 2378.657 mB/s Cycles per byte = 1.05 -AES-192-CBC-enc HW 2365 mB took 1.002 seconds, 2359.520 mB/s Cycles per byte = 1.06 -AES-192-CBC-enc HW 2365 mB took 1.002 seconds, 2359.471 mB/s Cycles per byte = 1.06 -AES-192-CBC-dec HW 2417 mB took 1.002 seconds, 2411.874 mB/s Cycles per byte = 1.03 -AES-192-CBC-dec HW 2417 mB took 1.002 seconds, 2411.831 mB/s Cycles per byte = 1.03 -AES-256-CBC-enc HW 2223 mB took 1.001 seconds, 2221.082 mB/s Cycles per byte = 1.12 -AES-256-CBC-enc HW 2218 mB took 1.001 seconds, 2215.793 mB/s Cycles per byte = 1.13 -AES-256-CBC-dec HW 2113 mB took 1.002 seconds, 2108.506 mB/s Cycles per byte = 1.18 -AES-256-CBC-dec HW 2113 mB took 1.002 seconds, 2108.354 mB/s Cycles per byte = 1.18 -AES-128-GCM-enc SW 1919 mB took 1.001 seconds, 1916.366 mB/s Cycles per byte = 1.30 -AES-128-GCM-enc SW 2595 mB took 1.001 seconds, 2591.465 mB/s Cycles per byte = 0.96 -AES-128-GCM-dec SW 2611 mB took 1.000 seconds, 2610.093 mB/s Cycles per byte = 
0.96 -AES-128-GCM-dec SW 2218 mB took 1.002 seconds, 2213.073 mB/s Cycles per byte = 1.13 -AES-192-GCM-enc SW 2317 mB took 1.001 seconds, 2315.896 mB/s Cycles per byte = 1.08 -AES-192-GCM-enc SW 2286 mB took 1.002 seconds, 2281.953 mB/s Cycles per byte = 1.09 -AES-192-GCM-dec SW 2207 mB took 1.001 seconds, 2206.098 mB/s Cycles per byte = 1.13 -AES-192-GCM-dec SW 1589 mB took 1.002 seconds, 1586.020 mB/s Cycles per byte = 1.57 -AES-256-GCM-enc SW 2071 mB took 1.001 seconds, 2069.342 mB/s Cycles per byte = 1.21 -AES-256-GCM-enc SW 2108 mB took 1.002 seconds, 2103.268 mB/s Cycles per byte = 1.19 -AES-256-GCM-dec SW 2108 mB took 1.001 seconds, 2105.715 mB/s Cycles per byte = 1.18 -AES-256-GCM-dec SW 2108 mB took 1.002 seconds, 2103.563 mB/s Cycles per byte = 1.19 -AES-128-GCM-enc HW 2427 mB took 1.002 seconds, 2422.522 mB/s Cycles per byte = 1.03 -AES-128-GCM-enc HW 2433 mB took 1.002 seconds, 2427.722 mB/s Cycles per byte = 1.03 -AES-128-GCM-dec HW 1861 mB took 1.001 seconds, 1860.039 mB/s Cycles per byte = 1.34 -AES-128-GCM-dec HW 1861 mB took 1.001 seconds, 1860.019 mB/s Cycles per byte = 1.34 -AES-192-GCM-enc HW 2380 mB took 1.000 seconds, 2379.218 mB/s Cycles per byte = 1.05 -AES-192-GCM-enc HW 2386 mB took 1.000 seconds, 2384.418 mB/s Cycles per byte = 1.05 -AES-192-GCM-dec HW 1971 mB took 1.002 seconds, 1966.480 mB/s Cycles per byte = 1.27 -AES-192-GCM-dec HW 1971 mB took 1.002 seconds, 1966.458 mB/s Cycles per byte = 1.27 -AES-256-GCM-enc HW 2254 mB took 1.002 seconds, 2249.535 mB/s Cycles per byte = 1.11 -AES-256-GCM-enc HW 2254 mB took 1.002 seconds, 2249.487 mB/s Cycles per byte = 1.11 -AES-256-GCM-dec HW 1746 mB took 1.001 seconds, 1744.049 mB/s Cycles per byte = 1.43 -AES-256-GCM-dec HW 1746 mB took 1.001 seconds, 1744.018 mB/s Cycles per byte = 1.43 -CHACHA SW 1478 mB took 1.000 seconds, 1478.220 mB/s Cycles per byte = 1.69 -CHACHA SW 1347 mB took 1.003 seconds, 1342.833 mB/s Cycles per byte = 1.86 -CHA-POLY SW 949 mB took 1.002 seconds, 946.915 mB/s 
Cycles per byte = 2.63 -CHA-POLY SW 949 mB took 1.005 seconds, 944.670 mB/s Cycles per byte = 2.64 -MD5 SW 603 mB took 1.003 seconds, 601.383 mB/s Cycles per byte = 4.15 -MD5 SW 613 mB took 1.005 seconds, 610.413 mB/s Cycles per byte = 4.09 -MD5 HW 409 mB took 1.002 seconds, 408.088 mB/s Cycles per byte = 6.11 -MD5 HW 409 mB took 1.003 seconds, 407.845 mB/s Cycles per byte = 6.12 -POLY1305 SW 2621 mB took 1.000 seconds, 2620.709 mB/s Cycles per byte = 0.95 -POLY1305 SW 2616 mB took 1.001 seconds, 2613.824 mB/s Cycles per byte = 0.95 -SHA SW 377 mB took 1.003 seconds, 376.342 mB/s Cycles per byte = 6.63 -SHA SW 383 mB took 1.011 seconds, 378.592 mB/s Cycles per byte = 6.59 -SHA HW 535 mB took 1.005 seconds, 531.941 mB/s Cycles per byte = 4.69 -SHA HW 535 mB took 1.006 seconds, 531.644 mB/s Cycles per byte = 4.69 -SHA-224 SW 351 mB took 1.010 seconds, 347.715 mB/s Cycles per byte = 7.17 -SHA-224 SW 351 mB took 1.014 seconds, 346.285 mB/s Cycles per byte = 7.20 -SHA-224 HW 414 mB took 1.012 seconds, 409.434 mB/s Cycles per byte = 6.09 -SHA-224 HW 419 mB took 1.012 seconds, 414.387 mB/s Cycles per byte = 6.02 -SHA-256 SW 351 mB took 1.011 seconds, 347.292 mB/s Cycles per byte = 7.18 -SHA-256 SW 315 mB took 1.013 seconds, 310.424 mB/s Cycles per byte = 8.03 -SHA-256 HW 419 mB took 1.004 seconds, 417.688 mB/s Cycles per byte = 5.97 -SHA-256 HW 419 mB took 1.005 seconds, 417.427 mB/s Cycles per byte = 5.98 -SHA-384 SW 530 mB took 1.001 seconds, 529.040 mB/s Cycles per byte = 4.71 -SHA-384 SW 530 mB took 1.003 seconds, 528.139 mB/s Cycles per byte = 4.72 -SHA-384 HW 357 mB took 1.001 seconds, 356.156 mB/s Cycles per byte = 7.00 -SHA-384 HW 367 mB took 1.010 seconds, 363.498 mB/s Cycles per byte = 6.86 -SHA-512 SW 530 mB took 1.002 seconds, 528.589 mB/s Cycles per byte = 4.72 -SHA-512 SW 446 mB took 1.009 seconds, 441.540 mB/s Cycles per byte = 5.65 -SHA-512 HW 367 mB took 1.004 seconds, 365.434 mB/s Cycles per byte = 6.83 -SHA-512 HW 367 mB took 1.005 seconds, 365.224 mB/s 
Cycles per byte = 6.83 -SHA3-224 SW 236 mB took 1.014 seconds, 232.784 mB/s Cycles per byte = 10.71 -SHA3-224 SW 236 mB took 1.018 seconds, 231.794 mB/s Cycles per byte = 10.76 -SHA3-224 HW 220 mB took 1.006 seconds, 218.860 mB/s Cycles per byte = 11.40 -SHA3-224 HW 236 mB took 1.015 seconds, 232.538 mB/s Cycles per byte = 10.73 -SHA3-256 SW 163 mB took 1.000 seconds, 162.463 mB/s Cycles per byte = 15.35 -SHA3-256 SW 225 mB took 1.023 seconds, 220.278 mB/s Cycles per byte = 11.32 -SHA3-256 HW 692 mB took 1.004 seconds, 689.291 mB/s Cycles per byte = 3.62 -SHA3-256 HW 692 mB took 1.007 seconds, 687.092 mB/s Cycles per byte = 3.63 -SHA3-384 SW 173 mB took 1.022 seconds, 169.214 mB/s Cycles per byte = 14.74 -SHA3-384 SW 173 mB took 1.024 seconds, 168.878 mB/s Cycles per byte = 14.77 -SHA3-384 HW 173 mB took 1.023 seconds, 169.202 mB/s Cycles per byte = 14.74 -SHA3-384 HW 173 mB took 1.024 seconds, 168.948 mB/s Cycles per byte = 14.76 -SHA3-512 SW 121 mB took 1.026 seconds, 117.548 mB/s Cycles per byte = 21.22 -SHA3-512 SW 121 mB took 1.027 seconds, 117.375 mB/s Cycles per byte = 21.25 -SHA3-512 HW 121 mB took 1.026 seconds, 117.585 mB/s Cycles per byte = 21.21 -SHA3-512 HW 121 mB took 1.028 seconds, 117.335 mB/s Cycles per byte = 21.26 -HMAC-MD5 SW 608 mB took 1.000 seconds, 608.096 mB/s Cycles per byte = 4.10 -HMAC-MD5 SW 613 mB took 1.004 seconds, 611.102 mB/s Cycles per byte = 4.08 -HMAC-MD5 HW 414 mB took 1.001 seconds, 413.762 mB/s Cycles per byte = 6.03 -HMAC-MD5 HW 414 mB took 1.004 seconds, 412.554 mB/s Cycles per byte = 6.05 -HMAC-SHA SW 383 mB took 1.011 seconds, 378.446 mB/s Cycles per byte = 6.59 -HMAC-SHA SW 383 mB took 1.013 seconds, 377.729 mB/s Cycles per byte = 6.60 -HMAC-SHA HW 535 mB took 1.008 seconds, 530.760 mB/s Cycles per byte = 4.70 -HMAC-SHA HW 514 mB took 1.009 seconds, 509.292 mB/s Cycles per byte = 4.90 -HMAC-SHA224 SW 267 mB took 1.008 seconds, 265.316 mB/s Cycles per byte = 9.40 -HMAC-SHA224 SW 351 mB took 1.012 seconds, 346.982 mB/s 
Cycles per byte = 7.19 -HMAC-SHA224 HW 404 mB took 1.003 seconds, 402.579 mB/s Cycles per byte = 6.20 -HMAC-SHA224 HW 393 mB took 1.011 seconds, 388.951 mB/s Cycles per byte = 6.41 -HMAC-SHA256 SW 294 mB took 1.007 seconds, 291.426 mB/s Cycles per byte = 8.56 -HMAC-SHA256 SW 351 mB took 1.012 seconds, 347.205 mB/s Cycles per byte = 7.18 -HMAC-SHA256 HW 419 mB took 1.004 seconds, 417.677 mB/s Cycles per byte = 5.97 -HMAC-SHA256 HW 419 mB took 1.009 seconds, 415.514 mB/s Cycles per byte = 6.00 -HMAC-SHA384 SW 530 mB took 1.002 seconds, 528.479 mB/s Cycles per byte = 4.72 -HMAC-SHA384 SW 530 mB took 1.007 seconds, 526.093 mB/s Cycles per byte = 4.74 -HMAC-SHA384 HW 367 mB took 1.004 seconds, 365.498 mB/s Cycles per byte = 6.82 -HMAC-SHA384 HW 367 mB took 1.006 seconds, 364.878 mB/s Cycles per byte = 6.84 -HMAC-SHA512 SW 530 mB took 1.002 seconds, 528.616 mB/s Cycles per byte = 4.72 -HMAC-SHA512 SW 530 mB took 1.006 seconds, 526.513 mB/s Cycles per byte = 4.74 -HMAC-SHA512 HW 367 mB took 1.003 seconds, 365.816 mB/s Cycles per byte = 6.82 -HMAC-SHA512 HW 367 mB took 1.007 seconds, 364.560 mB/s Cycles per byte = 6.84 -RSA 1024 key gen SW 40 ops took 1.191 sec, avg 29.780 ms, 33.580 ops/sec -RSA 1024 key gen SW 40 ops took 1.428 sec, avg 35.694 ms, 28.016 ops/sec -RSA 2048 key gen SW 40 ops took 4.154 sec, avg 103.853 ms, 9.629 ops/sec -RSA 2048 key gen SW 40 ops took 5.687 sec, avg 142.172 ms, 7.034 ops/sec -RSA 1024 key gen HW 120 ops took 1.064 sec, avg 8.866 ms, 112.790 ops/sec -RSA 1024 key gen HW 120 ops took 1.072 sec, avg 8.932 ms, 111.953 ops/sec -RSA 2048 key gen HW 40 ops took 1.389 sec, avg 34.717 ms, 28.804 ops/sec -RSA 2048 key gen HW 40 ops took 1.437 sec, avg 35.935 ms, 27.828 ops/sec -RSA 2048 sign SW 1000 ops took 1.046 sec, avg 1.046 ms, 956.197 ops/sec -RSA 2048 sign SW 1000 ops took 1.052 sec, avg 1.052 ms, 950.320 ops/sec -RSA 2048 verify SW 32300 ops took 1.001 sec, avg 0.031 ms, 32271.670 ops/sec -RSA 2048 verify SW 32200 ops took 1.003 sec, avg 
0.031 ms, 32117.110 ops/sec -RSA 2048 sign HW 12300 ops took 1.001 sec, avg 0.081 ms, 12288.056 ops/sec -RSA 2048 sign HW 19600 ops took 1.003 sec, avg 0.051 ms, 19537.967 ops/sec -RSA 2048 verify HW 116000 ops took 1.000 sec, avg 0.009 ms, 115971.935 ops/sec -RSA 2048 verify HW 118000 ops took 1.000 sec, avg 0.008 ms, 117962.707 ops/sec -DH 2048 key gen SW 2080 ops took 1.000 sec, avg 0.481 ms, 2079.830 ops/sec -DH 2048 key gen SW 2120 ops took 1.016 sec, avg 0.479 ms, 2086.548 ops/sec -DH 2048 agree SW 2100 ops took 1.023 sec, avg 0.487 ms, 2053.478 ops/sec -DH 2048 agree SW 2100 ops took 1.026 sec, avg 0.489 ms, 2046.644 ops/sec -DH 2048 key gen HW 43720 ops took 1.000 sec, avg 0.023 ms, 43712.257 ops/sec -DH 2048 key gen HW 43320 ops took 1.000 sec, avg 0.023 ms, 43299.560 ops/sec -DH 2048 agree HW 32500 ops took 1.001 sec, avg 0.031 ms, 32471.874 ops/sec -DH 2048 agree HW 39400 ops took 1.001 sec, avg 0.025 ms, 39351.757 ops/sec -ECC 256 key gen SW 41320 ops took 1.001 sec, avg 0.024 ms, 41298.692 ops/sec -ECC 256 key gen SW 41280 ops took 1.001 sec, avg 0.024 ms, 41258.674 ops/sec -ECC 256 key gen HW 41320 ops took 1.000 sec, avg 0.024 ms, 41309.127 ops/sec -ECC 256 key gen HW 41280 ops took 1.001 sec, avg 0.024 ms, 41244.118 ops/sec -ECDHE 256 agree SW 13400 ops took 1.005 sec, avg 0.075 ms, 13328.731 ops/sec -ECDHE 256 agree SW 13300 ops took 1.006 sec, avg 0.076 ms, 13221.465 ops/sec -ECDSA 256 sign SW 29900 ops took 1.002 sec, avg 0.034 ms, 29841.744 ops/sec -ECDSA 256 sign SW 30000 ops took 1.003 sec, avg 0.033 ms, 29910.091 ops/sec -ECDSA 256 verify SW 10700 ops took 1.006 sec, avg 0.094 ms, 10641.471 ops/sec -ECDSA 256 verify SW 10700 ops took 1.009 sec, avg 0.094 ms, 10604.105 ops/sec -ECDHE 256 agree HW 26600 ops took 1.000 sec, avg 0.038 ms, 26594.522 ops/sec -ECDHE 256 agree HW 19000 ops took 1.002 sec, avg 0.053 ms, 18964.479 ops/sec -ECDSA 256 sign HW 22300 ops took 1.001 sec, avg 0.045 ms, 22286.137 ops/sec -ECDSA 256 sign HW 22000 ops took 
1.002 sec, avg 0.046 ms, 21963.146 ops/sec -ECDSA 256 verify HW 12600 ops took 1.002 sec, avg 0.080 ms, 12569.531 ops/sec -ECDSA 256 verify HW 12600 ops took 1.005 sec, avg 0.080 ms, 12542.829 ops/sec -Benchmark complete -RNG SW 151.844 mB/s -AES-128-CBC-enc SW 1448.090 mB/s -AES-128-CBC-dec SW 10373.612 mB/s -AES-192-CBC-enc SW 1212.030 mB/s -AES-192-CBC-dec SW 8651.141 mB/s -AES-256-CBC-enc SW 1042.655 mB/s -AES-256-CBC-dec SW 7408.791 mB/s -AES-128-CBC-enc HW 4885.588 mB/s -AES-128-CBC-dec HW 4757.373 mB/s -AES-192-CBC-enc HW 4718.991 mB/s -AES-192-CBC-dec HW 4823.705 mB/s -AES-256-CBC-enc HW 4436.875 mB/s -AES-256-CBC-dec HW 4216.860 mB/s -AES-128-GCM-enc SW 4507.831 mB/s -AES-128-GCM-dec SW 4823.166 mB/s -AES-192-GCM-enc SW 4597.849 mB/s -AES-192-GCM-dec SW 3792.119 mB/s -AES-256-GCM-enc SW 4172.610 mB/s -AES-256-GCM-dec SW 4209.278 mB/s -AES-128-GCM-enc HW 4850.244 mB/s -AES-128-GCM-dec HW 3720.058 mB/s -AES-192-GCM-enc HW 4763.636 mB/s -AES-192-GCM-dec HW 3932.937 mB/s -AES-256-GCM-enc HW 4499.022 mB/s -AES-256-GCM-dec HW 3488.068 mB/s -CHACHA SW 2821.053 mB/s -CHA-POLY SW 1891.585 mB/s -MD5 SW 1211.796 mB/s -MD5 HW 815.933 mB/s -POLY1305 SW 5234.533 mB/s -SHA SW 754.934 mB/s -SHA HW 1063.586 mB/s -SHA-224 SW 694.001 mB/s -SHA-224 HW 823.821 mB/s -SHA-256 SW 657.716 mB/s -SHA-256 HW 835.115 mB/s -SHA-384 SW 1057.178 mB/s -SHA-384 HW 719.655 mB/s -SHA-512 SW 970.129 mB/s -SHA-512 HW 730.657 mB/s -SHA3-224 SW 464.579 mB/s -SHA3-224 HW 451.398 mB/s -SHA3-256 SW 382.741 mB/s -SHA3-256 HW 1376.382 mB/s -SHA3-384 SW 338.092 mB/s -SHA3-384 HW 338.150 mB/s -SHA3-512 SW 234.923 mB/s -SHA3-512 HW 234.921 mB/s -HMAC-MD5 SW 1219.198 mB/s -HMAC-MD5 HW 826.316 mB/s -HMAC-SHA SW 756.175 mB/s -HMAC-SHA HW 1040.052 mB/s -HMAC-SHA224 SW 612.297 mB/s -HMAC-SHA224 HW 791.530 mB/s -HMAC-SHA256 SW 638.631 mB/s -HMAC-SHA256 HW 833.191 mB/s -HMAC-SHA384 SW 1054.571 mB/s -HMAC-SHA384 HW 730.376 mB/s -HMAC-SHA512 SW 1055.130 mB/s -HMAC-SHA512 HW 730.377 mB/s -RSA 1024 key gen SW 
61.596 ops/sec -RSA 2048 key gen SW 16.663 ops/sec -RSA 1024 key gen HW 224.743 ops/sec -RSA 2048 key gen HW 56.632 ops/sec -RSA 2048 sign SW 1906.517 ops/sec -RSA 2048 verify SW 64388.780 ops/sec -RSA 2048 sign HW 31826.022 ops/sec -RSA 2048 verify HW 233934.642 ops/sec -DH 2048 key gen SW 4166.378 ops/sec -DH 2048 agree SW 4100.122 ops/sec -DH 2048 key gen HW 87011.816 ops/sec -DH 2048 agree HW 71823.630 ops/sec -ECC 256 key gen SW 82557.366 ops/sec -ECC 256 key gen HW 82553.245 ops/sec -ECDHE 256 agree SW 26550.196 ops/sec -ECDSA 256 sign SW 59751.835 ops/sec -ECDSA 256 verify SW 21245.576 ops/sec -ECDHE 256 agree HW 45559.001 ops/sec -ECDSA 256 sign HW 44249.283 ops/sec -ECDSA 256 verify HW 25112.360 ops/sec -IntelQA: Stop -``` +To fully utilize a multi-device system, use at least as many benchmark threads as there are crypto instances (`-threads N`, where N >= the "IntelQA: Instances" count), since each thread is bound to one instance. Instances are spread across devices by default (`IntelQaInterleaveInstances`, disable with `QAT_NO_DEV_INTERLEAVE`), so a thread count lower than the instance count, as long as it is at least the device count, still exercises every device rather than filling the first one. For maximum throughput also raise the in-flight depth with `CFLAGS="-DQAT_MAX_PENDING=40 -DWC_ASYNC_THRESH_NONE"`. Example on a 3-device (18-instance) system: `-threads 18` or higher. -### wolfCrypt Benchmark with QAT (single-threaded) +#### Latest measured performance (3x Intel C62x) -To use the benchmark tool against hardware in single threaded mode build the library with `CFLAGS="-DWC_NO_ASYNC_THREADING"`. +Host: Intel Core i9-14900K; 3x Intel C62x Chipset QuickAssist Technology (rev 04), 18 crypto instances; wolfSSL 5.9.1; `./configure --enable-asynccrypt --with-intelqa=../QAT`.
-``` -sudo ./wolfcrypt/benchmark/benchmark -rsa_sign -dh -ecc -IntelQA: Instances 2 -wolfCrypt Benchmark (block bytes 1048576, min 1.0 sec each) -RSA 2048 public HW 161000 ops took 1.000 sec, avg 0.006 ms, 160989.829 ops/sec -RSA 2048 private HW 18600 ops took 1.002 sec, avg 0.054 ms, 18566.416 ops/sec -DH 2048 key gen HW 48945 ops took 1.000 sec, avg 0.020 ms, 48931.782 ops/sec -DH 2048 agree HW 43300 ops took 1.001 sec, avg 0.023 ms, 43248.876 ops/sec -ECDHE 256 agree HW 26400 ops took 1.001 sec, avg 0.038 ms, 26382.639 ops/sec -ECDSA 256 sign HW 23900 ops took 1.004 sec, avg 0.042 ms, 23810.849 ops/sec -ECDSA 256 verify HW 13800 ops took 1.000 sec, avg 0.072 ms, 13799.878 ops/sec -IntelQA: Stop -``` +Public-key throughput aggregated across all 3 devices (`-threads 18`, hardware): + +| algorithm | HW ops/sec | +|---|---| +| RSA-2048 private | 76,865 | +| RSA-2048 public | 29,428 | +| ECDSA-256 sign | 45,152 | +| ECDSA-256 verify | 72,200 | +| ECDHE-256 agree | 100,514 | +| DH-2048 agree | 141,390 | +| DH-2048 key gen | 58,418 | + +Bulk AES throughput, single instance (`-threads 1`, hardware). On this AES-NI host the software AES path is faster, so QAT AES is gated behind size thresholds (`WC_ASYNC_THRESH_*`) and `WC_ASYNC_NO_HASH` is recommended; the offload value here is the public-key work above. + +| algorithm | enc MB/s | dec MB/s | +|---|---|---| +| AES-128-CBC | 552 | 619 | +| AES-256-CBC | 411 | 443 | +| AES-128-GCM | 256 | 126 | +| AES-256-GCM | 219 | 117 | + +Note: higher-thread AES on this host hits usdm contiguous-memory exhaustion without boot-time hugepages (see the diagnostics section above); the public-key benchmarks use small buffers and scale cleanly to all 18 instances. 
### wolfSSL Asynchronous Test Mode From f51c76da0dbe25c786068b94ec5a00fd8477062a Mon Sep 17 00:00:00 2001 From: David Garske Date: Tue, 30 Jun 2026 10:00:00 -0700 Subject: [PATCH 7/7] Intel QAT: add async hybrid PQC server key share regression test --- tests/api/test_tls13.c | 60 +++++++++++++++++++++++++++++++++++++++ tests/api/test_tls13.h | 4 ++- tests/utils.c | 14 +++++++++ wolfcrypt/src/async.c | 12 ++++++++ wolfssl/wolfcrypt/async.h | 4 +++ 5 files changed, 93 insertions(+), 1 deletion(-) diff --git a/tests/api/test_tls13.c b/tests/api/test_tls13.c index e4d5c3821dc..2dd088d5133 100644 --- a/tests/api/test_tls13.c +++ b/tests/api/test_tls13.c @@ -5488,6 +5488,66 @@ int test_tls13_pqc_hybrid_malformed_ecdh(void) return EXPECT_RESULT(); } +/* Regression test for the async hybrid PQC server key share. Drives a full + * TLS 1.3 P-256 + ML-KEM-768 handshake through the software async simulator + * while forcing the server's ECDH keygen to complete synchronously so that + * only the ECDH shared-secret derivation suspends. This "B-first" ordering is + * what Intel QAT exhibits and previously dropped the server's KEM ciphertext, + * failing the handshake with SSL_connect -173. 
*/ +int test_tls13_pqc_hybrid_async_server(void) +{ + EXPECT_DECLS; +#if defined(WOLFSSL_TLS13) && defined(WOLFSSL_ASYNC_CRYPT) && \ + defined(WOLFSSL_ASYNC_CRYPT_SW) && \ + defined(HAVE_MANUAL_MEMIO_TESTS_DEPENDENCIES) && \ + defined(WOLFSSL_HAVE_MLKEM) && defined(WOLFSSL_PQC_HYBRIDS) && \ + !defined(WOLFSSL_MLKEM_NO_ENCAPSULATE) && \ + !defined(WOLFSSL_MLKEM_NO_MAKE_KEY) && \ + !defined(WOLFSSL_MLKEM_NO_DECAPSULATE) && \ + !defined(WOLFSSL_NO_ML_KEM_768) && defined(HAVE_ECC) && \ + !defined(NO_WOLFSSL_CLIENT) && !defined(NO_WOLFSSL_SERVER) && \ + (!defined(NO_ECC256) || defined(HAVE_ALL_CURVES)) && !defined(NO_ECC_SECP) + struct test_memio_ctx test_ctx; + WOLFSSL_CTX *ctx_c = NULL, *ctx_s = NULL; + WOLFSSL *ssl_c = NULL, *ssl_s = NULL; + int group = WOLFSSL_SECP256R1MLKEM768; + int devId = INVALID_DEVID; + + /* Open the software async device so the handshake runs the async path. */ + ExpectIntEQ(wolfAsync_DevOpen(&devId), 0); + + XMEMSET(&test_ctx, 0, sizeof(test_ctx)); + ExpectIntEQ(test_memio_setup(&test_ctx, &ctx_c, &ctx_s, &ssl_c, &ssl_s, + wolfTLSv1_3_client_method, wolfTLSv1_3_server_method), 0); + + ExpectIntEQ(wolfSSL_SetDevId(ssl_c, devId), WOLFSSL_SUCCESS); + ExpectIntEQ(wolfSSL_SetDevId(ssl_s, devId), WOLFSSL_SUCCESS); + + /* Negotiate the P-256 + ML-KEM-768 hybrid group on both ends. */ + ExpectIntEQ(wolfSSL_set_groups(ssl_c, &group, 1), WOLFSSL_SUCCESS); + ExpectIntEQ(wolfSSL_set_groups(ssl_s, &group, 1), WOLFSSL_SUCCESS); + + /* Force the server's ECDH keygen to run synchronously so that only the + * ECDH shared-secret derivation suspends (the QAT "B-first" ordering). */ + wolfAsync_SwForceSyncType(ASYNC_SW_ECC_MAKE); + + ExpectIntEQ(test_memio_do_handshake(ssl_c, ssl_s, 20, NULL), 0); + + /* Server selected and completed the hybrid group end-to-end. */ + ExpectIntEQ(ssl_s->namedGroup, WOLFSSL_SECP256R1MLKEM768); + + /* Restore default simulator ordering for subsequent tests. 
*/ + wolfAsync_SwForceSyncType(ASYNC_SW_NONE); + + wolfSSL_free(ssl_c); + wolfSSL_CTX_free(ctx_c); + wolfSSL_free(ssl_s); + wolfSSL_CTX_free(ctx_s); + wolfAsync_DevClose(&devId); +#endif + return EXPECT_RESULT(); +} + /* Test that a TLS 1.3 NewSessionTicket with a ticket shorter than ID_LEN * (32 bytes) does not cause an unsigned integer underflow / OOB read in * SetTicket. Uses a full memio handshake, then injects a crafted diff --git a/tests/api/test_tls13.h b/tests/api/test_tls13.h index e16c97e1656..f96f8daaded 100644 --- a/tests/api/test_tls13.h +++ b/tests/api/test_tls13.h @@ -84,6 +84,7 @@ int test_tls13_AEAD_limit_KU_aes128_gcm_sha256(void); int test_tls13_AEAD_limit_KU_aes256_gcm_sha384(void); int test_tls13_AEAD_limit_KU_aes128_ccm_sha256(void); int test_tls13_AEAD_limit_KU_aes128_ccm_8_sha256(void); +int test_tls13_pqc_hybrid_async_server(void); #define TEST_TLS13_DECLS \ TEST_DECL_GROUP("tls13", test_tls13_apis), \ @@ -145,6 +146,7 @@ int test_tls13_AEAD_limit_KU_aes128_ccm_8_sha256(void); TEST_DECL_GROUP("tls13", test_tls13_AEAD_limit_KU_aes128_gcm_sha256), \ TEST_DECL_GROUP("tls13", test_tls13_AEAD_limit_KU_aes256_gcm_sha384), \ TEST_DECL_GROUP("tls13", test_tls13_AEAD_limit_KU_aes128_ccm_sha256), \ - TEST_DECL_GROUP("tls13", test_tls13_AEAD_limit_KU_aes128_ccm_8_sha256) + TEST_DECL_GROUP("tls13", test_tls13_AEAD_limit_KU_aes128_ccm_8_sha256), \ + TEST_DECL_GROUP("tls13", test_tls13_pqc_hybrid_async_server) #endif /* WOLFCRYPT_TEST_TLS13_H */ diff --git a/tests/utils.c b/tests/utils.c index 447746d277f..a8566adc86e 100644 --- a/tests/utils.c +++ b/tests/utils.c @@ -186,6 +186,13 @@ int test_memio_do_handshake(WOLFSSL *ssl_c, WOLFSSL *ssl_s, if (err == WC_NO_ERR_TRACE(MP_WOULDBLOCK)) { /* retry non-blocking math */ } + #ifdef WOLFSSL_ASYNC_CRYPT + else if (err == WC_NO_ERR_TRACE(WC_PENDING_E)) { + ret = wolfSSL_AsyncPoll(ssl_c, WOLF_POLL_FLAG_CHECK_HW); + if (ret < 0) + return -1; + } + #endif else if (err != WOLFSSL_ERROR_WANT_READ && err != 
WOLFSSL_ERROR_WANT_WRITE) { char buff[WOLFSSL_MAX_ERROR_SZ]; @@ -207,6 +214,13 @@ int test_memio_do_handshake(WOLFSSL *ssl_c, WOLFSSL *ssl_s, if (err == WC_NO_ERR_TRACE(MP_WOULDBLOCK)) { /* retry non-blocking math */ } + #ifdef WOLFSSL_ASYNC_CRYPT + else if (err == WC_NO_ERR_TRACE(WC_PENDING_E)) { + ret = wolfSSL_AsyncPoll(ssl_s, WOLF_POLL_FLAG_CHECK_HW); + if (ret < 0) + return -1; + } + #endif else if (err != WOLFSSL_ERROR_WANT_READ && err != WOLFSSL_ERROR_WANT_WRITE) { char buff[WOLFSSL_MAX_ERROR_SZ]; diff --git a/wolfcrypt/src/async.c b/wolfcrypt/src/async.c index 7ae35b39012..a7016b48082 100644 --- a/wolfcrypt/src/async.c +++ b/wolfcrypt/src/async.c @@ -67,6 +67,15 @@ static WC_ASYNC_DEV* wolfAsync_GetDev(WOLF_EVENT* event) /* Allow way to have async SW code included, and disabled at run-time */ static int wolfAsyncSwDisabled = 0; /* default off */ +/* Test hook: an op of this type runs synchronously instead of suspending. + * Default ASYNC_SW_NONE means no type is forced (no real op uses NONE). */ +static int wolfAsyncSwForceSyncType = ASYNC_SW_NONE; + +void wolfAsync_SwForceSyncType(int type) +{ + wolfAsyncSwForceSyncType = type; +} + static int wolfAsync_DoSw(WC_ASYNC_DEV* asyncDev) { @@ -312,6 +321,9 @@ int wc_AsyncSwInit(WC_ASYNC_DEV* dev, int type) if (dev) { WC_ASYNC_SW* sw = &dev->sw; if (sw->type == ASYNC_SW_NONE) { + /* Test hook: force this op type to run synchronously. */ + if (type == wolfAsyncSwForceSyncType) + return 0; sw->type = type; return 1; } diff --git a/wolfssl/wolfcrypt/async.h b/wolfssl/wolfcrypt/async.h index c2f9163e2cb..f29aee62127 100644 --- a/wolfssl/wolfcrypt/async.h +++ b/wolfssl/wolfcrypt/async.h @@ -427,6 +427,10 @@ WOLFSSL_API int wc_AsyncSleep(word32 ms); #ifdef WOLFSSL_ASYNC_CRYPT_SW WOLFSSL_API int wc_AsyncSwInit(WC_ASYNC_DEV* dev, int type); + /* Test hook: force the given WC_ASYNC_SW_TYPE to complete synchronously + * (do not suspend) so the software simulator can reproduce a specific + * suspend ordering. 
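Pass ASYNC_SW_NONE to disable. The setting is one file-scope value shared by every software async device and is written without locking. */ + WOLFSSL_API void wolfAsync_SwForceSyncType(int type); #endif /* Pthread Helpers */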