From 1e17695e25e1d4a7af714a26a65bfad7c19b85eb Mon Sep 17 00:00:00 2001 From: Felix-Gong Date: Fri, 5 Jun 2026 15:03:16 +0000 Subject: [PATCH 1/2] Add RISC-V Zvbc vector CRC32C acceleration Implement CRC32C using Zvbc vector carry-less multiplication (vclmul/vclmulh RVV intrinsics). Processes 4 lanes of 128-bit folding per iteration (64 bytes), with 4-to-1 lane reduction and Barrett reduction for finalization. - Add rv_crc32c_vclmul() using vclmul/vclmulh intrinsics - Add isZvbc() runtime detection via /proc/cpuinfo - Add WITH_RISCV_ZVBC cmake option - Fix macro guards: support __riscv_zvbc without __riscv_zbc - Zvbc preferred over Zbc in Choose_Extend() --- CMakeLists.txt | 5 +- src/butil/crc32c.cc | 212 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 212 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e74007b66..3567a6054d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,7 +173,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")) # RISC-V specific optimizations option(WITH_RISCV_ZBC "Enable RISC-V Zbc carry-less multiplication for CRC32C acceleration" OFF) - if(WITH_RISCV_ZBC) + option(WITH_RISCV_ZVBC "Enable RISC-V Zvbc vector carry-less multiplication for CRC32C acceleration" OFF) + if(WITH_RISCV_ZVBC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc_zvbc") + elseif(WITH_RISCV_ZBC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc") diff --git a/src/butil/crc32c.cc b/src/butil/crc32c.cc index 7de07cf428..143d6fb0ef 100644 --- a/src/butil/crc32c.cc +++ b/src/butil/crc32c.cc @@ -421,8 +421,11 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { return static_cast(l ^ 0xffffffffu); } -#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc) +#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc)) #include +#if defined(__riscv_zvbc) +#include +#endif // RISC-V Zbc carry-less multiplication inline helpers static inline uint64_t rv_clmul(uint64_t a, uint64_t b) { @@ -488,6 +491,7 @@ static const uint64_t crc32c_fold_const[4] __attribute__((aligned(16))) = { // Hardware-accelerated CRC32C using RISC-V Zbc carry-less multiplication. // Processes data in 64-byte chunks with 128-bit folding, then Barrett reduces. +#if defined(__riscv_zbc) static uint32_t rv_crc32c_clmul(uint32_t crc, const char* buf, size_t len) { // Convert external CRC to internal register state crc ^= 0xFFFFFFFF; @@ -580,6 +584,7 @@ static uint32_t rv_crc32c_clmul(uint32_t crc, const char* buf, size_t len) { // Convert internal register state to external CRC return c ^ 0xFFFFFFFF; } +#endif // __riscv_zbc // Runtime detection: check if RISC-V CPU supports Zbc extension static bool isZbc() { @@ -604,8 +609,195 @@ static bool isZbc() { }(); return zbc_supported; } + +#if defined(__riscv_zvbc) +// Hardware-accelerated CRC32C using RISC-V Zvbc vector carry-less multiplication. +// Uses RVV vclmul/vclmulh to process 2 lanes per vector operation (VLEN=128). +// With VLEN=128, each vector register holds 2 x 64-bit elements. +// 4 lanes are processed using 2 vector register pairs per clmul step. +static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) { + crc ^= 0xFFFFFFFF; + + const uint8_t* p = reinterpret_cast(buf); + size_t n = len; + + if (n < 64) { + return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF; + } + + // Align to 16-byte boundary + uintptr_t mis = (uintptr_t)p & 0xF; + if (mis) { + size_t pre = 16 - mis; + if (pre > n) pre = n; + crc = rv_crc32c_bitwise(crc, p, pre); + p += pre; + n -= pre; + if (n < 64) { + return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF; + } + } + + // Set up RVV for 64-bit elements: vl = min(VLEN/64, 2) = 2 for VLEN=128 + size_t vl = __riscv_vsetvl_e64m1(2); + + // Construct fold constant vectors: {k1, k2} and {k3, k4} + // Each element gets the appropriate constant for its position: + // element 0 (lo half) uses k1/k3, element 1 (hi half) uses k2/k4 + uint64_t k12_arr[2] = { crc32c_fold_const[0], crc32c_fold_const[1] }; + uint64_t k34_arr[2] = { crc32c_fold_const[2], crc32c_fold_const[3] }; + vuint64m1_t k12_vec = __riscv_vle64_v_u64m1(k12_arr, vl); // {k1, k2} + vuint64m1_t k34_vec = __riscv_vle64_v_u64m1(k34_arr, vl); // {k3, k4} + + // Load first 64 bytes into 4 vector registers. + // Each vector = one 128-bit lane: {lo_64, hi_64} + vuint64m1_t lane1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl); + vuint64m1_t lane2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl); + vuint64m1_t lane3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl); + vuint64m1_t lane4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl); + + // XOR CRC into element 0 of first lane + uint64_t tmp[2]; + __riscv_vse64_v_u64m1(tmp, lane1, vl); + tmp[0] ^= (uint64_t)crc; + lane1 = __riscv_vle64_v_u64m1(tmp, vl); + + p += 64; + n -= 64; + + // Main loop: fold 64 bytes per iteration using vector carry-less multiply. + // + // For each 128-bit lane {lo, hi}, the fold computes: + // new_lo = clmul(lo, k1) ^ clmul(hi, k2) ^ data_lo + // new_hi = clmulh(lo, k1) ^ clmulh(hi, k2) ^ data_hi + // + // With k12_vec = {k1, k2} and element-wise vclmul: + // vclmul(lane, k12_vec) = {clmul(lo, k1), clmul(hi, k2)} (lo halves of products) + // vclmulh(lane, k12_vec) = {clmulh(lo, k1), clmulh(hi, k2)} (hi halves of products) + // + // The 128-bit XOR of (lo*k1) and (hi*k2) decomposes element-wise: + // new_lo = clmul(lo,k1) ^ clmul(hi,k2) = vclmul[0] ^ vclmul[1] + // new_hi = clmulh(lo,k1) ^ clmulh(hi,k2) = vclmulh[0] ^ vclmulh[1] + // + // So we need to XOR across elements. With VLEN=128 (2 elements), we use + // scalar extraction for the cross-element XOR since there's no vector + // permute instruction for just 2 elements that's more efficient. + while (n >= 64) { + vuint64m1_t d1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl); + vuint64m1_t d2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl); + vuint64m1_t d3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl); + vuint64m1_t d4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl); + + // Fold each lane using vector clmul with {k1, k2} + uint64_t lo_r[2], hi_r[2], d_r[2]; + + // Lane 1 + __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane1, k12_vec, vl), vl); + __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane1, k12_vec, vl), vl); + __riscv_vse64_v_u64m1(d_r, d1, vl); + d_r[0] ^= lo_r[0] ^ lo_r[1]; + d_r[1] ^= hi_r[0] ^ hi_r[1]; + lane1 = __riscv_vle64_v_u64m1(d_r, vl); + + // Lane 2 + __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane2, k12_vec, vl), vl); + __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane2, k12_vec, vl), vl); + __riscv_vse64_v_u64m1(d_r, d2, vl); + d_r[0] ^= lo_r[0] ^ lo_r[1]; + d_r[1] ^= hi_r[0] ^ hi_r[1]; + lane2 = __riscv_vle64_v_u64m1(d_r, vl); + + // Lane 3 + __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane3, k12_vec, vl), vl); + __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane3, k12_vec, vl), vl); + __riscv_vse64_v_u64m1(d_r, d3, vl); + d_r[0] ^= lo_r[0] ^ lo_r[1]; + d_r[1] ^= hi_r[0] ^ hi_r[1]; + lane3 = __riscv_vle64_v_u64m1(d_r, vl); + + // Lane 4 + __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane4, k12_vec, vl), vl); + __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane4, k12_vec, vl), vl); + __riscv_vse64_v_u64m1(d_r, d4, vl); + d_r[0] ^= lo_r[0] ^ lo_r[1]; + d_r[1] ^= hi_r[0] ^ hi_r[1]; + lane4 = __riscv_vle64_v_u64m1(d_r, vl); + + p += 64; + n -= 64; + } + + // Reduce 4 lanes to 1 using {k3, k4} + // Same fold pattern: fold lane_a into lane_b + #define FOLD_INTO(dst, src) do { \ + uint64_t _lo[2], _hi[2], _d[2]; \ + __riscv_vse64_v_u64m1(_lo, __riscv_vclmul_vv_u64m1(src, k34_vec, vl), vl); \ + __riscv_vse64_v_u64m1(_hi, __riscv_vclmulh_vv_u64m1(src, k34_vec, vl), vl); \ + __riscv_vse64_v_u64m1(_d, dst, vl); \ + _d[0] ^= _lo[0] ^ _lo[1]; \ + _d[1] ^= _hi[0] ^ _hi[1]; \ + dst = __riscv_vle64_v_u64m1(_d, vl); \ + } while(0) + + FOLD_INTO(lane2, lane1); // lane2 = fold(lane1) ^ lane2 + FOLD_INTO(lane3, lane2); // lane3 = fold(lane2) ^ lane3 + FOLD_INTO(lane4, lane3); // lane4 = fold(lane3) ^ lane4 + #undef FOLD_INTO + + // Extract final 128-bit state from vector register + uint64_t final_state[2]; + __riscv_vse64_v_u64m1(final_state, lane4, vl); + uint64_t x0 = final_state[0]; + uint64_t x1 = final_state[1]; + + // Barrett reduction: 128-bit -> 32-bit CRC (scalar) + uint64_t t4 = rv_clmul(x0, RV_CRC32C_CONST_1); + uint64_t t3 = rv_clmulh(x0, RV_CRC32C_CONST_1); + uint64_t t1 = x1 ^ t4; + t4 = t1 & RV_CRC32_MASK32; + t1 >>= 32; + uint64_t t0 = rv_clmul(t4, RV_CRC32C_CONST_0); + t3 = (t3 << 32) ^ t1 ^ t0; + + t4 = t3 & RV_CRC32_MASK32; + t4 = rv_clmul(t4, RV_CRC32C_CONST_QUO); + t4 &= RV_CRC32_MASK32; + t4 = rv_clmul(t4, RV_CRC32C_CONST_POLY); + t4 ^= t3; + + uint32_t c = (uint32_t)((t4 >> 32) & RV_CRC32_MASK32); + if (n) { + c = rv_crc32c_bitwise(c, p, n); + } + return c ^ 0xFFFFFFFF; +} + +// Runtime detection: check if RISC-V CPU supports Zvbc extension +static bool isZvbc() { + static const bool zvbc_supported = []() { + FILE* f = fopen("/proc/cpuinfo", "r"); + if (!f) return false; + bool supported = false; + char line[1024]; + while (fgets(line, sizeof(line), f)) { + if (strstr(line, "isa") || strstr(line, "hart isa")) { + char* colon = strchr(line, ':'); + if (colon) { + if (strstr(colon, "_zvbc") || strstr(colon, "zvbc")) { + supported = true; + break; + } + } + } + } + fclose(f); + return supported; + }(); + return zvbc_supported; } -#endif // __riscv && __riscv_xlen == 64 +#endif // __riscv_zvbc + +#endif // __riscv && __riscv_xlen == 64 && (__riscv_zbc || __riscv_zvbc) // Detect if SSE4.2 or not. #ifdef __SSE4_2__ @@ -629,10 +821,17 @@ static inline Function Choose_Extend() { return (Function)ExtendImpl; } #endif -#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc) +#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc)) +#if defined(__riscv_zvbc) + if (isZvbc()) { + return (Function)rv_crc32c_vclmul; + } +#endif +#if defined(__riscv_zbc) if (isZbc()) { return (Function)rv_crc32c_clmul; } +#endif #endif return (Function)ExtendImpl; } @@ -641,8 +840,13 @@ bool IsFastCrc32Supported() { #ifdef __SSE4_2__ if (isSSE42()) return true; #endif -#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc) +#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc)) +#if defined(__riscv_zvbc) + if (isZvbc()) return true; +#endif +#if defined(__riscv_zbc) if (isZbc()) return true; +#endif #endif return false; } From b92a25ac6373c92e833cb4dfbee7e68732cf42bf Mon Sep 17 00:00:00 2001 From: Felix-Gong Date: Sat, 6 Jun 2026 04:38:49 +0000 Subject: [PATCH 2/2] Address review comments: fix vl check, aliasing, isa matching, march flag - Add vl<2 fallback for VLEN<128 (prevents UB from uninitialized elements) - Use memcpy instead of uint8_t*-to-uint64_t* casts (strict-aliasing safe) - Tighten /proc/cpuinfo ISA matching to _zbc/_zvbc only (prevents _zvbc falsely matching zbc) - Add missing 'v' base extension in -march flag (rv64gcv_zbc_zvbc) --- CMakeLists.txt | 2 +- src/butil/crc32c.cc | 35 +++++++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3567a6054d..0478ecf898 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,7 +175,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") option(WITH_RISCV_ZBC "Enable RISC-V Zbc carry-less multiplication for CRC32C acceleration" OFF) option(WITH_RISCV_ZVBC "Enable RISC-V Zvbc vector carry-less multiplication for CRC32C acceleration" OFF) if(WITH_RISCV_ZVBC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc_zvbc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gcv_zbc_zvbc") elseif(WITH_RISCV_ZBC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc") else() diff --git a/src/butil/crc32c.cc b/src/butil/crc32c.cc index 143d6fb0ef..1a2fce1a26 100644 --- a/src/butil/crc32c.cc +++ b/src/butil/crc32c.cc @@ -597,7 +597,7 @@ static bool isZbc() { if (strstr(line, "isa") || strstr(line, "hart isa")) { char* colon = strchr(line, ':'); if (colon) { - if (strstr(colon, "_zbc") || strstr(colon, "zbc")) { + if (strstr(colon, "_zbc")) { supported = true; break; } @@ -639,7 +639,11 @@ static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) { } // Set up RVV for 64-bit elements: vl = min(VLEN/64, 2) = 2 for VLEN=128 + // If VLEN < 128, vl will be 1 and the vector path cannot be used; fall back. size_t vl = __riscv_vsetvl_e64m1(2); + if (vl < 2) { + return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF; + } // Construct fold constant vectors: {k1, k2} and {k3, k4} // Each element gets the appropriate constant for its position: @@ -651,10 +655,16 @@ static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) { // Load first 64 bytes into 4 vector registers. // Each vector = one 128-bit lane: {lo_64, hi_64} - vuint64m1_t lane1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl); - vuint64m1_t lane2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl); - vuint64m1_t lane3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl); - vuint64m1_t lane4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl); + // Use memcpy to avoid strict-aliasing violations when loading uint8_t* as uint64_t* + uint64_t lane1_buf[2], lane2_buf[2], lane3_buf[2], lane4_buf[2]; + memcpy(lane1_buf, p + 0, 16); + memcpy(lane2_buf, p + 16, 16); + memcpy(lane3_buf, p + 32, 16); + memcpy(lane4_buf, p + 48, 16); + vuint64m1_t lane1 = __riscv_vle64_v_u64m1(lane1_buf, vl); + vuint64m1_t lane2 = __riscv_vle64_v_u64m1(lane2_buf, vl); + vuint64m1_t lane3 = __riscv_vle64_v_u64m1(lane3_buf, vl); + vuint64m1_t lane4 = __riscv_vle64_v_u64m1(lane4_buf, vl); // XOR CRC into element 0 of first lane uint64_t tmp[2]; @@ -683,10 +693,15 @@ static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) { // scalar extraction for the cross-element XOR since there's no vector // permute instruction for just 2 elements that's more efficient. while (n >= 64) { - vuint64m1_t d1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl); - vuint64m1_t d2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl); - vuint64m1_t d3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl); - vuint64m1_t d4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl); + uint64_t d1_buf[2], d2_buf[2], d3_buf[2], d4_buf[2]; + memcpy(d1_buf, p + 0, 16); + memcpy(d2_buf, p + 16, 16); + memcpy(d3_buf, p + 32, 16); + memcpy(d4_buf, p + 48, 16); + vuint64m1_t d1 = __riscv_vle64_v_u64m1(d1_buf, vl); + vuint64m1_t d2 = __riscv_vle64_v_u64m1(d2_buf, vl); + vuint64m1_t d3 = __riscv_vle64_v_u64m1(d3_buf, vl); + vuint64m1_t d4 = __riscv_vle64_v_u64m1(d4_buf, vl); // Fold each lane using vector clmul with {k1, k2} uint64_t lo_r[2], hi_r[2], d_r[2]; @@ -783,7 +798,7 @@ static bool isZvbc() { if (strstr(line, "isa") || strstr(line, "hart isa")) { char* colon = strchr(line, ':'); if (colon) { - if (strstr(colon, "_zvbc") || strstr(colon, "zvbc")) { + if (strstr(colon, "_zvbc")) { supported = true; break; }