From 1e17695e25e1d4a7af714a26a65bfad7c19b85eb Mon Sep 17 00:00:00 2001
From: Felix-Gong <gongxiaofei24@iscas.ac.cn>
Date: Fri, 5 Jun 2026 15:03:16 +0000
Subject: [PATCH 1/2] Add RISC-V Zvbc vector CRC32C acceleration

Implement CRC32C using Zvbc vector carry-less multiplication
(vclmul/vclmulh RVV intrinsics). Processes 4 lanes of 128-bit
folding per iteration (64 bytes), with 4-to-1 lane reduction
and Barrett reduction for finalization.

- Add rv_crc32c_vclmul() using vclmul/vclmulh intrinsics
- Add isZvbc() runtime detection via /proc/cpuinfo
- Add WITH_RISCV_ZVBC cmake option
- Fix macro guards: support __riscv_zvbc without __riscv_zbc
- Zvbc preferred over Zbc in Choose_Extend()
---
 CMakeLists.txt      |   5 +-
 src/butil/crc32c.cc | 212 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 212 insertions(+), 5 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e74007b66..3567a6054d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -173,7 +173,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
     elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64"))
         # RISC-V specific optimizations
         option(WITH_RISCV_ZBC "Enable RISC-V Zbc carry-less multiplication for CRC32C acceleration" OFF)
-        if(WITH_RISCV_ZBC)
+        option(WITH_RISCV_ZVBC "Enable RISC-V Zvbc vector carry-less multiplication for CRC32C acceleration" OFF)
+        if(WITH_RISCV_ZVBC)
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc_zvbc")
+        elseif(WITH_RISCV_ZBC)
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc")
         else()
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc")
diff --git a/src/butil/crc32c.cc b/src/butil/crc32c.cc
index 7de07cf428..143d6fb0ef 100644
--- a/src/butil/crc32c.cc
+++ b/src/butil/crc32c.cc
@@ -421,8 +421,11 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
   return static_cast<uint32_t>(l ^ 0xffffffffu);
 }
 
-#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
+#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc))
 #include <stdio.h>
+#if defined(__riscv_zvbc)
+#include <riscv_vector.h>
+#endif
 
 // RISC-V Zbc carry-less multiplication inline helpers
 static inline uint64_t rv_clmul(uint64_t a, uint64_t b) {
@@ -488,6 +491,7 @@ static const uint64_t crc32c_fold_const[4] __attribute__((aligned(16))) = {
 
 // Hardware-accelerated CRC32C using RISC-V Zbc carry-less multiplication.
 // Processes data in 64-byte chunks with 128-bit folding, then Barrett reduces.
+#if defined(__riscv_zbc)
 static uint32_t rv_crc32c_clmul(uint32_t crc, const char* buf, size_t len) {
   // Convert external CRC to internal register state
   crc ^= 0xFFFFFFFF;
@@ -580,6 +584,7 @@ static uint32_t rv_crc32c_clmul(uint32_t crc, const char* buf, size_t len) {
   // Convert internal register state to external CRC
   return c ^ 0xFFFFFFFF;
 }
+#endif  // __riscv_zbc
 
 // Runtime detection: check if RISC-V CPU supports Zbc extension
 static bool isZbc() {
@@ -604,8 +609,195 @@ static bool isZbc() {
   }();
   return zbc_supported;
 }
+
+#if defined(__riscv_zvbc)
+// Hardware-accelerated CRC32C using RISC-V Zvbc vector carry-less multiplication.
+// Uses RVV vclmul/vclmulh to process 2 lanes per vector operation (VLEN=128).
+// With VLEN=128, each vector register holds 2 x 64-bit elements.
+// 4 lanes are processed using 2 vector register pairs per clmul step.
+static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) {
+  crc ^= 0xFFFFFFFF;
+
+  const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
+  size_t n = len;
+
+  if (n < 64) {
+    return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
+  }
+
+  // Align to 16-byte boundary
+  uintptr_t mis = (uintptr_t)p & 0xF;
+  if (mis) {
+    size_t pre = 16 - mis;
+    if (pre > n) pre = n;
+    crc = rv_crc32c_bitwise(crc, p, pre);
+    p += pre;
+    n -= pre;
+    if (n < 64) {
+      return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
+    }
+  }
+
+  // Set up RVV for 64-bit elements: vl = min(VLEN/64, 2) = 2 for VLEN=128
+  size_t vl = __riscv_vsetvl_e64m1(2);
+
+  // Construct fold constant vectors: {k1, k2} and {k3, k4}
+  // Each element gets the appropriate constant for its position:
+  // element 0 (lo half) uses k1/k3, element 1 (hi half) uses k2/k4
+  uint64_t k12_arr[2] = { crc32c_fold_const[0], crc32c_fold_const[1] };
+  uint64_t k34_arr[2] = { crc32c_fold_const[2], crc32c_fold_const[3] };
+  vuint64m1_t k12_vec = __riscv_vle64_v_u64m1(k12_arr, vl);  // {k1, k2}
+  vuint64m1_t k34_vec = __riscv_vle64_v_u64m1(k34_arr, vl);  // {k3, k4}
+
+  // Load first 64 bytes into 4 vector registers.
+  // Each vector = one 128-bit lane: {lo_64, hi_64}
+  vuint64m1_t lane1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl);
+  vuint64m1_t lane2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl);
+  vuint64m1_t lane3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl);
+  vuint64m1_t lane4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl);
+
+  // XOR CRC into element 0 of first lane
+  uint64_t tmp[2];
+  __riscv_vse64_v_u64m1(tmp, lane1, vl);
+  tmp[0] ^= (uint64_t)crc;
+  lane1 = __riscv_vle64_v_u64m1(tmp, vl);
+
+  p += 64;
+  n -= 64;
+
+  // Main loop: fold 64 bytes per iteration using vector carry-less multiply.
+  //
+  // For each 128-bit lane {lo, hi}, the fold computes:
+  //   new_lo = clmul(lo, k1) ^ clmul(hi, k2) ^ data_lo
+  //   new_hi = clmulh(lo, k1) ^ clmulh(hi, k2) ^ data_hi
+  //
+  // With k12_vec = {k1, k2} and element-wise vclmul:
+  //   vclmul(lane, k12_vec)  = {clmul(lo, k1), clmul(hi, k2)}   (lo halves of products)
+  //   vclmulh(lane, k12_vec) = {clmulh(lo, k1), clmulh(hi, k2)} (hi halves of products)
+  //
+  // The 128-bit XOR of (lo*k1) and (hi*k2) decomposes element-wise:
+  //   new_lo = clmul(lo,k1) ^ clmul(hi,k2) = vclmul[0] ^ vclmul[1]
+  //   new_hi = clmulh(lo,k1) ^ clmulh(hi,k2) = vclmulh[0] ^ vclmulh[1]
+  //
+  // So we need to XOR across elements. With VLEN=128 (2 elements), we use
+  // scalar extraction for the cross-element XOR since there's no vector
+  // permute instruction for just 2 elements that's more efficient.
+  while (n >= 64) {
+    vuint64m1_t d1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl);
+    vuint64m1_t d2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl);
+    vuint64m1_t d3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl);
+    vuint64m1_t d4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl);
+
+    // Fold each lane using vector clmul with {k1, k2}
+    uint64_t lo_r[2], hi_r[2], d_r[2];
+
+    // Lane 1
+    __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane1, k12_vec, vl), vl);
+    __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane1, k12_vec, vl), vl);
+    __riscv_vse64_v_u64m1(d_r, d1, vl);
+    d_r[0] ^= lo_r[0] ^ lo_r[1];
+    d_r[1] ^= hi_r[0] ^ hi_r[1];
+    lane1 = __riscv_vle64_v_u64m1(d_r, vl);
+
+    // Lane 2
+    __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane2, k12_vec, vl), vl);
+    __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane2, k12_vec, vl), vl);
+    __riscv_vse64_v_u64m1(d_r, d2, vl);
+    d_r[0] ^= lo_r[0] ^ lo_r[1];
+    d_r[1] ^= hi_r[0] ^ hi_r[1];
+    lane2 = __riscv_vle64_v_u64m1(d_r, vl);
+
+    // Lane 3
+    __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane3, k12_vec, vl), vl);
+    __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane3, k12_vec, vl), vl);
+    __riscv_vse64_v_u64m1(d_r, d3, vl);
+    d_r[0] ^= lo_r[0] ^ lo_r[1];
+    d_r[1] ^= hi_r[0] ^ hi_r[1];
+    lane3 = __riscv_vle64_v_u64m1(d_r, vl);
+
+    // Lane 4
+    __riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane4, k12_vec, vl), vl);
+    __riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane4, k12_vec, vl), vl);
+    __riscv_vse64_v_u64m1(d_r, d4, vl);
+    d_r[0] ^= lo_r[0] ^ lo_r[1];
+    d_r[1] ^= hi_r[0] ^ hi_r[1];
+    lane4 = __riscv_vle64_v_u64m1(d_r, vl);
+
+    p += 64;
+    n -= 64;
+  }
+
+  // Reduce 4 lanes to 1 using {k3, k4}
+  // Same fold pattern: fold lane_a into lane_b
+  #define FOLD_INTO(dst, src) do { \
+    uint64_t _lo[2], _hi[2], _d[2]; \
+    __riscv_vse64_v_u64m1(_lo, __riscv_vclmul_vv_u64m1(src, k34_vec, vl), vl); \
+    __riscv_vse64_v_u64m1(_hi, __riscv_vclmulh_vv_u64m1(src, k34_vec, vl), vl); \
+    __riscv_vse64_v_u64m1(_d, dst, vl); \
+    _d[0] ^= _lo[0] ^ _lo[1]; \
+    _d[1] ^= _hi[0] ^ _hi[1]; \
+    dst = __riscv_vle64_v_u64m1(_d, vl); \
+  } while(0)
+
+  FOLD_INTO(lane2, lane1);  // lane2 = fold(lane1) ^ lane2
+  FOLD_INTO(lane3, lane2);  // lane3 = fold(lane2) ^ lane3
+  FOLD_INTO(lane4, lane3);  // lane4 = fold(lane3) ^ lane4
+  #undef FOLD_INTO
+
+  // Extract final 128-bit state from vector register
+  uint64_t final_state[2];
+  __riscv_vse64_v_u64m1(final_state, lane4, vl);
+  uint64_t x0 = final_state[0];
+  uint64_t x1 = final_state[1];
+
+  // Barrett reduction: 128-bit -> 32-bit CRC (scalar)
+  uint64_t t4 = rv_clmul(x0, RV_CRC32C_CONST_1);
+  uint64_t t3 = rv_clmulh(x0, RV_CRC32C_CONST_1);
+  uint64_t t1 = x1 ^ t4;
+  t4 = t1 & RV_CRC32_MASK32;
+  t1 >>= 32;
+  uint64_t t0 = rv_clmul(t4, RV_CRC32C_CONST_0);
+  t3 = (t3 << 32) ^ t1 ^ t0;
+
+  t4 = t3 & RV_CRC32_MASK32;
+  t4 = rv_clmul(t4, RV_CRC32C_CONST_QUO);
+  t4 &= RV_CRC32_MASK32;
+  t4 = rv_clmul(t4, RV_CRC32C_CONST_POLY);
+  t4 ^= t3;
+
+  uint32_t c = (uint32_t)((t4 >> 32) & RV_CRC32_MASK32);
+  if (n) {
+    c = rv_crc32c_bitwise(c, p, n);
+  }
+  return c ^ 0xFFFFFFFF;
+}
+
+// Runtime detection: check if RISC-V CPU supports Zvbc extension
+static bool isZvbc() {
+  static const bool zvbc_supported = []() {
+    FILE* f = fopen("/proc/cpuinfo", "r");
+    if (!f) return false;
+    bool supported = false;
+    char line[1024];
+    while (fgets(line, sizeof(line), f)) {
+      if (strstr(line, "isa") || strstr(line, "hart isa")) {
+        char* colon = strchr(line, ':');
+        if (colon) {
+          if (strstr(colon, "_zvbc") || strstr(colon, "zvbc")) {
+            supported = true;
+            break;
+          }
+        }
+      }
+    }
+    fclose(f);
+    return supported;
+  }();
+  return zvbc_supported;
 }
-#endif  // __riscv && __riscv_xlen == 64
+#endif  // __riscv_zvbc
+
+#endif  // __riscv && __riscv_xlen == 64 && (__riscv_zbc || __riscv_zvbc)
 
 // Detect if SSE4.2 or not.
 #ifdef __SSE4_2__
@@ -629,10 +821,17 @@ static inline Function Choose_Extend() {
     return (Function)ExtendImpl<FastCRC32Functor>;
   }
 #endif
-#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
+#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc))
+#if defined(__riscv_zvbc)
+  if (isZvbc()) {
+    return (Function)rv_crc32c_vclmul;
+  }
+#endif
+#if defined(__riscv_zbc)
   if (isZbc()) {
     return (Function)rv_crc32c_clmul;
   }
+#endif
 #endif
   return (Function)ExtendImpl<SlowCRC32Functor>;
 }
@@ -641,8 +840,13 @@ bool IsFastCrc32Supported() {
 #ifdef __SSE4_2__
   if (isSSE42()) return true;
 #endif
-#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
+#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc))
+#if defined(__riscv_zvbc)
+  if (isZvbc()) return true;
+#endif
+#if defined(__riscv_zbc)
   if (isZbc()) return true;
+#endif
 #endif
   return false;
 }

From b92a25ac6373c92e833cb4dfbee7e68732cf42bf Mon Sep 17 00:00:00 2001
From: Felix-Gong <gongxiaofei24@iscas.ac.cn>
Date: Sat, 6 Jun 2026 04:38:49 +0000
Subject: [PATCH 2/2] Address review comments: fix vl check, aliasing, isa
 matching, march flag

- Add vl<2 fallback for VLEN<128 (prevents UB from uninitialized elements)
- Use memcpy instead of uint8_t*-to-uint64_t* casts (strict-aliasing safe)
- Tighten /proc/cpuinfo ISA matching to _zbc/_zvbc only (prevents _zvbc
  falsely matching zbc)
- Add missing 'v' base extension in -march flag (rv64gcv_zbc_zvbc)
---
 CMakeLists.txt      |  2 +-
 src/butil/crc32c.cc | 35 +++++++++++++++++++++++++----------
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3567a6054d..0478ecf898 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,7 +175,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
         option(WITH_RISCV_ZBC "Enable RISC-V Zbc carry-less multiplication for CRC32C acceleration" OFF)
         option(WITH_RISCV_ZVBC "Enable RISC-V Zvbc vector carry-less multiplication for CRC32C acceleration" OFF)
         if(WITH_RISCV_ZVBC)
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc_zvbc")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gcv_zbc_zvbc")
         elseif(WITH_RISCV_ZBC)
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc")
         else()
diff --git a/src/butil/crc32c.cc b/src/butil/crc32c.cc
index 143d6fb0ef..1a2fce1a26 100644
--- a/src/butil/crc32c.cc
+++ b/src/butil/crc32c.cc
@@ -597,7 +597,7 @@ static bool isZbc() {
       if (strstr(line, "isa") || strstr(line, "hart isa")) {
         char* colon = strchr(line, ':');
         if (colon) {
-          if (strstr(colon, "_zbc") || strstr(colon, "zbc")) {
+          if (strstr(colon, "_zbc")) {
             supported = true;
             break;
           }
@@ -639,7 +639,11 @@ static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) {
   }
 
   // Set up RVV for 64-bit elements: vl = min(VLEN/64, 2) = 2 for VLEN=128
+  // If VLEN < 128, vl will be 1 and the vector path cannot be used; fall back.
   size_t vl = __riscv_vsetvl_e64m1(2);
+  if (vl < 2) {
+    return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
+  }
 
   // Construct fold constant vectors: {k1, k2} and {k3, k4}
   // Each element gets the appropriate constant for its position:
@@ -651,10 +655,16 @@ static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) {
 
   // Load first 64 bytes into 4 vector registers.
   // Each vector = one 128-bit lane: {lo_64, hi_64}
-  vuint64m1_t lane1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl);
-  vuint64m1_t lane2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl);
-  vuint64m1_t lane3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl);
-  vuint64m1_t lane4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl);
+  // Use memcpy to avoid strict-aliasing violations when loading uint8_t* as uint64_t*
+  uint64_t lane1_buf[2], lane2_buf[2], lane3_buf[2], lane4_buf[2];
+  memcpy(lane1_buf, p + 0,  16);
+  memcpy(lane2_buf, p + 16, 16);
+  memcpy(lane3_buf, p + 32, 16);
+  memcpy(lane4_buf, p + 48, 16);
+  vuint64m1_t lane1 = __riscv_vle64_v_u64m1(lane1_buf, vl);
+  vuint64m1_t lane2 = __riscv_vle64_v_u64m1(lane2_buf, vl);
+  vuint64m1_t lane3 = __riscv_vle64_v_u64m1(lane3_buf, vl);
+  vuint64m1_t lane4 = __riscv_vle64_v_u64m1(lane4_buf, vl);
 
   // XOR CRC into element 0 of first lane
   uint64_t tmp[2];
@@ -683,10 +693,15 @@ static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) {
   // scalar extraction for the cross-element XOR since there's no vector
   // permute instruction for just 2 elements that's more efficient.
   while (n >= 64) {
-    vuint64m1_t d1 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 0), vl);
-    vuint64m1_t d2 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 16), vl);
-    vuint64m1_t d3 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 32), vl);
-    vuint64m1_t d4 = __riscv_vle64_v_u64m1((const uint64_t*)(p + 48), vl);
+    uint64_t d1_buf[2], d2_buf[2], d3_buf[2], d4_buf[2];
+    memcpy(d1_buf, p + 0,  16);
+    memcpy(d2_buf, p + 16, 16);
+    memcpy(d3_buf, p + 32, 16);
+    memcpy(d4_buf, p + 48, 16);
+    vuint64m1_t d1 = __riscv_vle64_v_u64m1(d1_buf, vl);
+    vuint64m1_t d2 = __riscv_vle64_v_u64m1(d2_buf, vl);
+    vuint64m1_t d3 = __riscv_vle64_v_u64m1(d3_buf, vl);
+    vuint64m1_t d4 = __riscv_vle64_v_u64m1(d4_buf, vl);
 
     // Fold each lane using vector clmul with {k1, k2}
     uint64_t lo_r[2], hi_r[2], d_r[2];
@@ -783,7 +798,7 @@ static bool isZvbc() {
       if (strstr(line, "isa") || strstr(line, "hart isa")) {
         char* colon = strchr(line, ':');
         if (colon) {
-          if (strstr(colon, "_zvbc") || strstr(colon, "zvbc")) {
+          if (strstr(colon, "_zvbc")) {
             supported = true;
             break;
           }