Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64"))
# RISC-V specific optimizations
option(WITH_RISCV_ZBC "Enable RISC-V Zbc carry-less multiplication for CRC32C acceleration" OFF)
if(WITH_RISCV_ZBC)
option(WITH_RISCV_ZVBC "Enable RISC-V Zvbc vector carry-less multiplication for CRC32C acceleration" OFF)
if(WITH_RISCV_ZVBC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gcv_zbc_zvbc")
elseif(WITH_RISCV_ZBC)
Comment on lines +177 to +179
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Changed -march=rv64gc_zbc_zvbc to -march=rv64gcv_zbc_zvbc to include the base RVV extension.

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc")
Expand Down
229 changes: 224 additions & 5 deletions src/butil/crc32c.cc
Original file line number Diff line number Diff line change
Expand Up @@ -421,8 +421,11 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
return static_cast<uint32_t>(l ^ 0xffffffffu);
}

#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc))
#include <stdio.h>
#if defined(__riscv_zvbc)
#include <riscv_vector.h>
#endif

// RISC-V Zbc carry-less multiplication inline helpers
static inline uint64_t rv_clmul(uint64_t a, uint64_t b) {
Expand Down Expand Up @@ -488,6 +491,7 @@ static const uint64_t crc32c_fold_const[4] __attribute__((aligned(16))) = {

// Hardware-accelerated CRC32C using RISC-V Zbc carry-less multiplication.
// Processes data in 64-byte chunks with 128-bit folding, then Barrett reduces.
#if defined(__riscv_zbc)
static uint32_t rv_crc32c_clmul(uint32_t crc, const char* buf, size_t len) {
// Convert external CRC to internal register state
crc ^= 0xFFFFFFFF;
Expand Down Expand Up @@ -580,6 +584,7 @@ static uint32_t rv_crc32c_clmul(uint32_t crc, const char* buf, size_t len) {
// Convert internal register state to external CRC
return c ^ 0xFFFFFFFF;
}
#endif // __riscv_zbc

// Runtime detection: check if RISC-V CPU supports Zbc extension
static bool isZbc() {
Expand All @@ -592,7 +597,7 @@ static bool isZbc() {
if (strstr(line, "isa") || strstr(line, "hart isa")) {
char* colon = strchr(line, ':');
if (colon) {
if (strstr(colon, "_zbc") || strstr(colon, "zbc")) {
if (strstr(colon, "_zbc")) {
supported = true;
break;
}
Expand All @@ -604,8 +609,210 @@ static bool isZbc() {
}();
return zbc_supported;
}

#if defined(__riscv_zvbc)
// Hardware-accelerated CRC32C using RISC-V Zvbc vector carry-less multiplication.
// Uses RVV vclmul/vclmulh to process 2 lanes per vector operation (VLEN=128).
// With VLEN=128, each vector register holds 2 x 64-bit elements.
// 4 lanes are processed using 2 vector register pairs per clmul step.
static uint32_t rv_crc32c_vclmul(uint32_t crc, const char* buf, size_t len) {
crc ^= 0xFFFFFFFF;

const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
size_t n = len;

if (n < 64) {
return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
}

// Align to 16-byte boundary
uintptr_t mis = (uintptr_t)p & 0xF;
if (mis) {
size_t pre = 16 - mis;
if (pre > n) pre = n;
crc = rv_crc32c_bitwise(crc, p, pre);
p += pre;
n -= pre;
if (n < 64) {
return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
}
}

// Set up RVV for 64-bit elements: vl = min(VLEN/64, 2) = 2 for VLEN=128
// If VLEN < 128, vl will be 1 and the vector path cannot be used; fall back.
size_t vl = __riscv_vsetvl_e64m1(2);
Comment on lines +641 to +643
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Added a vl < 2 check after __riscv_vsetvl_e64m1(2) that falls back to the bitwise path if VLEN < 128.

if (vl < 2) {
return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
}

// Construct fold constant vectors: {k1, k2} and {k3, k4}
// Each element gets the appropriate constant for its position:
// element 0 (lo half) uses k1/k3, element 1 (hi half) uses k2/k4
uint64_t k12_arr[2] = { crc32c_fold_const[0], crc32c_fold_const[1] };
uint64_t k34_arr[2] = { crc32c_fold_const[2], crc32c_fold_const[3] };
vuint64m1_t k12_vec = __riscv_vle64_v_u64m1(k12_arr, vl); // {k1, k2}
vuint64m1_t k34_vec = __riscv_vle64_v_u64m1(k34_arr, vl); // {k3, k4}

// Load first 64 bytes into 4 vector registers.
// Each vector = one 128-bit lane: {lo_64, hi_64}
// Use memcpy to avoid strict-aliasing violations when loading uint8_t* as uint64_t*
uint64_t lane1_buf[2], lane2_buf[2], lane3_buf[2], lane4_buf[2];
memcpy(lane1_buf, p + 0, 16);
memcpy(lane2_buf, p + 16, 16);
memcpy(lane3_buf, p + 32, 16);
memcpy(lane4_buf, p + 48, 16);
vuint64m1_t lane1 = __riscv_vle64_v_u64m1(lane1_buf, vl);
vuint64m1_t lane2 = __riscv_vle64_v_u64m1(lane2_buf, vl);
vuint64m1_t lane3 = __riscv_vle64_v_u64m1(lane3_buf, vl);
vuint64m1_t lane4 = __riscv_vle64_v_u64m1(lane4_buf, vl);

// XOR CRC into element 0 of first lane
uint64_t tmp[2];
__riscv_vse64_v_u64m1(tmp, lane1, vl);
tmp[0] ^= (uint64_t)crc;
lane1 = __riscv_vle64_v_u64m1(tmp, vl);

p += 64;
n -= 64;

// Main loop: fold 64 bytes per iteration using vector carry-less multiply.
//
// For each 128-bit lane {lo, hi}, the fold computes:
// new_lo = clmul(lo, k1) ^ clmul(hi, k2) ^ data_lo
// new_hi = clmulh(lo, k1) ^ clmulh(hi, k2) ^ data_hi
//
// With k12_vec = {k1, k2} and element-wise vclmul:
// vclmul(lane, k12_vec) = {clmul(lo, k1), clmul(hi, k2)} (lo halves of products)
// vclmulh(lane, k12_vec) = {clmulh(lo, k1), clmulh(hi, k2)} (hi halves of products)
//
// The 128-bit XOR of (lo*k1) and (hi*k2) decomposes element-wise:
// new_lo = clmul(lo,k1) ^ clmul(hi,k2) = vclmul[0] ^ vclmul[1]
// new_hi = clmulh(lo,k1) ^ clmulh(hi,k2) = vclmulh[0] ^ vclmulh[1]
//
// So we need to XOR across elements. With VLEN=128 (2 elements), we use
// scalar extraction for the cross-element XOR since there's no vector
// permute instruction for just 2 elements that's more efficient.
while (n >= 64) {
uint64_t d1_buf[2], d2_buf[2], d3_buf[2], d4_buf[2];
memcpy(d1_buf, p + 0, 16);
memcpy(d2_buf, p + 16, 16);
memcpy(d3_buf, p + 32, 16);
memcpy(d4_buf, p + 48, 16);
vuint64m1_t d1 = __riscv_vle64_v_u64m1(d1_buf, vl);
vuint64m1_t d2 = __riscv_vle64_v_u64m1(d2_buf, vl);
vuint64m1_t d3 = __riscv_vle64_v_u64m1(d3_buf, vl);
vuint64m1_t d4 = __riscv_vle64_v_u64m1(d4_buf, vl);

// Fold each lane using vector clmul with {k1, k2}
uint64_t lo_r[2], hi_r[2], d_r[2];

// Lane 1
__riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane1, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane1, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(d_r, d1, vl);
d_r[0] ^= lo_r[0] ^ lo_r[1];
d_r[1] ^= hi_r[0] ^ hi_r[1];
lane1 = __riscv_vle64_v_u64m1(d_r, vl);

// Lane 2
__riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane2, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane2, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(d_r, d2, vl);
d_r[0] ^= lo_r[0] ^ lo_r[1];
d_r[1] ^= hi_r[0] ^ hi_r[1];
lane2 = __riscv_vle64_v_u64m1(d_r, vl);

// Lane 3
__riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane3, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane3, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(d_r, d3, vl);
d_r[0] ^= lo_r[0] ^ lo_r[1];
d_r[1] ^= hi_r[0] ^ hi_r[1];
lane3 = __riscv_vle64_v_u64m1(d_r, vl);

// Lane 4
__riscv_vse64_v_u64m1(lo_r, __riscv_vclmul_vv_u64m1(lane4, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(hi_r, __riscv_vclmulh_vv_u64m1(lane4, k12_vec, vl), vl);
__riscv_vse64_v_u64m1(d_r, d4, vl);
d_r[0] ^= lo_r[0] ^ lo_r[1];
d_r[1] ^= hi_r[0] ^ hi_r[1];
lane4 = __riscv_vle64_v_u64m1(d_r, vl);

p += 64;
n -= 64;
}

// Reduce 4 lanes to 1 using {k3, k4}
// Same fold pattern: fold lane_a into lane_b
#define FOLD_INTO(dst, src) do { \
uint64_t _lo[2], _hi[2], _d[2]; \
__riscv_vse64_v_u64m1(_lo, __riscv_vclmul_vv_u64m1(src, k34_vec, vl), vl); \
__riscv_vse64_v_u64m1(_hi, __riscv_vclmulh_vv_u64m1(src, k34_vec, vl), vl); \
__riscv_vse64_v_u64m1(_d, dst, vl); \
_d[0] ^= _lo[0] ^ _lo[1]; \
_d[1] ^= _hi[0] ^ _hi[1]; \
dst = __riscv_vle64_v_u64m1(_d, vl); \
} while(0)

FOLD_INTO(lane2, lane1); // lane2 = fold(lane1) ^ lane2
FOLD_INTO(lane3, lane2); // lane3 = fold(lane2) ^ lane3
FOLD_INTO(lane4, lane3); // lane4 = fold(lane3) ^ lane4
#undef FOLD_INTO

// Extract final 128-bit state from vector register
uint64_t final_state[2];
__riscv_vse64_v_u64m1(final_state, lane4, vl);
uint64_t x0 = final_state[0];
uint64_t x1 = final_state[1];

// Barrett reduction: 128-bit -> 32-bit CRC (scalar)
uint64_t t4 = rv_clmul(x0, RV_CRC32C_CONST_1);
uint64_t t3 = rv_clmulh(x0, RV_CRC32C_CONST_1);
uint64_t t1 = x1 ^ t4;
t4 = t1 & RV_CRC32_MASK32;
t1 >>= 32;
uint64_t t0 = rv_clmul(t4, RV_CRC32C_CONST_0);
t3 = (t3 << 32) ^ t1 ^ t0;

t4 = t3 & RV_CRC32_MASK32;
t4 = rv_clmul(t4, RV_CRC32C_CONST_QUO);
t4 &= RV_CRC32_MASK32;
t4 = rv_clmul(t4, RV_CRC32C_CONST_POLY);
t4 ^= t3;

uint32_t c = (uint32_t)((t4 >> 32) & RV_CRC32_MASK32);
if (n) {
c = rv_crc32c_bitwise(c, p, n);
}
return c ^ 0xFFFFFFFF;
}

// Runtime detection: check if RISC-V CPU supports Zvbc extension
static bool isZvbc() {
static const bool zvbc_supported = []() {
FILE* f = fopen("/proc/cpuinfo", "r");
if (!f) return false;
bool supported = false;
char line[1024];
while (fgets(line, sizeof(line), f)) {
if (strstr(line, "isa") || strstr(line, "hart isa")) {
char* colon = strchr(line, ':');
if (colon) {
if (strstr(colon, "_zvbc")) {
supported = true;
Comment on lines +798 to +802
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Both isZbc() and isZvbc() now only match _zbc and _zvbc (with underscore prefix), removing the bare zbc/zvbc substring checks that could cause false positives.

break;
}
}
}
}
fclose(f);
return supported;
}();
return zvbc_supported;
}
#endif // __riscv && __riscv_xlen == 64
#endif // __riscv_zvbc

#endif // __riscv && __riscv_xlen == 64 && (__riscv_zbc || __riscv_zvbc)

// Detect if SSE4.2 or not.
#ifdef __SSE4_2__
Expand All @@ -629,10 +836,17 @@ static inline Function Choose_Extend() {
return (Function)ExtendImpl<FastCRC32Functor>;
}
#endif
#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc))
#if defined(__riscv_zvbc)
if (isZvbc()) {
return (Function)rv_crc32c_vclmul;
}
#endif
#if defined(__riscv_zbc)
if (isZbc()) {
return (Function)rv_crc32c_clmul;
}
#endif
#endif
return (Function)ExtendImpl<SlowCRC32Functor>;
}
Expand All @@ -641,8 +855,13 @@ bool IsFastCrc32Supported() {
#ifdef __SSE4_2__
if (isSSE42()) return true;
#endif
#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
#if defined(__riscv) && (__riscv_xlen == 64) && (defined(__riscv_zbc) || defined(__riscv_zvbc))
#if defined(__riscv_zvbc)
if (isZvbc()) return true;
#endif
#if defined(__riscv_zbc)
if (isZbc()) return true;
#endif
#endif
return false;
}
Expand Down
Loading