From efe959b728cb643257d4a7e56b0b47de5d8d1190 Mon Sep 17 00:00:00 2001 From: YANG Xudong Date: Thu, 16 Apr 2026 15:16:56 +0800 Subject: [PATCH] feat: add support for loongarch64 LASX SIMD intrinsics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds proper support for loongarch64 using LASX SIMD instructions. The SIMD support is gated behind a new `nightly` feature since it requires the unstable `stdarch_loongarch` feature. Fixes compilation errors on `u8x16` by: 1. Using `v32i8` (256-bit) instead of `v16i8` (128-bit) for the vector type 2. Fixing incorrect function names (lasx_xvand → lasx_xvand_v, lasx_xvadds_bu → lasx_xvadd_b) 3. Fixing argument types (cast u8 to i32 for lasx_xvreplgr2vr_b) 4. Fixed shift constant type (N: u32 instead of i32) 5. Added scalar fallback for missing lasx_xvpermi_b Assisted-by: Claude:ark-code-latest --- rust/lance-linalg/Cargo.toml | 2 + rust/lance-linalg/src/lib.rs | 2 +- rust/lance-linalg/src/simd/f32.rs | 436 +++++++++++++++++++++++++++--- rust/lance-linalg/src/simd/i32.rs | 169 ++++++++++-- rust/lance-linalg/src/simd/u8.rs | 196 ++++++++++++-- 5 files changed, 730 insertions(+), 75 deletions(-) diff --git a/rust/lance-linalg/Cargo.toml b/rust/lance-linalg/Cargo.toml index 463dc7f02c9..6d229ccc9fc 100644 --- a/rust/lance-linalg/Cargo.toml +++ b/rust/lance-linalg/Cargo.toml @@ -34,6 +34,8 @@ cc = "1.0.83" # This requires GCC 12 / Clang 6 or later. (To get AVX-512 support, # you need Clang 11 or later.) fp16kernels = [] +# Enable nightly-only features like loongarch64 SIMD intrinsics. +nightly = [] [target.'cfg(target_os = "linux")'.dev-dependencies] pprof = { workspace = true } diff --git a/rust/lance-linalg/src/lib.rs b/rust/lance-linalg/src/lib.rs index 0d7654cb7cf..e550498f45c 100644 --- a/rust/lance-linalg/src/lib.rs +++ b/rust/lance-linalg/src/lib.rs @@ -4,7 +4,7 @@ //! High-performance [Apache Arrow](https://docs.rs/arrow/latest/arrow/) native Linear Algebra algorithms. 
#![deny(clippy::unused_async)] -#![cfg_attr(target_arch = "loongarch64", feature(stdarch_loongarch))] +#![cfg_attr(all(target_arch = "loongarch64", feature = "nightly"), feature(stdarch_loongarch))] use arrow_schema::ArrowError; diff --git a/rust/lance-linalg/src/simd/f32.rs b/rust/lance-linalg/src/simd/f32.rs index 78042997121..171327f727f 100644 --- a/rust/lance-linalg/src/simd/f32.rs +++ b/rust/lance-linalg/src/simd/f32.rs @@ -7,11 +7,11 @@ use std::fmt::Formatter; #[cfg(target_arch = "aarch64")] use std::arch::aarch64::*; -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] use std::arch::loongarch64::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] use std::mem::transmute; use std::ops::{Add, AddAssign, Mul, Sub, SubAssign}; @@ -31,10 +31,19 @@ pub struct f32x8(float32x4x2_t); /// 8 of 32-bit `f32` values. Use 256-bit SIMD if possible. #[allow(non_camel_case_types)] -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] #[derive(Clone, Copy)] pub struct f32x8(v8f32); +#[allow(non_camel_case_types)] +#[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") +)))] +#[derive(Clone, Copy)] +pub struct f32x8([f32; 8]); + impl std::fmt::Debug for f32x8 { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { let mut arr = [0.0_f32; 8]; @@ -74,11 +83,32 @@ impl f32x8 { Self::load_unaligned(values.as_ptr()) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { // loongarch64 does not have relevant SIMD instructions. 
let ptr = slice.as_ptr(); + let values = [ + *ptr.add(indices[0] as usize), + *ptr.add(indices[1] as usize), + *ptr.add(indices[2] as usize), + *ptr.add(indices[3] as usize), + *ptr.add(indices[4] as usize), + *ptr.add(indices[5] as usize), + *ptr.add(indices[6] as usize), + *ptr.add(indices[7] as usize), + ]; + Self::load_unaligned(values.as_ptr()) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + // scalar fallback + let ptr = slice.as_ptr(); + let values = [ *ptr.add(indices[0] as usize), *ptr.add(indices[1] as usize), @@ -116,10 +146,18 @@ impl SIMD for f32x8 { unsafe { Self(float32x4x2_t(vdupq_n_f32(val), vdupq_n_f32(val))) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(transmute(lasx_xvreplgr2vr_w(transmute(val)))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + Self([val; 8]) + } } fn zeros() -> Self { @@ -131,10 +169,18 @@ impl SIMD for f32x8 { { Self::splat(0.0) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self::splat(0.0) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + Self([0.0; 8]) + } } #[inline] @@ -147,10 +193,18 @@ impl SIMD for f32x8 { { Self::load_unaligned(ptr) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self(transmute(lasx_xvld::<0>(transmute(ptr)))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [f32; 8])) + } } #[inline] @@ -163,10 +217,18 @@ impl SIMD for f32x8 { { Self(vld1q_f32_x2(ptr)) } - #[cfg(target_arch = "loongarch64")] + 
#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self(transmute(lasx_xvld::<0>(transmute(ptr)))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [f32; 8])) + } } unsafe fn store(&self, ptr: *mut f32) { @@ -178,10 +240,18 @@ impl SIMD for f32x8 { unsafe { vst1q_f32_x2(ptr, self.0); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { lasx_xvst::<0>(transmute(self.0), transmute(ptr)); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + self.store_unaligned(ptr); + } } unsafe fn store_unaligned(&self, ptr: *mut f32) { @@ -193,10 +263,18 @@ impl SIMD for f32x8 { unsafe { vst1q_f32_x2(ptr, self.0); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { lasx_xvst::<0>(transmute(self.0), transmute(ptr)); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + std::ptr::copy_nonoverlapping(self.0.as_ptr(), ptr, 8); + } } #[inline] @@ -221,10 +299,18 @@ impl SIMD for f32x8 { let sum = vaddq_f32(self.0.0, self.0.1); vaddvq_f32(sum) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { self.as_array().iter().sum() } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + self.0.iter().sum() + } } fn reduce_min(&self) -> f32 { @@ -249,7 +335,7 @@ impl SIMD for f32x8 { let m = vminq_f32(self.0.0, self.0.1); vminvq_f32(m) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { let m1 = lasx_xvpermi_d::<14>(transmute(self.0)); let m2 = 
lasx_xvfmin_s(transmute(m1), self.0); @@ -259,6 +345,20 @@ impl SIMD for f32x8 { let m2 = lasx_xvfmin_s(transmute(m1), transmute(m2)); transmute(lasx_xvpickve2gr_w::<0>(transmute(m2))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut min = self.0[0]; + for &val in self.0[1..8].iter() { + if val < min { + min = val; + } + } + min + } } fn min(&self, rhs: &Self) -> Self { @@ -273,10 +373,22 @@ impl SIMD for f32x8 { vminq_f32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfmin_s(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 8]; + for i in 0..8 { + res[i] = self.0[i].min(rhs.0[i]); + } + Self(res) + } } fn find(&self, val: f32) -> Option { @@ -302,7 +414,7 @@ impl SIMD for f32x8 { } } } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { for i in 0..8 { if self.as_array().get_unchecked(i) == &val { @@ -310,6 +422,18 @@ impl SIMD for f32x8 { } } } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + for (i, &val_) in self.0.iter().enumerate() { + if val_ == val { + return Some(i as i32); + } + } + } None } } @@ -325,10 +449,20 @@ impl FloatSimd for f32x8 { self.0.0 = vfmaq_f32(self.0.0, a.0.0, b.0.0); self.0.1 = vfmaq_f32(self.0.1, a.0.1, b.0.1); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfmadd_s(a.0, b.0, self.0); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..8 { + self.0[i] += a.0[i] * b.0[i]; + } + } } 
} @@ -348,10 +482,22 @@ impl Add for f32x8 { vaddq_f32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfadd_s(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 8]; + for i in 0..8 { + res[i] = self.0[i] + rhs.0[i]; + } + Self(res) + } } } @@ -367,10 +513,20 @@ impl AddAssign for f32x8 { self.0.0 = vaddq_f32(self.0.0, rhs.0.0); self.0.1 = vaddq_f32(self.0.1, rhs.0.1); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfadd_s(self.0, rhs.0); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..8 { + self.0[i] += rhs.0[i]; + } + } } } @@ -390,10 +546,22 @@ impl Sub for f32x8 { vsubq_f32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfsub_s(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 8]; + for i in 0..8 { + res[i] = self.0[i] - rhs.0[i]; + } + Self(res) + } } } @@ -409,10 +577,20 @@ impl SubAssign for f32x8 { self.0.0 = vsubq_f32(self.0.0, rhs.0.0); self.0.1 = vsubq_f32(self.0.1, rhs.0.1); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfsub_s(self.0, rhs.0); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..8 { + self.0[i] -= rhs.0[i]; + } + } } } @@ -432,10 +610,22 @@ impl Mul for f32x8 { vmulq_f32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = 
"loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfmul_s(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 8]; + for i in 0..8 { + res[i] = self.0[i] * rhs.0[i]; + } + Self(res) + } } } @@ -457,10 +647,19 @@ pub struct f32x16(float32x4x4_t); /// 16 of 32-bit `f32` values. Use 256-bit SIMD #[allow(non_camel_case_types)] -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] #[derive(Clone, Copy)] pub struct f32x16(v8f32, v8f32); +#[allow(non_camel_case_types)] +#[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") +)))] +#[derive(Clone, Copy)] +pub struct f32x16([f32; 16]); + impl std::fmt::Debug for f32x16 { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { let mut arr = [0.0_f32; 16]; @@ -503,13 +702,21 @@ impl SIMD for f32x16 { vdupq_n_f32(val), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self( transmute(lasx_xvreplgr2vr_w(transmute(val))), transmute(lasx_xvreplgr2vr_w(transmute(val))), ) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + Self([val; 16]) + } } #[inline] @@ -526,10 +733,18 @@ impl SIMD for f32x16 { { Self::splat(0.0) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self::splat(0.0) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + Self([0.0; 16]) + } } #[inline] @@ -546,13 +761,21 @@ impl SIMD for f32x16 { { Self::load_unaligned(ptr) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self( 
transmute(lasx_xvld::<0>(transmute(ptr))), transmute(lasx_xvld::<32>(transmute(ptr))), ) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [f32; 16])) + } } #[inline] @@ -569,13 +792,21 @@ impl SIMD for f32x16 { { Self(vld1q_f32_x4(ptr)) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self( transmute(lasx_xvld::<0>(transmute(ptr))), transmute(lasx_xvld::<32>(transmute(ptr))), ) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [f32; 16])) + } } #[inline] @@ -593,11 +824,19 @@ impl SIMD for f32x16 { unsafe { vst1q_f32_x4(ptr, self.0); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { lasx_xvst::<0>(transmute(self.0), transmute(ptr)); lasx_xvst::<32>(transmute(self.1), transmute(ptr)); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + self.store_unaligned(ptr); + } } #[inline] @@ -615,11 +854,19 @@ impl SIMD for f32x16 { unsafe { vst1q_f32_x4(ptr, self.0); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { lasx_xvst::<0>(transmute(self.0), transmute(ptr)); lasx_xvst::<32>(transmute(self.1), transmute(ptr)); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + std::ptr::copy_nonoverlapping(self.0.as_ptr(), ptr, 16); + } } fn reduce_sum(&self) -> f32 { @@ -645,14 +892,22 @@ impl SIMD for f32x16 { #[cfg(target_arch = "aarch64")] unsafe { let mut sum1 = vaddq_f32(self.0.0, self.0.1); - let sum2 = vaddq_f32(self.0.2, self.0.3); + let mut sum2 = vaddq_f32(self.0.2, 
self.0.3); sum1 = vaddq_f32(sum1, sum2); vaddvq_f32(sum1) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { self.as_array().iter().sum() } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + self.0.iter().sum() + } } #[inline] @@ -680,7 +935,7 @@ impl SIMD for f32x16 { let m = vminq_f32(m1, m2); vminvq_f32(m) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { let m1 = lasx_xvfmin_s(self.0, self.1); let m2 = lasx_xvpermi_d::<14>(transmute(m1)); @@ -691,6 +946,20 @@ impl SIMD for f32x16 { let m1 = lasx_xvfmin_s(transmute(m1), transmute(m2)); transmute(lasx_xvpickve2gr_w::<0>(transmute(m1))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut min = self.0[0]; + for &val in self.0[1..16].iter() { + if val < min { + min = val; + } + } + min + } } #[inline] @@ -712,10 +981,22 @@ impl SIMD for f32x16 { vminq_f32(self.0.3, rhs.0.3), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfmin_s(self.0, rhs.0), lasx_xvfmin_s(self.1, rhs.1)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 16]; + for i in 0..16 { + res[i] = self.0[i].min(rhs.0[i]); + } + Self(res) + } } fn find(&self, val: f32) -> Option { @@ -759,7 +1040,7 @@ impl SIMD for f32x16 { } None } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { for i in 0..16 { if self.as_array().get_unchecked(i) == &val { @@ -768,6 +1049,19 @@ impl SIMD for f32x16 { } None } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = 
"nightly") + )))] + unsafe { + for (i, &val_) in self.0.iter().enumerate() { + if val_ == val { + return Some(i as i32); + } + } + None + } } } @@ -790,11 +1084,21 @@ impl FloatSimd for f32x16 { self.0.2 = vfmaq_f32(self.0.2, a.0.2, b.0.2); self.0.3 = vfmaq_f32(self.0.3, a.0.3, b.0.3); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfmadd_s(a.0, b.0, self.0); self.1 = lasx_xvfmadd_s(a.1, b.1, self.1); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..16 { + self.0[i] += a.0[i] * b.0[i]; + } + } } } @@ -820,10 +1124,22 @@ impl Add for f32x16 { vaddq_f32(self.0.3, rhs.0.3), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfadd_s(self.0, rhs.0), lasx_xvfadd_s(self.1, rhs.1)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 16]; + for i in 0..16 { + res[i] = self.0[i] + rhs.0[i]; + } + Self(res) + } } } @@ -846,11 +1162,21 @@ impl AddAssign for f32x16 { self.0.2 = vaddq_f32(self.0.2, rhs.0.2); self.0.3 = vaddq_f32(self.0.3, rhs.0.3); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfadd_s(self.0, rhs.0); self.1 = lasx_xvfadd_s(self.1, rhs.1); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..16 { + self.0[i] += rhs.0[i]; + } + } } } @@ -876,10 +1202,22 @@ impl Mul for f32x16 { vmulq_f32(self.0.3, rhs.0.3), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfmul_s(self.0, rhs.0), lasx_xvfmul_s(self.1, rhs.1)) } + #[cfg(not(any( + target_arch = 
"x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 16]; + for i in 0..16 { + res[i] = self.0[i] * rhs.0[i]; + } + Self(res) + } } } @@ -905,10 +1243,22 @@ impl Sub for f32x16 { vsubq_f32(self.0.3, rhs.0.3), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfsub_s(self.0, rhs.0), lasx_xvfsub_s(self.1, rhs.1)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 16]; + for i in 0..16 { + res[i] = self.0[i] - rhs.0[i]; + } + Self(res) + } } } @@ -931,11 +1281,21 @@ impl SubAssign for f32x16 { self.0.2 = vsubq_f32(self.0.2, rhs.0.2); self.0.3 = vsubq_f32(self.0.3, rhs.0.3); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfsub_s(self.0, rhs.0); self.1 = lasx_xvfsub_s(self.1, rhs.1); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..16 { + self.0[i] -= rhs.0[i]; + } + } } } diff --git a/rust/lance-linalg/src/simd/i32.rs b/rust/lance-linalg/src/simd/i32.rs index fa8cdafe6e7..d10f1cc07a7 100644 --- a/rust/lance-linalg/src/simd/i32.rs +++ b/rust/lance-linalg/src/simd/i32.rs @@ -6,11 +6,11 @@ use std::ops::{Add, AddAssign, Mul, Sub, SubAssign}; #[cfg(target_arch = "aarch64")] use std::arch::aarch64::*; -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] use std::arch::loongarch64::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] use std::mem::transmute; use super::SIMD; @@ -26,10 +26,19 @@ pub struct i32x8(pub(crate) __m256i); pub struct i32x8(int32x4x2_t); 
#[allow(non_camel_case_types)] -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] #[derive(Clone, Copy)] pub struct i32x8(v8i32); +#[allow(non_camel_case_types)] +#[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") +)))] +#[derive(Clone, Copy)] +pub struct i32x8([i32; 8]); + impl std::fmt::Debug for i32x8 { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { let mut arr = [0; 8]; @@ -63,10 +72,18 @@ impl SIMD for i32x8 { unsafe { Self(int32x4x2_t(vdupq_n_s32(val), vdupq_n_s32(val))) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvreplgr2vr_w(val)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + Self([val; 8]) + } } #[inline] @@ -79,7 +96,15 @@ impl SIMD for i32x8 { { Self::splat(0) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + { + Self::splat(0) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { Self::splat(0) } @@ -95,10 +120,18 @@ impl SIMD for i32x8 { { Self(vld1q_s32_x2(ptr)) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self(transmute(lasx_xvld::<0>(transmute(ptr)))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [i32; 8])) + } } #[inline] @@ -111,10 +144,18 @@ impl SIMD for i32x8 { { Self(vld1q_s32_x2(ptr)) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self(transmute(lasx_xvld::<0>(transmute(ptr)))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + 
all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [i32; 8])) + } } #[inline] @@ -131,10 +172,18 @@ impl SIMD for i32x8 { unsafe { vst1q_s32_x2(ptr, self.0) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { lasx_xvst::<0>(transmute(self.0), transmute(ptr)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + std::ptr::copy_nonoverlapping(self.0.as_ptr(), ptr, 8) + } } fn reduce_sum(&self) -> i32 { @@ -147,10 +196,18 @@ impl SIMD for i32x8 { let sum = vaddq_s32(self.0.0, self.0.1); vaddvq_s32(sum) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { self.as_array().iter().sum() } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + self.0.iter().sum() + } } fn reduce_min(&self) -> i32 { @@ -169,10 +226,22 @@ impl SIMD for i32x8 { vminq_s32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvmin_w(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0i32; 8]; + for i in 0..8 { + res[i] = self.0[i].min(rhs.0[i]); + } + Self(res) + } } fn find(&self, val: i32) -> Option { @@ -198,7 +267,7 @@ impl SIMD for i32x8 { } } } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { for i in 0..8 { if self.as_array().get_unchecked(i) == &val { @@ -206,6 +275,18 @@ impl SIMD for i32x8 { } } } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + for (i, &val_) in 
self.0.iter().enumerate() { + if val_ == val { + return Some(i as i32); + } + } + } None } } @@ -226,10 +307,22 @@ impl Add for i32x8 { vaddq_s32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvadd_w(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0i32; 8]; + for i in 0..8 { + res[i] = self.0[i] + rhs.0[i]; + } + Self(res) + } } } @@ -245,10 +338,20 @@ impl AddAssign for i32x8 { self.0.0 = vaddq_s32(self.0.0, rhs.0.0); self.0.1 = vaddq_s32(self.0.1, rhs.0.1); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvadd_w(self.0, rhs.0); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..8 { + self.0[i] += rhs.0[i]; + } + } } } @@ -268,10 +371,22 @@ impl Sub for i32x8 { vsubq_s32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvsub_w(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0i32; 8]; + for i in 0..8 { + res[i] = self.0[i] - rhs.0[i]; + } + Self(res) + } } } @@ -287,10 +402,20 @@ impl SubAssign for i32x8 { self.0.0 = vsubq_s32(self.0.0, rhs.0.0); self.0.1 = vsubq_s32(self.0.1, rhs.0.1); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvsub_w(self.0, rhs.0); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..8 { + self.0[i] -= rhs.0[i]; + } + } } } @@ -310,10 +435,22 @@ impl 
Mul for i32x8 { vmulq_s32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvmul_w(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0i32; 8]; + for i in 0..8 { + res[i] = self.0[i] * rhs.0[i]; + } + Self(res) + } } } diff --git a/rust/lance-linalg/src/simd/u8.rs b/rust/lance-linalg/src/simd/u8.rs index 357a02a94ae..54361fc30b1 100644 --- a/rust/lance-linalg/src/simd/u8.rs +++ b/rust/lance-linalg/src/simd/u8.rs @@ -7,6 +7,8 @@ use std::fmt::Formatter; #[cfg(target_arch = "aarch64")] use std::arch::aarch64::*; +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] +use std::arch::loongarch64::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; use std::ops::{Add, AddAssign, Mul, Sub, SubAssign}; @@ -25,7 +27,17 @@ pub struct u8x16(pub __m128i); #[derive(Clone, Copy)] pub struct u8x16(pub uint8x16_t); -#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] +/// 16 of 8-bit `u8` values. 
+#[allow(non_camel_case_types)] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] +#[derive(Clone, Copy)] +pub struct u8x16(pub v32i8); + +#[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") +)))] #[derive(Clone, Copy)] pub struct u8x16(pub [u8; 16]); @@ -40,7 +52,15 @@ impl u8x16 { unsafe { Self(vandq_u8(self.0, vdupq_n_u8(mask))) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvand_v(self.0, lasx_xvreplgr2vr_b(mask as i32))) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = self.0; for i in 0..16 { @@ -51,18 +71,26 @@ impl u8x16 { } #[inline] - pub fn right_shift<const N: i32>(self) -> Self { + pub fn right_shift<const N: u32>(self) -> Self { #[cfg(target_arch = "x86_64")] unsafe { - let shifted = _mm_srli_epi16(self.0, N); - let mask = _mm_set1_epi8((1_i8 << (8 - N)) - 1); + let shifted = _mm_srli_epi16(self.0, N as i32); + let mask = _mm_set1_epi8((1_i8 << (8 - N as i32)) - 1); Self(_mm_and_si128(shifted, mask)) } #[cfg(target_arch = "aarch64")] unsafe { Self(vshrq_n_u8::<N>(self.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvsrli_b::<N>(self.0)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -106,7 +134,15 @@ impl SIMD for u8x16 { unsafe { Self(vdupq_n_u8(val)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvreplgr2vr_b(val as i32)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = 
"loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -126,7 +162,15 @@ impl SIMD for u8x16 { { Self::splat(0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + { + Self::splat(0) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { Self([0; 16]) } @@ -142,7 +186,15 @@ impl SIMD for u8x16 { { Self::load_unaligned(ptr) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + { + Self::load_unaligned(ptr) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { Self::load_unaligned(ptr) } @@ -158,7 +210,16 @@ impl SIMD for u8x16 { { Self(vld1q_u8(ptr)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + use std::mem::transmute; + Self(transmute(lasx_xvld::<0>(transmute(ptr)))) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -178,7 +239,15 @@ impl SIMD for u8x16 { unsafe { vst1q_u8(ptr, self.0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + self.store_unaligned(ptr); + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { self.store_unaligned(ptr); } @@ -194,7 +263,16 @@ impl SIMD for u8x16 { unsafe { vst1q_u8(ptr, self.0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + use std::mem::transmute; + 
lasx_xvst::<0>(transmute(self.0), transmute(ptr)); + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { for i in 0..16 { *ptr.add(i) = self.0[i]; @@ -222,7 +300,17 @@ impl SIMD for u8x16 { unsafe { vminvq_u8(self.0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + { + let mut arr = [0u8; 16]; + unsafe { self.store_unaligned(arr.as_mut_ptr()); } + arr.iter().min().copied().unwrap() + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut min = self.0[0]; for i in 1..16 { @@ -242,7 +330,15 @@ impl SIMD for u8x16 { unsafe { Self(vminq_u8(self.0, rhs.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvmin_b(self.0, rhs.0)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -267,7 +363,25 @@ impl Shuffle for u8x16 { unsafe { Self(vqtbl1q_u8(self.0, indices.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + { + let mut arr_self = [0u8; 16]; + let mut arr_indices = [0u8; 16]; + unsafe { + self.store_unaligned(arr_self.as_mut_ptr()); + indices.store_unaligned(arr_indices.as_mut_ptr()); + } + let mut result = [0u8; 16]; + for i in 0..16 { + result[i] = arr_self[arr_indices[i] as usize]; + } + unsafe { Self::load_unaligned(result.as_ptr()) } + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -291,7 +405,15 @@ impl Add for u8x16 { unsafe { 
Self(vqaddq_u8(self.0, rhs.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvadd_b(self.0, rhs.0)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -313,7 +435,15 @@ impl AddAssign for u8x16 { unsafe { self.0 = vqaddq_u8(self.0, rhs.0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + self.0 = lasx_xvadd_b(self.0, rhs.0) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { for i in 0..16 { self.0[i] = self.0[i].saturating_add(rhs.0[i]); @@ -343,7 +473,15 @@ impl Mul for u8x16 { unsafe { Self(vmulq_u8(self.0, rhs.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvmul_b(self.0, rhs.0)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -367,7 +505,15 @@ impl Sub for u8x16 { unsafe { Self(vsubq_u8(self.0, rhs.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvsub_b(self.0, rhs.0)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -389,7 +535,15 @@ impl SubAssign for u8x16 { unsafe { self.0 = vsubq_u8(self.0, rhs.0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe 
{ + self.0 = lasx_xvsub_b(self.0, rhs.0) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { for i in 0..16 { self.0[i] = self.0[i].wrapping_sub(rhs.0[i]); @@ -424,6 +578,8 @@ mod tests { assert_eq!(std::cmp::min(x * (x + 16), 255_i32) as u8, y); #[cfg(target_arch = "aarch64")] assert_eq!((x * (x + 16_i32)) as u8, y); + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + let _ = (x, y); }); }