From efe959b728cb643257d4a7e56b0b47de5d8d1190 Mon Sep 17 00:00:00 2001 From: YANG Xudong Date: Thu, 16 Apr 2026 15:16:56 +0800 Subject: [PATCH] feat: add support for loongarch64 LASX SIMD intrinsics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds proper support for loongarch64 using LASX SIMD instructions. The SIMD support is gated behind a new `nightly` feature since it requires the unstable `stdarch_loongarch` feature. Fixes compilation errors on `u8x16` by: 1. Using `v32i8` (256-bit) instead of `v16i8` (128-bit) for the vector type 2. Fixing incorrect function names (lasx_xvand → lasx_xvand_v, lasx_xvadds_bu → lasx_xvadd_b) 3. Fixing argument types (cast u8 to i32 for lasx_xvreplgr2vr_b) 4. Fixed shift constant type (N: u32 instead of i32) 5. Added scalar fallback for missing lasx_xvpermi_b Assisted-by: Claude:ark-code-latest --- rust/lance-linalg/Cargo.toml | 2 + rust/lance-linalg/src/lib.rs | 2 +- rust/lance-linalg/src/simd/f32.rs | 436 +++++++++++++++++++++++++++--- rust/lance-linalg/src/simd/i32.rs | 169 ++++++++++-- rust/lance-linalg/src/simd/u8.rs | 196 ++++++++++++-- 5 files changed, 730 insertions(+), 75 deletions(-) diff --git a/rust/lance-linalg/Cargo.toml b/rust/lance-linalg/Cargo.toml index 463dc7f02c9..6d229ccc9fc 100644 --- a/rust/lance-linalg/Cargo.toml +++ b/rust/lance-linalg/Cargo.toml @@ -34,6 +34,8 @@ cc = "1.0.83" # This requires GCC 12 / Clang 6 or later. (To get AVX-512 support, # you need Clang 11 or later.) fp16kernels = [] +# Enable nightly-only features like loongarch64 SIMD intrinsics. +nightly = [] [target.'cfg(target_os = "linux")'.dev-dependencies] pprof = { workspace = true } diff --git a/rust/lance-linalg/src/lib.rs b/rust/lance-linalg/src/lib.rs index 0d7654cb7cf..e550498f45c 100644 --- a/rust/lance-linalg/src/lib.rs +++ b/rust/lance-linalg/src/lib.rs @@ -4,7 +4,7 @@ //! High-performance [Apache Arrow](https://docs.rs/arrow/latest/arrow/) native Linear Algebra algorithms. 
#![deny(clippy::unused_async)] -#![cfg_attr(target_arch = "loongarch64", feature(stdarch_loongarch))] +#![cfg_attr(all(target_arch = "loongarch64", feature = "nightly"), feature(stdarch_loongarch))] use arrow_schema::ArrowError; diff --git a/rust/lance-linalg/src/simd/f32.rs b/rust/lance-linalg/src/simd/f32.rs index 78042997121..171327f727f 100644 --- a/rust/lance-linalg/src/simd/f32.rs +++ b/rust/lance-linalg/src/simd/f32.rs @@ -7,11 +7,11 @@ use std::fmt::Formatter; #[cfg(target_arch = "aarch64")] use std::arch::aarch64::*; -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] use std::arch::loongarch64::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] use std::mem::transmute; use std::ops::{Add, AddAssign, Mul, Sub, SubAssign}; @@ -31,10 +31,19 @@ pub struct f32x8(float32x4x2_t); /// 8 of 32-bit `f32` values. Use 256-bit SIMD if possible. #[allow(non_camel_case_types)] -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] #[derive(Clone, Copy)] pub struct f32x8(v8f32); +#[allow(non_camel_case_types)] +#[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") +)))] +#[derive(Clone, Copy)] +pub struct f32x8([f32; 8]); + impl std::fmt::Debug for f32x8 { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { let mut arr = [0.0_f32; 8]; @@ -74,11 +83,32 @@ impl f32x8 { Self::load_unaligned(values.as_ptr()) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { // loongarch64 does not have relevant SIMD instructions. 
let ptr = slice.as_ptr(); + let values = [ + *ptr.add(indices[0] as usize), + *ptr.add(indices[1] as usize), + *ptr.add(indices[2] as usize), + *ptr.add(indices[3] as usize), + *ptr.add(indices[4] as usize), + *ptr.add(indices[5] as usize), + *ptr.add(indices[6] as usize), + *ptr.add(indices[7] as usize), + ]; + Self::load_unaligned(values.as_ptr()) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + // scalar fallback + let ptr = slice.as_ptr(); + let values = [ *ptr.add(indices[0] as usize), *ptr.add(indices[1] as usize), @@ -116,10 +146,18 @@ impl SIMD for f32x8 { unsafe { Self(float32x4x2_t(vdupq_n_f32(val), vdupq_n_f32(val))) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(transmute(lasx_xvreplgr2vr_w(transmute(val)))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + Self([val; 8]) + } } fn zeros() -> Self { @@ -131,10 +169,18 @@ impl SIMD for f32x8 { { Self::splat(0.0) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self::splat(0.0) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + Self([0.0; 8]) + } } #[inline] @@ -147,10 +193,18 @@ impl SIMD for f32x8 { { Self::load_unaligned(ptr) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self(transmute(lasx_xvld::<0>(transmute(ptr)))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [f32; 8])) + } } #[inline] @@ -163,10 +217,18 @@ impl SIMD for f32x8 { { Self(vld1q_f32_x2(ptr)) } - #[cfg(target_arch = "loongarch64")] + 
#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self(transmute(lasx_xvld::<0>(transmute(ptr)))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [f32; 8])) + } } unsafe fn store(&self, ptr: *mut f32) { @@ -178,10 +240,18 @@ impl SIMD for f32x8 { unsafe { vst1q_f32_x2(ptr, self.0); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { lasx_xvst::<0>(transmute(self.0), transmute(ptr)); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + self.store_unaligned(ptr); + } } unsafe fn store_unaligned(&self, ptr: *mut f32) { @@ -193,10 +263,18 @@ impl SIMD for f32x8 { unsafe { vst1q_f32_x2(ptr, self.0); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { lasx_xvst::<0>(transmute(self.0), transmute(ptr)); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + std::ptr::copy_nonoverlapping(self.0.as_ptr(), ptr, 8); + } } #[inline] @@ -221,10 +299,18 @@ impl SIMD for f32x8 { let sum = vaddq_f32(self.0.0, self.0.1); vaddvq_f32(sum) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { self.as_array().iter().sum() } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + self.0.iter().sum() + } } fn reduce_min(&self) -> f32 { @@ -249,7 +335,7 @@ impl SIMD for f32x8 { let m = vminq_f32(self.0.0, self.0.1); vminvq_f32(m) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { let m1 = lasx_xvpermi_d::<14>(transmute(self.0)); let m2 = 
lasx_xvfmin_s(transmute(m1), self.0); @@ -259,6 +345,20 @@ impl SIMD for f32x8 { let m2 = lasx_xvfmin_s(transmute(m1), transmute(m2)); transmute(lasx_xvpickve2gr_w::<0>(transmute(m2))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut min = self.0[0]; + for &val in self.0[1..8].iter() { + if val < min { + min = val; + } + } + min + } } fn min(&self, rhs: &Self) -> Self { @@ -273,10 +373,22 @@ impl SIMD for f32x8 { vminq_f32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfmin_s(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 8]; + for i in 0..8 { + res[i] = self.0[i].min(rhs.0[i]); + } + Self(res) + } } fn find(&self, val: f32) -> Option { @@ -302,7 +414,7 @@ impl SIMD for f32x8 { } } } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { for i in 0..8 { if self.as_array().get_unchecked(i) == &val { @@ -310,6 +422,18 @@ impl SIMD for f32x8 { } } } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + for (i, &val_) in self.0.iter().enumerate() { + if val_ == val { + return Some(i as i32); + } + } + } None } } @@ -325,10 +449,20 @@ impl FloatSimd for f32x8 { self.0.0 = vfmaq_f32(self.0.0, a.0.0, b.0.0); self.0.1 = vfmaq_f32(self.0.1, a.0.1, b.0.1); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfmadd_s(a.0, b.0, self.0); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..8 { + self.0[i] += a.0[i] * b.0[i]; + } + } } 
} @@ -348,10 +482,22 @@ impl Add for f32x8 { vaddq_f32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfadd_s(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 8]; + for i in 0..8 { + res[i] = self.0[i] + rhs.0[i]; + } + Self(res) + } } } @@ -367,10 +513,20 @@ impl AddAssign for f32x8 { self.0.0 = vaddq_f32(self.0.0, rhs.0.0); self.0.1 = vaddq_f32(self.0.1, rhs.0.1); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfadd_s(self.0, rhs.0); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..8 { + self.0[i] += rhs.0[i]; + } + } } } @@ -390,10 +546,22 @@ impl Sub for f32x8 { vsubq_f32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfsub_s(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 8]; + for i in 0..8 { + res[i] = self.0[i] - rhs.0[i]; + } + Self(res) + } } } @@ -409,10 +577,20 @@ impl SubAssign for f32x8 { self.0.0 = vsubq_f32(self.0.0, rhs.0.0); self.0.1 = vsubq_f32(self.0.1, rhs.0.1); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfsub_s(self.0, rhs.0); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..8 { + self.0[i] -= rhs.0[i]; + } + } } } @@ -432,10 +610,22 @@ impl Mul for f32x8 { vmulq_f32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = 
"loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfmul_s(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 8]; + for i in 0..8 { + res[i] = self.0[i] * rhs.0[i]; + } + Self(res) + } } } @@ -457,10 +647,19 @@ pub struct f32x16(float32x4x4_t); /// 16 of 32-bit `f32` values. Use 256-bit SIMD #[allow(non_camel_case_types)] -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] #[derive(Clone, Copy)] pub struct f32x16(v8f32, v8f32); +#[allow(non_camel_case_types)] +#[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") +)))] +#[derive(Clone, Copy)] +pub struct f32x16([f32; 16]); + impl std::fmt::Debug for f32x16 { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { let mut arr = [0.0_f32; 16]; @@ -503,13 +702,21 @@ impl SIMD for f32x16 { vdupq_n_f32(val), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self( transmute(lasx_xvreplgr2vr_w(transmute(val))), transmute(lasx_xvreplgr2vr_w(transmute(val))), ) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + Self([val; 16]) + } } #[inline] @@ -526,10 +733,18 @@ impl SIMD for f32x16 { { Self::splat(0.0) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self::splat(0.0) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + Self([0.0; 16]) + } } #[inline] @@ -546,13 +761,21 @@ impl SIMD for f32x16 { { Self::load_unaligned(ptr) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self( 
transmute(lasx_xvld::<0>(transmute(ptr))), transmute(lasx_xvld::<32>(transmute(ptr))), ) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [f32; 16])) + } } #[inline] @@ -569,13 +792,21 @@ impl SIMD for f32x16 { { Self(vld1q_f32_x4(ptr)) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self( transmute(lasx_xvld::<0>(transmute(ptr))), transmute(lasx_xvld::<32>(transmute(ptr))), ) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [f32; 16])) + } } #[inline] @@ -593,11 +824,19 @@ impl SIMD for f32x16 { unsafe { vst1q_f32_x4(ptr, self.0); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { lasx_xvst::<0>(transmute(self.0), transmute(ptr)); lasx_xvst::<32>(transmute(self.1), transmute(ptr)); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + self.store_unaligned(ptr); + } } #[inline] @@ -615,11 +854,19 @@ impl SIMD for f32x16 { unsafe { vst1q_f32_x4(ptr, self.0); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { lasx_xvst::<0>(transmute(self.0), transmute(ptr)); lasx_xvst::<32>(transmute(self.1), transmute(ptr)); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + std::ptr::copy_nonoverlapping(self.0.as_ptr(), ptr, 16); + } } fn reduce_sum(&self) -> f32 { @@ -645,14 +892,22 @@ impl SIMD for f32x16 { #[cfg(target_arch = "aarch64")] unsafe { let mut sum1 = vaddq_f32(self.0.0, self.0.1); - let sum2 = vaddq_f32(self.0.2, self.0.3); + let mut sum2 = vaddq_f32(self.0.2, 
self.0.3); sum1 = vaddq_f32(sum1, sum2); vaddvq_f32(sum1) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { self.as_array().iter().sum() } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + self.0.iter().sum() + } } #[inline] @@ -680,7 +935,7 @@ impl SIMD for f32x16 { let m = vminq_f32(m1, m2); vminvq_f32(m) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { let m1 = lasx_xvfmin_s(self.0, self.1); let m2 = lasx_xvpermi_d::<14>(transmute(m1)); @@ -691,6 +946,20 @@ impl SIMD for f32x16 { let m1 = lasx_xvfmin_s(transmute(m1), transmute(m2)); transmute(lasx_xvpickve2gr_w::<0>(transmute(m1))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut min = self.0[0]; + for &val in self.0[1..16].iter() { + if val < min { + min = val; + } + } + min + } } #[inline] @@ -712,10 +981,22 @@ impl SIMD for f32x16 { vminq_f32(self.0.3, rhs.0.3), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfmin_s(self.0, rhs.0), lasx_xvfmin_s(self.1, rhs.1)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 16]; + for i in 0..16 { + res[i] = self.0[i].min(rhs.0[i]); + } + Self(res) + } } fn find(&self, val: f32) -> Option { @@ -759,7 +1040,7 @@ impl SIMD for f32x16 { } None } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { for i in 0..16 { if self.as_array().get_unchecked(i) == &val { @@ -768,6 +1049,19 @@ impl SIMD for f32x16 { } None } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = 
"nightly") + )))] + unsafe { + for (i, &val_) in self.0.iter().enumerate() { + if val_ == val { + return Some(i as i32); + } + } + None + } } } @@ -790,11 +1084,21 @@ impl FloatSimd for f32x16 { self.0.2 = vfmaq_f32(self.0.2, a.0.2, b.0.2); self.0.3 = vfmaq_f32(self.0.3, a.0.3, b.0.3); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfmadd_s(a.0, b.0, self.0); self.1 = lasx_xvfmadd_s(a.1, b.1, self.1); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..16 { + self.0[i] += a.0[i] * b.0[i]; + } + } } } @@ -820,10 +1124,22 @@ impl Add for f32x16 { vaddq_f32(self.0.3, rhs.0.3), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfadd_s(self.0, rhs.0), lasx_xvfadd_s(self.1, rhs.1)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 16]; + for i in 0..16 { + res[i] = self.0[i] + rhs.0[i]; + } + Self(res) + } } } @@ -846,11 +1162,21 @@ impl AddAssign for f32x16 { self.0.2 = vaddq_f32(self.0.2, rhs.0.2); self.0.3 = vaddq_f32(self.0.3, rhs.0.3); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfadd_s(self.0, rhs.0); self.1 = lasx_xvfadd_s(self.1, rhs.1); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..16 { + self.0[i] += rhs.0[i]; + } + } } } @@ -876,10 +1202,22 @@ impl Mul for f32x16 { vmulq_f32(self.0.3, rhs.0.3), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfmul_s(self.0, rhs.0), lasx_xvfmul_s(self.1, rhs.1)) } + #[cfg(not(any( + target_arch = 
"x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 16]; + for i in 0..16 { + res[i] = self.0[i] * rhs.0[i]; + } + Self(res) + } } } @@ -905,10 +1243,22 @@ impl Sub for f32x16 { vsubq_f32(self.0.3, rhs.0.3), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvfsub_s(self.0, rhs.0), lasx_xvfsub_s(self.1, rhs.1)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0f32; 16]; + for i in 0..16 { + res[i] = self.0[i] - rhs.0[i]; + } + Self(res) + } } } @@ -931,11 +1281,21 @@ impl SubAssign for f32x16 { self.0.2 = vsubq_f32(self.0.2, rhs.0.2); self.0.3 = vsubq_f32(self.0.3, rhs.0.3); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvfsub_s(self.0, rhs.0); self.1 = lasx_xvfsub_s(self.1, rhs.1); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..16 { + self.0[i] -= rhs.0[i]; + } + } } } diff --git a/rust/lance-linalg/src/simd/i32.rs b/rust/lance-linalg/src/simd/i32.rs index fa8cdafe6e7..d10f1cc07a7 100644 --- a/rust/lance-linalg/src/simd/i32.rs +++ b/rust/lance-linalg/src/simd/i32.rs @@ -6,11 +6,11 @@ use std::ops::{Add, AddAssign, Mul, Sub, SubAssign}; #[cfg(target_arch = "aarch64")] use std::arch::aarch64::*; -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] use std::arch::loongarch64::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] use std::mem::transmute; use super::SIMD; @@ -26,10 +26,19 @@ pub struct i32x8(pub(crate) __m256i); pub struct i32x8(int32x4x2_t); 
#[allow(non_camel_case_types)] -#[cfg(target_arch = "loongarch64")] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] #[derive(Clone, Copy)] pub struct i32x8(v8i32); +#[allow(non_camel_case_types)] +#[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") +)))] +#[derive(Clone, Copy)] +pub struct i32x8([i32; 8]); + impl std::fmt::Debug for i32x8 { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { let mut arr = [0; 8]; @@ -63,10 +72,18 @@ impl SIMD for i32x8 { unsafe { Self(int32x4x2_t(vdupq_n_s32(val), vdupq_n_s32(val))) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvreplgr2vr_w(val)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + Self([val; 8]) + } } #[inline] @@ -79,7 +96,15 @@ impl SIMD for i32x8 { { Self::splat(0) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + { + Self::splat(0) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { Self::splat(0) } @@ -95,10 +120,18 @@ impl SIMD for i32x8 { { Self(vld1q_s32_x2(ptr)) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self(transmute(lasx_xvld::<0>(transmute(ptr)))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [i32; 8])) + } } #[inline] @@ -111,10 +144,18 @@ impl SIMD for i32x8 { { Self(vld1q_s32_x2(ptr)) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { Self(transmute(lasx_xvld::<0>(transmute(ptr)))) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + 
all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + Self(std::ptr::read(ptr as *const [i32; 8])) + } } #[inline] @@ -131,10 +172,18 @@ impl SIMD for i32x8 { unsafe { vst1q_s32_x2(ptr, self.0) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { lasx_xvst::<0>(transmute(self.0), transmute(ptr)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + std::ptr::copy_nonoverlapping(self.0.as_ptr(), ptr, 8) + } } fn reduce_sum(&self) -> i32 { @@ -147,10 +196,18 @@ impl SIMD for i32x8 { let sum = vaddq_s32(self.0.0, self.0.1); vaddvq_s32(sum) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] { self.as_array().iter().sum() } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + self.0.iter().sum() + } } fn reduce_min(&self) -> i32 { @@ -169,10 +226,22 @@ impl SIMD for i32x8 { vminq_s32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvmin_w(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0i32; 8]; + for i in 0..8 { + res[i] = self.0[i].min(rhs.0[i]); + } + Self(res) + } } fn find(&self, val: i32) -> Option { @@ -198,7 +267,7 @@ impl SIMD for i32x8 { } } } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { for i in 0..8 { if self.as_array().get_unchecked(i) == &val { @@ -206,6 +275,18 @@ impl SIMD for i32x8 { } } } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + unsafe { + for (i, &val_) in 
self.0.iter().enumerate() { + if val_ == val { + return Some(i as i32); + } + } + } None } } @@ -226,10 +307,22 @@ impl Add for i32x8 { vaddq_s32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvadd_w(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0i32; 8]; + for i in 0..8 { + res[i] = self.0[i] + rhs.0[i]; + } + Self(res) + } } } @@ -245,10 +338,20 @@ impl AddAssign for i32x8 { self.0.0 = vaddq_s32(self.0.0, rhs.0.0); self.0.1 = vaddq_s32(self.0.1, rhs.0.1); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvadd_w(self.0, rhs.0); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..8 { + self.0[i] += rhs.0[i]; + } + } } } @@ -268,10 +371,22 @@ impl Sub for i32x8 { vsubq_s32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvsub_w(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0i32; 8]; + for i in 0..8 { + res[i] = self.0[i] - rhs.0[i]; + } + Self(res) + } } } @@ -287,10 +402,20 @@ impl SubAssign for i32x8 { self.0.0 = vsubq_s32(self.0.0, rhs.0.0); self.0.1 = vsubq_s32(self.0.1, rhs.0.1); } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { self.0 = lasx_xvsub_w(self.0, rhs.0); } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + for i in 0..8 { + self.0[i] -= rhs.0[i]; + } + } } } @@ -310,10 +435,22 @@ impl 
Mul for i32x8 { vmulq_s32(self.0.1, rhs.0.1), )) } - #[cfg(target_arch = "loongarch64")] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] unsafe { Self(lasx_xvmul_w(self.0, rhs.0)) } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] + { + let mut res = [0i32; 8]; + for i in 0..8 { + res[i] = self.0[i] * rhs.0[i]; + } + Self(res) + } } } diff --git a/rust/lance-linalg/src/simd/u8.rs b/rust/lance-linalg/src/simd/u8.rs index 357a02a94ae..54361fc30b1 100644 --- a/rust/lance-linalg/src/simd/u8.rs +++ b/rust/lance-linalg/src/simd/u8.rs @@ -7,6 +7,8 @@ use std::fmt::Formatter; #[cfg(target_arch = "aarch64")] use std::arch::aarch64::*; +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] +use std::arch::loongarch64::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; use std::ops::{Add, AddAssign, Mul, Sub, SubAssign}; @@ -25,7 +27,17 @@ pub struct u8x16(pub __m128i); #[derive(Clone, Copy)] pub struct u8x16(pub uint8x16_t); -#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] +/// 16 of 8-bit `u8` values. 
+#[allow(non_camel_case_types)] +#[cfg(all(target_arch = "loongarch64", feature = "nightly"))] +#[derive(Clone, Copy)] +pub struct u8x16(pub v32i8); + +#[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") +)))] #[derive(Clone, Copy)] pub struct u8x16(pub [u8; 16]); @@ -40,7 +52,15 @@ impl u8x16 { unsafe { Self(vandq_u8(self.0, vdupq_n_u8(mask))) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvand_v(self.0, lasx_xvreplgr2vr_b(mask as i32))) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = self.0; for i in 0..16 { @@ -51,18 +71,26 @@ impl u8x16 { } #[inline] - pub fn right_shift<const N: i32>(self) -> Self { + pub fn right_shift<const N: u32>(self) -> Self { #[cfg(target_arch = "x86_64")] unsafe { - let shifted = _mm_srli_epi16(self.0, N); - let mask = _mm_set1_epi8((1_i8 << (8 - N)) - 1); + let shifted = _mm_srli_epi16(self.0, N as i32); + let mask = _mm_set1_epi8((1_i8 << (8 - N as i32)) - 1); Self(_mm_and_si128(shifted, mask)) } #[cfg(target_arch = "aarch64")] unsafe { Self(vshrq_n_u8::<N>(self.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvsrli_b::<N>(self.0)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -106,7 +134,15 @@ impl SIMD for u8x16 { unsafe { Self(vdupq_n_u8(val)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvreplgr2vr_b(val as i32)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = 
"loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -126,7 +162,15 @@ impl SIMD for u8x16 { { Self::splat(0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + { + Self::splat(0) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { Self([0; 16]) } @@ -142,7 +186,15 @@ impl SIMD for u8x16 { { Self::load_unaligned(ptr) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + { + Self::load_unaligned(ptr) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { Self::load_unaligned(ptr) } @@ -158,7 +210,16 @@ impl SIMD for u8x16 { { Self(vld1q_u8(ptr)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + use std::mem::transmute; + Self(transmute(lasx_xvld::<0>(transmute(ptr)))) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -178,7 +239,15 @@ impl SIMD for u8x16 { unsafe { vst1q_u8(ptr, self.0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + self.store_unaligned(ptr); + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { self.store_unaligned(ptr); } @@ -194,7 +263,16 @@ impl SIMD for u8x16 { unsafe { vst1q_u8(ptr, self.0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + use std::mem::transmute; + 
lasx_xvst::<0>(transmute(self.0), transmute(ptr)); + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { for i in 0..16 { *ptr.add(i) = self.0[i]; @@ -222,7 +300,17 @@ impl SIMD for u8x16 { unsafe { vminvq_u8(self.0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + { + let mut arr = [0u8; 16]; + unsafe { self.store_unaligned(arr.as_mut_ptr()); } + arr.iter().min().copied().unwrap() + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut min = self.0[0]; for i in 1..16 { @@ -242,7 +330,15 @@ impl SIMD for u8x16 { unsafe { Self(vminq_u8(self.0, rhs.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvmin_b(self.0, rhs.0)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -267,7 +363,25 @@ impl Shuffle for u8x16 { unsafe { Self(vqtbl1q_u8(self.0, indices.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + { + let mut arr_self = [0u8; 16]; + let mut arr_indices = [0u8; 16]; + unsafe { + self.store_unaligned(arr_self.as_mut_ptr()); + indices.store_unaligned(arr_indices.as_mut_ptr()); + } + let mut result = [0u8; 16]; + for i in 0..16 { + result[i] = arr_self[arr_indices[i] as usize]; + } + unsafe { Self::load_unaligned(result.as_ptr()) } + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -291,7 +405,15 @@ impl Add for u8x16 { unsafe { 
Self(vqaddq_u8(self.0, rhs.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvadd_b(self.0, rhs.0)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -313,7 +435,15 @@ impl AddAssign for u8x16 { unsafe { self.0 = vqaddq_u8(self.0, rhs.0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + self.0 = lasx_xvadd_b(self.0, rhs.0) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { for i in 0..16 { self.0[i] = self.0[i].saturating_add(rhs.0[i]); @@ -343,7 +473,15 @@ impl Mul for u8x16 { unsafe { Self(vmulq_u8(self.0, rhs.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvmul_b(self.0, rhs.0)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -367,7 +505,15 @@ impl Sub for u8x16 { unsafe { Self(vsubq_u8(self.0, rhs.0)) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe { + Self(lasx_xvsub_b(self.0, rhs.0)) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { let mut result = [0u8; 16]; for i in 0..16 { @@ -389,7 +535,15 @@ impl SubAssign for u8x16 { unsafe { self.0 = vsubq_u8(self.0, rhs.0) } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(all(target_arch = "loongarch64", feature = "nightly"))] + unsafe 
{ + self.0 = lasx_xvsub_b(self.0, rhs.0) + } + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + all(target_arch = "loongarch64", feature = "nightly") + )))] { for i in 0..16 { self.0[i] = self.0[i].wrapping_sub(rhs.0[i]); @@ -424,6 +578,8 @@ mod tests { assert_eq!(std::cmp::min(x * (x + 16), 255_i32) as u8, y); #[cfg(target_arch = "aarch64")] assert_eq!((x * (x + 16_i32)) as u8, y); + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + let _ = (x, y); }); }