LDeakin · LDeakin · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -32,6 +32,7 @@ num-traits = ["dep:num-traits"]
 rand_distr = ["dep:rand", "dep:rand_distr"]
 
 [dev-dependencies]
+float8 = "0.7.0"
 proptest = { version = "1.11", default-features = false, features = ["std"] }
 
 [target.'cfg(target_arch = "wasm32")'.dependencies]

diff --git a/README.md b/README.md
@@ -50,6 +50,15 @@ This crate provides [`no_std`](https://rust-embedded.github.io/book/intro/no-std
 
 See the [crate documentation](https://docs.rs/microfloat/) for more details.
 
+## Related Crates
+
+### `float8`
+
+The [`float8`](https://crates.io/crates/float8) crate provides `F8E4M3` and `F8E5M2` types that are not fully [OCP](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-12-01-pdf-1) compliant.
+They use NVIDIA's `__NV_SATFINITE` saturation mode ([`cuda_fp8.hpp`](https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp)).
+In this mode, the `INFINITY` constants are `FP8_MAXNORM` overflow sentinels (the largest finite encoding) rather than true infinities.
+In contrast, `microfloat` uses `__NV_NOSAT` semantics: overflow produces NaN or infinity, as in IEEE 754.
+
 ### Optional Features
 
 - **`serde`** - Implement `Serialize` and `Deserialize` traits for the float

diff --git a/src/lib.rs b/src/lib.rs
@@ -79,6 +79,15 @@
 //!
 //! Compatibility with `ml-dtypes` is tested by generated fixtures in `tests/fixtures/`.
 //! These fixtures validate conversions, classifications, arithmetic, and math methods.
+//!
+//! ## Related Crates
+//!
+//! ### `float8`
+//!
+//! The [`float8`](https://crates.io/crates/float8) crate provides `F8E4M3` and `F8E5M2` types that are not fully [OCP](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-12-01-pdf-1) compliant.
+//! They use NVIDIA's `__NV_SATFINITE` saturation mode ([`cuda_fp8.hpp`](https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp)).
+//! In this mode, the `INFINITY` constants are `FP8_MAXNORM` overflow sentinels (the largest finite encoding) rather than true infinities.
+//! In contrast, `microfloat` uses `__NV_NOSAT` semantics: overflow produces NaN or infinity, as in IEEE 754.
 
 mod bits;
 mod format;

diff --git a/tests/float8_e4m3_errant.rs b/tests/float8_e4m3_errant.rs
@@ -0,0 +1,107 @@
+#![allow(clippy::float_cmp)]
+use float8::F8E4M3;
+use microfloat::f8e4m3fn;
+use std::num::FpCategory;
+
+// ============================================================================
+// float8 F8E4M3 behavior vs microfloat f8e4m3fn behavior
+//
+// float8::F8E4M3 uses NVIDIA __NV_SATFINITE saturation semantics, matching the behavior defined in https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp.
+//
+// In NVIDIA's model, __nv_saturation_t controls overflow behavior:
+//   - __NV_SATFINITE: overflow clamps to max finite value
+//   - __NV_NOSAT: overflow produces IEEE NaN or Inf
+//
+// float8 implements __NV_SATFINITE, where "INFINITY" is the overflow sentinel (FP8_MAXNORM), not a true infinity.
+// microfloat implements __NV_NOSAT (IEEE / OCP compliant) instead.
+//
+// float8 also has a classify() bug: zero and NaN are both reported as Subnormal.
+// ============================================================================
+
+/// Packs sign, exponent and mantissa into an E4M3 byte: S.EEEE.MMM (1 + 4 + 3 = 8 bits).
+const fn e4m3_bits(sign: u8, exp: u8, mant: u8) -> u8 {
+    (mant & 0b111) | ((exp & 0b1111) << 3) | (sign << 7)
+}
+
+#[test]
+fn float8_f8e4m3_classify_zero() {
+    let (pos, neg) = (F8E4M3::ZERO, F8E4M3::NEG_ZERO);
+    // float8 (errant): both signed zeros are classified as Subnormal.
+    assert_eq!((pos.classify(), neg.classify()), (FpCategory::Subnormal, FpCategory::Subnormal));
+    let (pos, neg) = (f8e4m3fn::ZERO, f8e4m3fn::NEG_ZERO);
+    // microfloat (correct): both signed zeros are classified as Zero.
+    assert_eq!((pos.classify(), neg.classify()), (FpCategory::Zero, FpCategory::Zero));
+}
+
+#[test]
+fn float8_f8e4m3_classify_nan() {
+    // float8 (errant): the predicates agree that NAN is a NaN, yet classify()
+    // reports Subnormal, reportedly because it tests !is_normal() before is_nan().
+    let nan = F8E4M3::NAN;
+    assert!(nan.is_nan() && !nan.is_finite() && !nan.is_normal());
+    assert_eq!(nan.classify(), FpCategory::Subnormal);
+    // microfloat (correct): classify() reports Nan, consistent with is_nan().
+    let nan = f8e4m3fn::NAN;
+    assert!(nan.is_nan());
+    assert_eq!(nan.classify(), FpCategory::Nan);
+}
+
+#[test]
+fn float8_f8e4m3_satfinite_infinity_constant() {
+    // 0x7E (exp=15, mant=6) encodes 448.0. Under NVIDIA __NV_SATFINITE this is
+    // FP8_MAXNORM, the overflow sentinel (cuda_fp8.hpp: `res = FP8_MAXNORM;`),
+    // not a true infinity. For reference: an IEEE 754-style E4M3 would place
+    // infinity at exp=15, mant=0, while OCP E4M3 defines no infinity at all.
+    let maxnorm = e4m3_bits(0b0, 0b1111, 0b110);
+    // float8 (__NV_SATFINITE): INFINITY is that sentinel and reports is_infinite().
+    let constant = F8E4M3::INFINITY;
+    assert_eq!(constant.to_bits(), maxnorm);
+    assert_eq!(constant.to_f32(), 448.0);
+    assert!(constant.is_infinite());
+    let decoded = F8E4M3::from_bits(maxnorm);
+    assert!(decoded.is_infinite());
+    assert_eq!(decoded.to_f32(), 448.0);
+    // microfloat (__NV_NOSAT / OCP): no infinity exists, so the same bits are finite.
+    assert!(!f8e4m3fn::has_inf());
+    let finite = f8e4m3fn::from_bits(maxnorm);
+    assert_eq!(finite.to_f32(), 448.0);
+    assert!(finite.is_finite() && !finite.is_infinite());
+}
+
+#[test]
+fn float8_f8e4m3_max_448() {
+    let max_bits = e4m3_bits(0b0, 0b1111, 0b110);
+    // float8 (__NV_SATFINITE): 448.0 converts to FP8_MAXNORM = 0x7E, the very
+    // encoding used for the "INFINITY" constant, so it reports is_infinite().
+    let converted = F8E4M3::from_f32(448.0);
+    assert_eq!(converted.to_bits(), max_bits);
+    assert_eq!(F8E4M3::INFINITY.to_bits(), converted.to_bits());
+    assert!(converted.is_infinite());
+    assert_eq!(converted.to_f32(), 448.0);
+    // microfloat (__NV_NOSAT / OCP compliant): MAX is the same bits, and finite.
+    assert_eq!(f8e4m3fn::MAX.to_bits(), max_bits);
+    assert_eq!(f8e4m3fn::MAX.to_f32(), 448.0);
+    assert!(f8e4m3fn::MAX.is_finite() && !f8e4m3fn::MAX.is_infinite());
+}
+
+#[test]
+fn float8_f8e4m3_overflow_to_nan() {
+    // microfloat (__NV_NOSAT / OCP E4M3): no infinity exists, so overflow maps to NaN.
+    for too_large in [500.0, f32::INFINITY, f32::NEG_INFINITY] {
+        assert!(f8e4m3fn::from_f32(too_large).to_f32().is_nan());
+    }
+}
+
+#[test]
+fn float8_f8e4m3_ieee_infinity_not_recognized() {
+    // An IEEE 754-style E4M3 would encode infinity as exp=15, mant=0. Neither
+    // float8 (__NV_SATFINITE) nor microfloat (__NV_NOSAT) treats that pattern as
+    // infinity; both decode it as the ordinary finite value 256.0.
+    let pattern = e4m3_bits(0b0, 0b1111, 0b000);
+    let via_float8 = F8E4M3::from_bits(pattern);
+    assert_eq!(via_float8.to_f32(), 256.0);
+    assert!(!via_float8.is_infinite());
+    let via_microfloat = f8e4m3fn::from_bits(pattern);
+    assert_eq!(via_microfloat.to_f32(), 256.0);
+    assert!(via_microfloat.is_finite());
+}
diff --git a/tests/float8_e5m2_errant.rs b/tests/float8_e5m2_errant.rs
@@ -0,0 +1,100 @@
+#![allow(clippy::float_cmp)]
+use float8::F8E5M2;
+use microfloat::f8e5m2;
+use std::num::FpCategory;
+
+// ============================================================================
+// float8 F8E5M2 behavior vs microfloat f8e5m2 behavior
+//
+// float8::F8E5M2 uses NVIDIA __NV_SATFINITE saturation semantics, matching the behavior defined in https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp.
+//
+// In NVIDIA's model, __nv_saturation_t controls overflow behavior:
+//   - __NV_SATFINITE: overflow clamps to max finite value
+//   - __NV_NOSAT: overflow produces IEEE NaN or Inf
+//
+// float8 implements __NV_SATFINITE, where "INFINITY" is the overflow sentinel (FP8_MAXNORM), not a true infinity.
+// microfloat implements __NV_NOSAT (IEEE compliant) instead.
+//
+// float8 also has a classify() bug: zero and NaN are both reported as Subnormal.
+// ============================================================================
+
+/// Packs sign, exponent and mantissa into an E5M2 byte: S.EEEEE.MM (1 + 5 + 2 = 8 bits).
+const fn e5m2_bits(sign: u8, exp: u8, mant: u8) -> u8 {
+    (mant & 0b11) | ((exp & 0b1_1111) << 2) | (sign << 7)
+}
+
+#[test]
+fn float8_f8e5m2_classify_zero() {
+    // float8 classify() bug (unrelated to saturation): zeros report Subnormal, not Zero.
+    for zero in [F8E5M2::ZERO, F8E5M2::NEG_ZERO] {
+        assert_eq!(zero.classify(), FpCategory::Subnormal);
+    }
+    // microfloat (correct): both signed zeros are classified as Zero.
+    for zero in [f8e5m2::ZERO, f8e5m2::NEG_ZERO] {
+        assert_eq!(zero.classify(), FpCategory::Zero);
+    }
+}
+
+#[test]
+fn float8_f8e5m2_classify_nan() {
+    // float8 classify() bug: NaN (exp=31) is reported as Subnormal instead of Nan,
+    // reportedly because classify() tests !is_normal() before is_nan().
+    let nan = F8E5M2::NAN;
+    assert!(nan.is_nan() && !nan.is_finite() && !nan.is_normal());
+    assert_eq!(nan.classify(), FpCategory::Subnormal);
+    // microfloat (correct): classify() reports Nan, consistent with is_nan().
+    let nan = f8e5m2::NAN;
+    assert!(nan.is_nan());
+    assert_eq!(nan.classify(), FpCategory::Nan);
+}
+
+#[test]
+fn float8_f8e5m2_satfinite_infinity_constant() {
+    // 0x7B (exp=30, mant=3) encodes 57344.0. Under NVIDIA __NV_SATFINITE this is
+    // FP8_MAXNORM, the overflow sentinel (cuda_fp8.hpp: `res = FP8_MAXNORM;`),
+    // not a true infinity. IEEE 754 E5M2 places infinity at exp=31, mant=0.
+    let maxnorm = e5m2_bits(0b0, 0b11110, 0b11);
+    let ieee_inf = e5m2_bits(0b0, 0b11111, 0b00);
+    // float8 (__NV_SATFINITE): INFINITY is the sentinel and reports is_infinite().
+    assert_eq!(F8E5M2::INFINITY.to_bits(), maxnorm);
+    assert_eq!(F8E5M2::INFINITY.to_f32(), 57344.0);
+    assert!(F8E5M2::INFINITY.is_infinite());
+    let decoded = F8E5M2::from_bits(maxnorm);
+    assert!(decoded.is_infinite());
+    assert_eq!(decoded.to_f32(), 57344.0);
+    // microfloat (__NV_NOSAT / IEEE): INFINITY uses the IEEE encoding.
+    assert!(f8e5m2::has_inf());
+    assert_eq!(f8e5m2::INFINITY.to_bits(), ieee_inf);
+    assert!(f8e5m2::INFINITY.is_infinite() && f8e5m2::NEG_INFINITY.is_infinite());
+}
+
+#[test]
+fn float8_f8e5m2_max_57344() {
+    // IEEE 754 E5M2's largest finite value is exp=30, mant=3 (57344.0). float8
+    // spends that encoding on INFINITY, so its MAX sits one mantissa step lower
+    // at exp=30, mant=2 (49152.0).
+    let ieee_max = e5m2_bits(0b0, 0b11110, 0b11);
+    let (float8_max, float8_inf) = (F8E5M2::MAX, F8E5M2::INFINITY);
+    assert_eq!(float8_max.to_bits(), e5m2_bits(0b0, 0b11110, 0b10));
+    assert_eq!(float8_max.to_f32(), 49152.0);
+    assert_eq!(float8_inf.to_bits(), ieee_max);
+    assert_eq!(float8_inf.to_f32(), 57344.0);
+    // microfloat (__NV_NOSAT / IEEE compliant): MAX is the finite value 57344.0.
+    assert_eq!(f8e5m2::MAX.to_bits(), ieee_max);
+    assert_eq!(f8e5m2::MAX.to_f32(), 57344.0);
+    assert!(f8e5m2::MAX.is_finite() && !f8e5m2::MAX.is_infinite());
+}
+
+#[test]
+fn float8_f8e5m2_ieee_infinity_not_recognized() {
+    // IEEE 754 E5M2 encodes infinity as exp=31, mant=0.
+    let ieee_inf_bits = e5m2_bits(0b0, 0b11111, 0b00);
+    // float8 (__NV_SATFINITE) keeps infinity at exp=30, mant=3, so is_infinite() is false
+    // here, yet to_f32() yields f32::INFINITY (apparently via f16 conversion): inconsistent.
+    let via_float8 = F8E5M2::from_bits(ieee_inf_bits);
+    assert!(!via_float8.is_infinite());
+    assert_eq!(via_float8.to_f32(), f32::INFINITY);
+    // microfloat (__NV_NOSAT / IEEE): the IEEE pattern is recognized as infinite.
+    assert!(f8e5m2::from_bits(ieee_inf_bits).is_infinite());
+    assert_eq!(f8e5m2::INFINITY.to_bits(), ieee_inf_bits);
+}