diff --git a/Cargo.lock b/Cargo.lock index 7c54561..533d5df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,12 +75,27 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + [[package]] name = "equivalent" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "float8" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2d1f04709a8ac06e8e8042875a3c466cc4832d3c1a18dbcb9dba3c6e83046bc" +dependencies = [ + "half", +] + [[package]] name = "getrandom" version = "0.3.4" @@ -95,6 +110,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + [[package]] name = "hashbrown" version = "0.17.0" @@ -138,6 +164,7 @@ name = "microfloat" version = "0.1.1" dependencies = [ "bytemuck", + "float8", "getrandom", "libm", "num-traits", diff --git a/Cargo.toml b/Cargo.toml index 86e77cc..721fde4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ num-traits = ["dep:num-traits"] rand_distr = ["dep:rand", "dep:rand_distr"] [dev-dependencies] +float8 = "0.7.0" proptest = { version = "1.11", default-features = false, features = ["std"] } [target.'cfg(target_arch = "wasm32")'.dependencies] diff --git a/README.md b/README.md index 14dfe8b..3969d48 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,15 @@ This crate provides [`no_std`](https://rust-embedded.github.io/book/intro/no-std 
See the [crate documentation](https://docs.rs/microfloat/) for more details. +## Related Crates + +### `float8` + +The [`float8`](https://crates.io/crates/float8) crate provides `F8E4M3` and `F8E5M2` types that are not fully compliant with the [OCP OFP8 specification](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-12-01-pdf-1). +They use NVIDIA's `__NV_SATFINITE` saturation mode ([`cuda_fp8.hpp`](https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp)). +In this mode, the `INFINITY` constants are `FP8_MAXNORM` overflow sentinels rather than true infinities. +In contrast, microfloat uses `__NV_NOSAT` semantics (IEEE NaN/Inf on overflow). + ### Optional Features - **`serde`** - Implement `Serialize` and `Deserialize` traits for the float diff --git a/src/lib.rs b/src/lib.rs index cbf2231..4644b8e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,6 +79,15 @@ //! //! Compatibility with `ml-dtypes` is tested by generated fixtures in `tests/fixtures/`. //! These fixtures validate conversions, classifications, arithmetic, and math methods. +//! +//! ## Related Crates +//! +//! ### `float8` +//! +//! The [`float8`](https://crates.io/crates/float8) crate provides `F8E4M3` and `F8E5M2` types that are not fully compliant with the [OCP OFP8 specification](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-12-01-pdf-1). +//! They use NVIDIA's `__NV_SATFINITE` saturation mode ([`cuda_fp8.hpp`](https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp)). +//! In this mode, the `INFINITY` constants are `FP8_MAXNORM` overflow sentinels rather than true infinities. +//! In contrast, microfloat uses `__NV_NOSAT` semantics (IEEE NaN/Inf on overflow).
mod bits; mod format; diff --git a/tests/float8_e4m3_errant.rs b/tests/float8_e4m3_errant.rs new file mode 100644 index 0000000..ed6b3d4 --- /dev/null +++ b/tests/float8_e4m3_errant.rs @@ -0,0 +1,107 @@ +#![allow(clippy::float_cmp)] +use float8::F8E4M3; +use microfloat::f8e4m3fn; +use std::num::FpCategory; + +// ============================================================================ +// float8 F8E4M3 behavior vs microfloat f8e4m3fn behavior +// +// float8::F8E4M3 uses NVIDIA __NV_SATFINITE saturation semantics, matching the behavior defined in https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp. +// +// In NVIDIA's model, __nv_saturation_t controls overflow behavior: +// - __NV_SATFINITE: overflow clamps to max finite value +// - __NV_NOSAT: overflow produces IEEE NaN or Inf +// +// float8 implements __NV_SATFINITE, where "INFINITY" is the overflow sentinel (FP8_MAXNORM), not a true infinity. +// microfloat implements __NV_NOSAT (IEEE / OCP compliant) instead. 
+// +// float8 also has a classify() bug: Zero and NaN are classified as Subnormal +// ============================================================================ + +// Packs sign/exponent/mantissa fields into the E4M3 bit layout S.EEEE.MMM (1 + 4 + 3 = 8 bits); exp and mant are masked to their field widths. +const fn e4m3_bits(sign: u8, exp: u8, mant: u8) -> u8 { + (sign << 7) | ((exp & 0xF) << 3) | (mant & 0x7) +} + +#[test] +fn float8_f8e4m3_classify_zero() { + // float8 (errant): classify() returns Subnormal for Zero + assert_eq!(F8E4M3::ZERO.classify(), FpCategory::Subnormal); + assert_eq!(F8E4M3::NEG_ZERO.classify(), FpCategory::Subnormal); + // microfloat (correct): classify() returns Zero + assert_eq!(f8e4m3fn::ZERO.classify(), FpCategory::Zero); + assert_eq!(f8e4m3fn::NEG_ZERO.classify(), FpCategory::Zero); +} + +#[test] +fn float8_f8e4m3_classify_nan() { + // float8 (errant): classify() returns Subnormal for NaN + assert_eq!(F8E4M3::NAN.classify(), FpCategory::Subnormal); + assert!(F8E4M3::NAN.is_nan()); + assert!(!F8E4M3::NAN.is_finite()); + assert!(!F8E4M3::NAN.is_normal()); + // Cause: float8's classify() checks !is_normal() before is_nan() + // microfloat (correct): classify() returns Nan + assert_eq!(f8e4m3fn::NAN.classify(), FpCategory::Nan); + assert!(f8e4m3fn::NAN.is_nan()); +} + +#[test] +fn float8_f8e4m3_satfinite_infinity_constant() { + // NVIDIA __NV_SATFINITE: FP8_MAXNORM = 0x7E (exp=15, mant=6), i.e. 448.0 + // This is the overflow sentinel (not a true infinity).
+ // See cuda_fp8.hpp: FP8_MAXNORM = 0x7E; res = FP8_MAXNORM; + // + // IEEE 754-style E4M3: INFINITY would be exp=15, mant=0, i.e. e4m3_bits(0, 15, 0) + // OCP E4M3: no infinity (N/A) + assert_eq!(F8E4M3::INFINITY.to_bits(), e4m3_bits(0b0, 0b1111, 0b110)); + assert_eq!(F8E4M3::INFINITY.to_f32(), 448.0); + assert!(F8E4M3::INFINITY.is_infinite()); + let inf = F8E4M3::from_bits(e4m3_bits(0b0, 0b1111, 0b110)); + assert!(inf.is_infinite()); + assert_eq!(inf.to_f32(), 448.0); + // microfloat (__NV_NOSAT / OCP): no infinity, 448.0 is finite + assert!(!f8e4m3fn::has_inf()); + let val = f8e4m3fn::from_bits(e4m3_bits(0b0, 0b1111, 0b110)); + assert_eq!(val.to_f32(), 448.0); + assert!(val.is_finite()); + assert!(!val.is_infinite()); +} + +#[test] +fn float8_f8e4m3_max_448() { + // float8 (__NV_SATFINITE): from_f32(448.0) encodes as FP8_MAXNORM = 0x7E, + // which is the same bit pattern as the "INFINITY" constant. + let val = F8E4M3::from_f32(448.0); + assert_eq!(val.to_bits(), e4m3_bits(0b0, 0b1111, 0b110)); + assert_eq!(val.to_bits(), F8E4M3::INFINITY.to_bits()); + assert!(val.is_infinite()); + assert_eq!(val.to_f32(), 448.0); + // microfloat (__NV_NOSAT / OCP compliant): MAX = 448.0 is finite + assert_eq!(f8e4m3fn::MAX.to_bits(), e4m3_bits(0b0, 0b1111, 0b110)); + assert_eq!(f8e4m3fn::MAX.to_f32(), 448.0); + assert!(f8e4m3fn::MAX.is_finite()); + assert!(!f8e4m3fn::MAX.is_infinite()); +} + +#[test] +fn float8_f8e4m3_overflow_to_nan() { + // microfloat (__NV_NOSAT / OCP E4M3): overflow maps to NaN (no infinity) + assert!(f8e4m3fn::from_f32(500.0).to_f32().is_nan()); + assert!(f8e4m3fn::from_f32(f32::INFINITY).to_f32().is_nan()); + assert!(f8e4m3fn::from_f32(f32::NEG_INFINITY).to_f32().is_nan()); +} + +#[test] +fn float8_f8e4m3_ieee_infinity_not_recognized() { + // IEEE 754-style E4M3 infinity would be exp=15, mant=0, i.e. e4m3_bits(0, 15, 0) + // Neither float8 (__NV_SATFINITE) nor microfloat (__NV_NOSAT) uses + // e4m3_bits(0, 15, 0) as infinity.
+ let ieee_inf = F8E4M3::from_bits(e4m3_bits(0b0, 0b1111, 0b000)); + assert!(!ieee_inf.is_infinite()); + assert_eq!(ieee_inf.to_f32(), 256.0); + // microfloat: e4m3_bits(0, 15, 0) decodes as the finite value 256.0, which is correct for OCP E4M3 + let mf_val = f8e4m3fn::from_bits(e4m3_bits(0b0, 0b1111, 0b000)); + assert_eq!(mf_val.to_f32(), 256.0); + assert!(mf_val.is_finite()); +} diff --git a/tests/float8_e5m2_errant.rs b/tests/float8_e5m2_errant.rs new file mode 100644 index 0000000..cee5458 --- /dev/null +++ b/tests/float8_e5m2_errant.rs @@ -0,0 +1,100 @@ +#![allow(clippy::float_cmp)] +use float8::F8E5M2; +use microfloat::f8e5m2; +use std::num::FpCategory; + +// ============================================================================ +// float8 F8E5M2 behavior vs microfloat f8e5m2 behavior +// +// float8::F8E5M2 uses NVIDIA __NV_SATFINITE saturation semantics, matching the behavior defined in https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp. +// +// In NVIDIA's model, __nv_saturation_t controls overflow behavior: +// - __NV_SATFINITE: overflow clamps to max finite value +// - __NV_NOSAT: overflow produces IEEE NaN or Inf +// +// float8 implements __NV_SATFINITE, where "INFINITY" is the overflow sentinel (FP8_MAXNORM), not a true infinity. +// microfloat implements __NV_NOSAT (IEEE compliant) instead. +// +// float8 also has a classify() bug: Zero and NaN are classified as Subnormal +// ============================================================================ + +// Packs sign/exponent/mantissa fields into the E5M2 bit layout S.EEEEE.MM (1 + 5 + 2 = 8 bits); exp and mant are masked to their field widths. +const fn e5m2_bits(sign: u8, exp: u8, mant: u8) -> u8 { + (sign << 7) | ((exp & 0x1F) << 2) | (mant & 0x3) +} + +#[test] +fn float8_f8e5m2_classify_zero() { + // float8 classify() bug (independent of saturation mode): + // classify() apparently checks !is_normal() before anything else, so Zero + // (exp=0) returns Subnormal instead of Zero.
+ assert_eq!(F8E5M2::ZERO.classify(), FpCategory::Subnormal); + assert_eq!(F8E5M2::NEG_ZERO.classify(), FpCategory::Subnormal); + // microfloat (correct): classify() returns Zero + assert_eq!(f8e5m2::ZERO.classify(), FpCategory::Zero); + assert_eq!(f8e5m2::NEG_ZERO.classify(), FpCategory::Zero); +} + +#[test] +fn float8_f8e5m2_classify_nan() { + // float8 classify() bug: classify() checks !is_normal() before is_nan(), + // so NaN (exp=31) returns Subnormal instead of Nan. + assert_eq!(F8E5M2::NAN.classify(), FpCategory::Subnormal); + assert!(F8E5M2::NAN.is_nan()); + assert!(!F8E5M2::NAN.is_finite()); + assert!(!F8E5M2::NAN.is_normal()); + // microfloat (correct): classify() returns Nan + assert_eq!(f8e5m2::NAN.classify(), FpCategory::Nan); + assert!(f8e5m2::NAN.is_nan()); +} + +#[test] +fn float8_f8e5m2_satfinite_infinity_constant() { + // NVIDIA __NV_SATFINITE: FP8_MAXNORM = 0x7B (exp=30, mant=3), i.e. 57344.0 + // This is the overflow sentinel (not a true infinity). + // See cuda_fp8.hpp: FP8_MAXNORM = 0x7B; res = FP8_MAXNORM; + // + // IEEE 754 E5M2: INFINITY is exp=31, mant=0, i.e. e5m2_bits(0, 31, 0) + assert_eq!(F8E5M2::INFINITY.to_bits(), e5m2_bits(0b0, 0b11110, 0b11)); + assert_eq!(F8E5M2::INFINITY.to_f32(), 57344.0); + assert!(F8E5M2::INFINITY.is_infinite()); + let inf = F8E5M2::from_bits(e5m2_bits(0b0, 0b11110, 0b11)); + assert!(inf.is_infinite()); + assert_eq!(inf.to_f32(), 57344.0); + // microfloat (__NV_NOSAT / IEEE): INFINITY at e5m2_bits(0, 31, 0) + assert!(f8e5m2::has_inf()); + assert_eq!(f8e5m2::INFINITY.to_bits(), e5m2_bits(0b0, 0b11111, 0b00)); + assert!(f8e5m2::INFINITY.is_infinite()); + assert!(f8e5m2::NEG_INFINITY.is_infinite()); +} + +#[test] +fn float8_f8e5m2_max_57344() { + // IEEE 754 E5M2: MAX is exp=30, mant=3, i.e. e5m2_bits(0, 30, 3) = 57344.0 + // float8 (__NV_SATFINITE): MAX is exp=30, mant=2, i.e. e5m2_bits(0, 30, 2) = 49152.0 + // float8's MAX constant is one mantissa step below the IEEE max. + // float8's INFINITY is actually IEEE E5M2's max finite value.
+ assert_eq!(F8E5M2::MAX.to_bits(), e5m2_bits(0b0, 0b11110, 0b10)); + assert_eq!(F8E5M2::MAX.to_f32(), 49152.0); + assert_eq!(F8E5M2::INFINITY.to_bits(), e5m2_bits(0b0, 0b11110, 0b11)); + assert_eq!(F8E5M2::INFINITY.to_f32(), 57344.0); + // microfloat (__NV_NOSAT / IEEE compliant): MAX = 57344.0 is finite + assert_eq!(f8e5m2::MAX.to_bits(), e5m2_bits(0b0, 0b11110, 0b11)); + assert_eq!(f8e5m2::MAX.to_f32(), 57344.0); + assert!(f8e5m2::MAX.is_finite()); + assert!(!f8e5m2::MAX.is_infinite()); +} + +#[test] +fn float8_f8e5m2_ieee_infinity_not_recognized() { + // IEEE 754 E5M2 infinity: exp=31, mant=0, i.e. e5m2_bits(0, 31, 0) + // float8 (__NV_SATFINITE) uses e5m2_bits(0, 30, 3) as its "INFINITY", not e5m2_bits(0, 31, 0) + let ieee_inf = F8E5M2::from_bits(e5m2_bits(0b0, 0b11111, 0b00)); + assert!(!ieee_inf.is_infinite()); + // Yet to_f32() still decodes it to f32::INFINITY (presumably via the f16 conversion), which is inconsistent + assert_eq!(ieee_inf.to_f32(), f32::INFINITY); + // microfloat (__NV_NOSAT / IEEE): IEEE infinity is recognized as infinite + let mf_inf = f8e5m2::from_bits(e5m2_bits(0b0, 0b11111, 0b00)); + assert!(mf_inf.is_infinite()); + assert_eq!(f8e5m2::INFINITY.to_bits(), e5m2_bits(0b0, 0b11111, 0b00)); +}