Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ num-traits = ["dep:num-traits"]
rand_distr = ["dep:rand", "dep:rand_distr"]

[dev-dependencies]
float8 = "0.7.0"
proptest = { version = "1.11", default-features = false, features = ["std"] }

[target.'cfg(target_arch = "wasm32")'.dependencies]
Expand Down
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@ This crate provides [`no_std`](https://rust-embedded.github.io/book/intro/no-std

See the [crate documentation](https://docs.rs/microfloat/) for more details.

## Related Crates

### `float8`

The [`float8`](https://crates.io/crates/float8) crate provides `F8E4M3` and `F8E5M2` types that are not fully [OCP](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-12-01-pdf-1) compliant.
They use NVIDIA's `__NV_SATFINITE` saturation mode ([`cuda_fp8.hpp`](https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp)).
In this mode, the `INFINITY` constants are `FP8_MAXNORM` overflow sentinels rather than true infinities.
In contrast, microfloat uses `__NV_NOSAT` semantics (IEEE NaN/Inf on overflow).

### Optional Features

- **`serde`** - Implement `Serialize` and `Deserialize` traits for the float
Expand Down
9 changes: 9 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,15 @@
//!
//! Compatibility with `ml-dtypes` is tested by generated fixtures in `tests/fixtures/`.
//! These fixtures validate conversions, classifications, arithmetic, and math methods.
//!
//! ## Related Crates
//!
//! ### `float8`
//!
//! The [`float8`](https://crates.io/crates/float8) crate provides `F8E4M3` and `F8E5M2` types that are not fully [OCP](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-12-01-pdf-1) compliant.
//! They use NVIDIA's `__NV_SATFINITE` saturation mode ([`cuda_fp8.hpp`](https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp)).
//! In this mode, the `INFINITY` constants are `FP8_MAXNORM` overflow sentinels rather than true infinities.
//! In contrast, microfloat uses `__NV_NOSAT` semantics (IEEE NaN/Inf on overflow).

mod bits;
mod format;
Expand Down
107 changes: 107 additions & 0 deletions tests/float8_e4m3_errant.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#![allow(clippy::float_cmp)]
use float8::F8E4M3;
use microfloat::f8e4m3fn;
use std::num::FpCategory;

// ============================================================================
// float8 F8E4M3 behavior vs microfloat f8e4m3fn behavior
//
// float8::F8E4M3 uses NVIDIA __NV_SATFINITE saturation semantics, matching the behavior defined in https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp.
//
// In NVIDIA's model, __nv_saturation_t controls overflow behavior:
// - __NV_SATFINITE: overflow clamps to max finite value
// - __NV_NOSAT: overflow produces IEEE NaN or Inf
//
// float8 implements __NV_SATFINITE, where "INFINITY" is the overflow sentinel (FP8_MAXNORM), not a true infinity.
// microfloat implements __NV_NOSAT (IEEE / OCP compliant) instead.
//
// float8 also has a classify() bug: Zero and NaN are reported as Subnormal.
// ============================================================================

/// Packs sign, exponent and mantissa fields into a single E4M3 byte.
///
/// Bit layout is `S.EEEE.MMM` (1 + 4 + 3 = 8 bits). `exp` is truncated to its
/// low four bits and `mant` to its low three bits; shifting `sign` left by
/// seven keeps only its lowest bit.
const fn e4m3_bits(sign: u8, exp: u8, mant: u8) -> u8 {
    let sign_field = sign << 7;
    let exp_field = (exp & 0b1111) << 3;
    let mant_field = mant & 0b111;
    sign_field | exp_field | mant_field
}

#[test]
fn float8_f8e4m3_classify_zero() {
    // float8 quirk: both signed zeros are reported as Subnormal by classify().
    for zero in [F8E4M3::ZERO, F8E4M3::NEG_ZERO] {
        assert_eq!(zero.classify(), FpCategory::Subnormal);
    }
    // microfloat: both signed zeros are classified as Zero.
    for zero in [f8e4m3fn::ZERO, f8e4m3fn::NEG_ZERO] {
        assert_eq!(zero.classify(), FpCategory::Zero);
    }
}

#[test]
fn float8_f8e4m3_classify_nan() {
    // float8 quirk: classify() checks !is_normal() before is_nan(), so NaN is
    // reported as Subnormal even though the individual predicates agree it is
    // a NaN that is neither finite nor normal.
    let f8_nan = F8E4M3::NAN;
    assert_eq!(f8_nan.classify(), FpCategory::Subnormal);
    assert!(f8_nan.is_nan());
    assert!(!f8_nan.is_finite());
    assert!(!f8_nan.is_normal());

    // microfloat: classify() and is_nan() both identify the NaN.
    let mf_nan = f8e4m3fn::NAN;
    assert_eq!(mf_nan.classify(), FpCategory::Nan);
    assert!(mf_nan.is_nan());
}

#[test]
fn float8_f8e4m3_satfinite_infinity_constant() {
    // NVIDIA __NV_SATFINITE: FP8_MAXNORM = 0x7E (exp=15, mant=6), i.e. 448.0.
    // It is the overflow sentinel, not a true infinity
    // (cuda_fp8.hpp: FP8_MAXNORM = 0x7E; res = FP8_MAXNORM;).
    //
    // For comparison:
    //   IEEE 754 E4M3: INFINITY = exp=15, mant=0 = e4m3_bits(0, 15, 0)
    //   OCP E4M3:      no infinity (N/A)
    let maxnorm_bits = e4m3_bits(0b0, 0b1111, 0b110);

    // float8: the INFINITY constant is that sentinel and reports is_infinite().
    let sentinel = F8E4M3::INFINITY;
    assert_eq!(sentinel.to_bits(), maxnorm_bits);
    assert_eq!(sentinel.to_f32(), 448.0);
    assert!(sentinel.is_infinite());

    // float8: decoding the same bit pattern behaves the same way.
    let decoded = F8E4M3::from_bits(maxnorm_bits);
    assert!(decoded.is_infinite());
    assert_eq!(decoded.to_f32(), 448.0);

    // microfloat (__NV_NOSAT / OCP): the format has no infinity and the same
    // bit pattern is the finite value 448.0.
    assert!(!f8e4m3fn::has_inf());
    let finite_max = f8e4m3fn::from_bits(maxnorm_bits);
    assert_eq!(finite_max.to_f32(), 448.0);
    assert!(finite_max.is_finite());
    assert!(!finite_max.is_infinite());
}

#[test]
fn float8_f8e4m3_max_448() {
    let max_bits = e4m3_bits(0b0, 0b1111, 0b110);

    // float8 (__NV_SATFINITE): from_f32(448.0) lands on FP8_MAXNORM = 0x7E,
    // the very same encoding as its "INFINITY" constant, so the result claims
    // to be infinite while still decoding to 448.0.
    let converted = F8E4M3::from_f32(448.0);
    assert_eq!(converted.to_bits(), max_bits);
    assert_eq!(converted.to_bits(), F8E4M3::INFINITY.to_bits());
    assert!(converted.is_infinite());
    assert_eq!(converted.to_f32(), 448.0);

    // microfloat (__NV_NOSAT / OCP compliant): MAX is 448.0 and is finite.
    let mf_max = f8e4m3fn::MAX;
    assert_eq!(mf_max.to_bits(), max_bits);
    assert_eq!(mf_max.to_f32(), 448.0);
    assert!(mf_max.is_finite());
    assert!(!mf_max.is_infinite());
}

#[test]
fn float8_f8e4m3_overflow_to_nan() {
    // microfloat (__NV_NOSAT / OCP E4M3): there is no infinity, so an
    // out-of-range finite input and both f32 infinities all convert to NaN.
    for overflowing in [500.0, f32::INFINITY, f32::NEG_INFINITY] {
        assert!(f8e4m3fn::from_f32(overflowing).to_f32().is_nan());
    }
}

#[test]
fn float8_f8e4m3_ieee_infinity_not_recognized() {
    // An IEEE 754-style E4M3 would encode infinity as exp=15, mant=0.
    // Neither float8 (__NV_SATFINITE) nor microfloat (__NV_NOSAT) treats that
    // pattern as infinity; both decode it as the ordinary value 256.0.
    let ieee_inf_bits = e4m3_bits(0b0, 0b1111, 0b000);

    // float8: not infinite, decodes to 256.0.
    let f8_val = F8E4M3::from_bits(ieee_inf_bits);
    assert!(!f8_val.is_infinite());
    assert_eq!(f8_val.to_f32(), 256.0);

    // microfloat: decodes to 256.0 and is finite, which is correct.
    let mf_val = f8e4m3fn::from_bits(ieee_inf_bits);
    assert_eq!(mf_val.to_f32(), 256.0);
    assert!(mf_val.is_finite());
}
100 changes: 100 additions & 0 deletions tests/float8_e5m2_errant.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#![allow(clippy::float_cmp)]
use float8::F8E5M2;
use microfloat::f8e5m2;
use std::num::FpCategory;

// ============================================================================
// float8 F8E5M2 behavior vs microfloat f8e5m2 behavior
//
// float8::F8E5M2 uses NVIDIA __NV_SATFINITE saturation semantics, matching the behavior defined in https://gitlab.com/nvidia/headers/cuda-individual/cudart/-/raw/main/cuda_fp8.hpp.
//
// In NVIDIA's model, __nv_saturation_t controls overflow behavior:
// - __NV_SATFINITE: overflow clamps to max finite value
// - __NV_NOSAT: overflow produces IEEE NaN or Inf
//
// float8 implements __NV_SATFINITE, where "INFINITY" is the overflow sentinel (FP8_MAXNORM), not a true infinity.
// microfloat implements __NV_NOSAT (IEEE compliant) instead.
//
// float8 also has a classify() bug: Zero and NaN are reported as Subnormal.
// ============================================================================

/// Packs sign, exponent and mantissa fields into a single E5M2 byte.
///
/// Bit layout is `S.EEEEE.MM` (1 + 5 + 2 = 8 bits). `exp` is truncated to its
/// low five bits and `mant` to its low two bits; shifting `sign` left by
/// seven keeps only its lowest bit.
const fn e5m2_bits(sign: u8, exp: u8, mant: u8) -> u8 {
    let sign_field = sign << 7;
    let exp_field = (exp & 0b1_1111) << 2;
    let mant_field = mant & 0b11;
    sign_field | exp_field | mant_field
}

#[test]
fn float8_f8e5m2_classify_zero() {
    // float8 classify() quirk (independent of the saturation mode):
    // classify() checks !is_normal() before is_nan(), so Zero (exp=0) comes
    // back as Subnormal instead of Zero.
    for zero in [F8E5M2::ZERO, F8E5M2::NEG_ZERO] {
        assert_eq!(zero.classify(), FpCategory::Subnormal);
    }
    // microfloat: both signed zeros are classified as Zero.
    for zero in [f8e5m2::ZERO, f8e5m2::NEG_ZERO] {
        assert_eq!(zero.classify(), FpCategory::Zero);
    }
}

#[test]
fn float8_f8e5m2_classify_nan() {
    // float8 classify() quirk: classify() checks !is_normal() before
    // is_nan(), so NaN (exp=31) comes back as Subnormal instead of Nan, even
    // though the individual predicates agree it is a non-finite, non-normal NaN.
    let f8_nan = F8E5M2::NAN;
    assert_eq!(f8_nan.classify(), FpCategory::Subnormal);
    assert!(f8_nan.is_nan());
    assert!(!f8_nan.is_finite());
    assert!(!f8_nan.is_normal());

    // microfloat: classify() and is_nan() both identify the NaN.
    let mf_nan = f8e5m2::NAN;
    assert_eq!(mf_nan.classify(), FpCategory::Nan);
    assert!(mf_nan.is_nan());
}

#[test]
fn float8_f8e5m2_satfinite_infinity_constant() {
    // NVIDIA __NV_SATFINITE: FP8_MAXNORM = 0x7B (exp=30, mant=3), i.e. 57344.0.
    // It is the overflow sentinel, not a true infinity
    // (cuda_fp8.hpp: FP8_MAXNORM = 0x7B; res = FP8_MAXNORM;).
    //
    // IEEE 754 E5M2 places INFINITY at exp=31, mant=0 = e5m2_bits(0, 31, 0).
    let maxnorm_bits = e5m2_bits(0b0, 0b11110, 0b11);
    let ieee_inf_bits = e5m2_bits(0b0, 0b11111, 0b00);

    // float8: the INFINITY constant is the sentinel and reports is_infinite().
    let sentinel = F8E5M2::INFINITY;
    assert_eq!(sentinel.to_bits(), maxnorm_bits);
    assert_eq!(sentinel.to_f32(), 57344.0);
    assert!(sentinel.is_infinite());

    // float8: decoding the same bit pattern behaves the same way.
    let decoded = F8E5M2::from_bits(maxnorm_bits);
    assert!(decoded.is_infinite());
    assert_eq!(decoded.to_f32(), 57344.0);

    // microfloat (__NV_NOSAT / IEEE): INFINITY lives at e5m2_bits(0, 31, 0).
    assert!(f8e5m2::has_inf());
    let mf_inf = f8e5m2::INFINITY;
    assert_eq!(mf_inf.to_bits(), ieee_inf_bits);
    assert!(mf_inf.is_infinite());
    assert!(f8e5m2::NEG_INFINITY.is_infinite());
}

#[test]
fn float8_f8e5m2_max_57344() {
    // IEEE 754 E5M2:  MAX = exp=30, mant=3 = e5m2_bits(0, 30, 3) = 57344.0
    // __NV_SATFINITE: MAX = exp=30, mant=2 = e5m2_bits(0, 30, 2) = 49152.0
    // float8's MAX is therefore one mantissa step below the IEEE maximum, and
    // its INFINITY constant sits on what IEEE E5M2 calls the largest finite.
    let ieee_max_bits = e5m2_bits(0b0, 0b11110, 0b11);
    let satfinite_max_bits = e5m2_bits(0b0, 0b11110, 0b10);

    // float8: MAX is 49152.0, INFINITY is 57344.0.
    let f8_max = F8E5M2::MAX;
    assert_eq!(f8_max.to_bits(), satfinite_max_bits);
    assert_eq!(f8_max.to_f32(), 49152.0);
    let f8_inf = F8E5M2::INFINITY;
    assert_eq!(f8_inf.to_bits(), ieee_max_bits);
    assert_eq!(f8_inf.to_f32(), 57344.0);

    // microfloat (__NV_NOSAT / IEEE compliant): MAX is 57344.0 and is finite.
    let mf_max = f8e5m2::MAX;
    assert_eq!(mf_max.to_bits(), ieee_max_bits);
    assert_eq!(mf_max.to_f32(), 57344.0);
    assert!(mf_max.is_finite());
    assert!(!mf_max.is_infinite());
}

#[test]
fn float8_f8e5m2_ieee_infinity_not_recognized() {
    // IEEE 754 E5M2 infinity is exp=31, mant=0 = e5m2_bits(0, 31, 0).
    // __NV_SATFINITE uses e5m2_bits(0, 30, 3) for "infinity" instead.
    let ieee_inf_bits = e5m2_bits(0b0, 0b11111, 0b00);

    // float8: the IEEE pattern is not reported as infinite...
    let f8_val = F8E5M2::from_bits(ieee_inf_bits);
    assert!(!f8_val.is_infinite());
    // ...yet it decodes to f32::INFINITY through the f16 conversion
    // (inconsistent).
    assert_eq!(f8_val.to_f32(), f32::INFINITY);

    // microfloat (__NV_NOSAT / IEEE): the IEEE pattern is recognized as
    // infinite and matches the INFINITY constant.
    let mf_val = f8e5m2::from_bits(ieee_inf_bits);
    assert!(mf_val.is_infinite());
    assert_eq!(f8e5m2::INFINITY.to_bits(), ieee_inf_bits);
}