From 888373bcb6b0c145b05e3912337f3638ae3e4f9c Mon Sep 17 00:00:00 2001
From: koreaygj <ygj1828@gmail.com>
Date: Mon, 1 Dec 2025 18:54:58 +0900
Subject: [PATCH 1/2] feat: Add global fast_math flag to CudaBuilder

- Add fast_math field to CudaBuilder struct
- Enables ftz, fast_sqrt, fast_div and fma_contraction(fmad) internally
- Provides convenient parity with NVCC's --use_fast_math
---
 crates/cuda_builder/src/lib.rs | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs
index f54775c9..4616a8e0 100644
--- a/crates/cuda_builder/src/lib.rs
+++ b/crates/cuda_builder/src/lib.rs
@@ -148,6 +148,10 @@ pub struct CudaBuilder {
     /// Enable FMA (fused multiply-add) contraction.
     /// `true` by default.
     pub fma_contraction: bool,
+    /// Enable fast math approximations globally (equivalent to NVCC's --use_fast_math).
+    /// This implies ftz=true, prec-div=false, prec-sqrt=false, and fmad=true.
+    /// `false` by default.
+    pub fast_math: bool,
     /// Whether to emit a certain IR. Emitting LLVM IR is useful to debug any codegen
     /// issues. If you are submitting a bug report try to include the LLVM IR file of
     /// the program that contains the offending function.
@@ -206,6 +210,7 @@ impl CudaBuilder {
             nvvm_opts: true,
             arch: NvvmArch::default(),
             ftz: false,
+            fast_math: false,
             fast_sqrt: false,
             fast_div: false,
             fma_contraction: true,
@@ -266,6 +271,19 @@ impl CudaBuilder {
         self
     }
 
+    /// Enable fast math approximations globally (equivalent to NVCC's --use_fast_math).
+    /// This implies ftz=true, prec-div=false, prec-sqrt=false, and fmad=true.
+    pub fn fast_math(mut self, fast_math: bool) -> Self {
+        self.fast_math = fast_math;
+        if fast_math {
+            self.ftz = true;
+            self.fast_sqrt = true;
+            self.fast_div = true;
+            self.fma_contraction = true;
+        }
+        self
+    }
+
     /// Use a fast approximation for single-precision floating point square root.
     pub fn fast_sqrt(mut self, fast_sqrt: bool) -> Self {
         self.fast_sqrt = fast_sqrt;
@@ -725,6 +743,10 @@ fn invoke_rustc(builder: &CudaBuilder) -> Result<PathBuf, CudaBuilderError> {
         llvm_args.push("-ftz=1".to_string());
     }
 
+    if builder.fast_math {
+        llvm_args.push("--use_fast_math".to_string());
+    }
+
     if builder.fast_sqrt {
         llvm_args.push("-prec-sqrt=0".to_string());
     }

From 02f3a46d56e66daf3b6d7166b3f1a5571b463049 Mon Sep 17 00:00:00 2001
From: koreaygj <ygj1828@gmail.com>
Date: Wed, 11 Mar 2026 19:41:51 +0900
Subject: [PATCH 2/2] refactor: fast_math flag implements

---
 crates/cuda_builder/src/lib.rs | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs
index 9272dda7..3620c50a 100644
--- a/crates/cuda_builder/src/lib.rs
+++ b/crates/cuda_builder/src/lib.rs
@@ -148,10 +148,6 @@ pub struct CudaBuilder {
     /// Enable FMA (fused multiply-add) contraction.
     /// `true` by default.
     pub fma_contraction: bool,
-    /// Enable fast math approximations globally (equivalent to NVCC's --use_fast_math).
-    /// This implies ftz=true, prec-div=false, prec-sqrt=false, and fmad=true.
-    /// `false` by default.
-    pub fast_math: bool,
     /// Whether to emit a certain IR. Emitting LLVM IR is useful to debug any codegen
     /// issues. If you are submitting a bug report try to include the LLVM IR file of
     /// the program that contains the offending function.
@@ -210,7 +206,6 @@ impl CudaBuilder {
             nvvm_opts: true,
             arch: NvvmArch::default(),
             ftz: false,
-            fast_math: false,
             fast_sqrt: false,
             fast_div: false,
             fma_contraction: true,
@@ -271,16 +266,18 @@ impl CudaBuilder {
         self
     }
 
-    /// Enable fast math approximations globally (equivalent to NVCC's --use_fast_math).
-    /// This implies ftz=true, prec-div=false, prec-sqrt=false, and fmad=true.
-    pub fn fast_math(mut self, fast_math: bool) -> Self {
-        self.fast_math = fast_math;
-        if fast_math {
-            self.ftz = true;
-            self.fast_sqrt = true;
-            self.fast_div = true;
-            self.fma_contraction = true;
-        }
+    /// Enable fast math approximations globally (equivalent to NVCC's `--use_fast_math`).
+    /// Sets `ftz=true`, `fast_sqrt=true`, `fast_div=true`, and `fma_contraction=true`.
+    /// Individual flags can still be overridden afterward.
+    ///
+    /// Note: this sacrifices IEEE 754 compliance for performance. Single-precision
+    /// division and square root will have up to 2 ULP error, and denormal values
+    /// will be flushed to zero.
+    pub fn fast_math(mut self) -> Self {
+        self.ftz = true;
+        self.fast_sqrt = true;
+        self.fast_div = true;
+        self.fma_contraction = true;
         self
     }
 
@@ -741,10 +738,6 @@ fn invoke_rustc(builder: &CudaBuilder) -> Result<PathBuf, CudaBuilderError> {
         llvm_args.push("-ftz=1".to_string());
     }
 
-    if builder.fast_math {
-        llvm_args.push("--use_fast_math".to_string());
-    }
-
     if builder.fast_sqrt {
         llvm_args.push("-prec-sqrt=0".to_string());
     }