From 36605ab3993c82ca5edaa39c73c135a1c345bff6 Mon Sep 17 00:00:00 2001
From: Renar Narubin <renar@standard.ai>
Date: Mon, 20 Dec 2021 21:47:30 -0800
Subject: [PATCH] Improve readme, ditch features, don't impl Float

---
 Cargo.toml            | 16 --------
 README.md             | 84 +++++++++++++++++++++++++++++----------
 benches/math.rs       | 38 ------------------
 benches/operations.rs | 18 ++++-----
 build.rs              | 15 +------
 src/lib.rs            | 91 +++++++++++++++++++------------------------
 src/math/mod.rs       | 17 +++++---
 src/num_traits.rs     | 65 +------------------------------
 8 files changed, 126 insertions(+), 218 deletions(-)
 delete mode 100644 benches/math.rs

diff --git a/Cargo.toml b/Cargo.toml
index cf2d9ed..37b4422 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,25 +13,9 @@ bench = false
 name = "operations"
 harness = false
 
-[[bench]]
-name = "math"
-harness = false
-
 [features]
 default = ["num-traits"]
 
-# disable-able fast-math features
-no-finite-math-only = []
-no-associative-math = []
-no-reciprocal-math = []
-signed-zeros = []
-trapping-math = []
-fp-contract-on = []
-no-approx-func = []
-math-errno = []
-
-# TODO denormal-fp-math? can have cpu-wide consequences
-
 # optional trait implementations
 nalgebra-v021 = ["num-traits", "nalgebra_v021", "simba_v01", "approx_v03"]
 nalgebra-v029 = ["num-traits", "nalgebra_v029", "simba_v06", "approx_v05"]
diff --git a/README.md b/README.md
index 8fd7ad9..32f38a5 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Fast Floating-Point Math
 
-`fast_fp` provides a set of primitive types that support [fast-math]
+`fast_fp` provides a set of primitive types that support [fast-math] compiler
 optimizations for many operations. These optimizations allow the compiler to
 potentially generate faster code by relaxing some of the requirements of [IEEE
 754] floating-point arithmetic.
@@ -12,29 +12,71 @@ is lost in the overall computation. Note that there are also cases where
 fast-math optimizations can _improve_ precision, such as contracting separate
 multiplication and addition into a fused multiply-add operation.
 
-## Limitations
+## Caveats
 
-In order to enable these optimizations safely, certain requirements must be
-observed:
+### Performance
+Use of this crate's primitives may not be faster than the standard primitives
+in all cases. That may be because the generated code is slower in practice, or
+because of certain measures taken by this crate to prevent Undefined Behavior
+(in particular for comparison-heavy code). Users should carefully measure and
+benchmark their code to understand whether they actually benefit from use of
+these types.
 
-- Operations **MUST NOT** involve infinite or NaN values. If the arguments to an
-	operation are, or the results of an operation _would_ be, `+inf`, `-inf`,
-	or `NaN`, then the operation's result value is unspecified. This crate goes
-	to lengths to ensure that such an operation is not Undefined Behavior in the
-	strict sense, but the output is free to be any representable value of the
-	output type, and may not be a fixed value at all.
-- Use of this crate's primitives may not be faster than the standard primitives
-	in all cases. That may be because the generated code is slower in practice,
-	or because of certain measures taken by this crate to prevent UB (in
-	particular for comparison heavy code). Users should carefully measure and
-	benchmark their code to understand whether they actually benefit from use of
-	these types.
-- The safety of this crate is only assessed against rustc's LLVM code
-	generation. This crate should not be used with alternative code generators
-	such as cranelift or GCC
-- Signed-ness of zeros may be treated as insignificant and not preserved
+### Finite Math
+By default, the `finite-math-only` optimization flag is enabled. With this
+enabled, the user must ensure that operations on the fast types **do not**
+involve infinite or NaN values. If the arguments to an operation are, or the
+results of an operation _would_ be, `+inf`, `-inf`, or `NaN`, then the
+operation's result value is unspecified. This crate goes to lengths to ensure
+that such an operation is not UB in the strict sense, but the output is free to
+be any representable value of the output type, and may not be a fixed value at
+all.
 
-[TODO]: # (is there a way to detect the code generator at build time?)
+### Building
+`fast_fp` enables fast-math optimizations by calling C code which was compiled
+with these optimizations enabled; additionally, some LLVM IR is used to prevent
+triggering UB that is otherwise possible with these optimizations. As a
+consequence, building this crate requires `clang` to be installed _and_
+requires the final binary to be linked using cross-language LTO to achieve the
+performance benefits.
+
+This LTO requires a version of clang compatible with the LLVM version used by
+rustc. To find the necessary LLVM version, check rustc's version info in
+verbose mode:
+
+```shell
+$ rustc -vV
+rustc 1.56.0 (09c42c458 2021-10-18)
+binary: rustc
+commit-hash: 09c42c45858d5f3aedfa670698275303a3d19afa
+commit-date: 2021-10-18
+host: x86_64-unknown-linux-gnu
+release: 1.56.0
+LLVM version: 13.0.0 # <--- see the version here
+```
+
+Then build and link using a `clang` and `lld` with the corresponding version:
+
+```shell
+$ CC="clang-13" \
+RUSTFLAGS="-Clinker-plugin-lto -Clinker=clang-13 -Clink-arg=-fuse-ld=lld-13" \
+cargo build
+```
+
+For simplicity, these arguments can be stored in a [cargo config] file:
+
+```toml
+[env]
+CC = "clang-13"
+
+[build]
+rustflags = ["-Clinker-plugin-lto", "-Clinker=clang-13", "-Clink-arg=-fuse-ld=lld-13"]
+```
+
+Although rustc does not always use an official LLVM release version, it's
+typically close enough to be interoperable with the official clang and LLVM
+releases of the same version number.
 
 [fast-math]: https://llvm.org/docs/LangRef.html#fast-math-flags
 [IEEE 754]: https://en.wikipedia.org/wiki/IEEE_754
+[cargo config]: https://doc.rust-lang.org/cargo/reference/config.html
diff --git a/benches/math.rs b/benches/math.rs
deleted file mode 100644
index 9dbc7a1..0000000
--- a/benches/math.rs
+++ /dev/null
@@ -1,38 +0,0 @@
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
-use fast_fp::{ff32, FF32};
-use rand::{distributions::Standard, thread_rng, Rng};
-
-fn min(c: &mut Criterion) {
-    let mut group = c.benchmark_group("min");
-    for count in [2, 8, 32, 1024] {
-        group.throughput(Throughput::Elements(count as u64));
-
-        let f32_vals = thread_rng()
-            .sample_iter(Standard)
-            .take(count)
-            .collect::<Vec<f32>>();
-
-        // use the same values for both benchmarks
-        let ff32_vals = f32_vals
-            .clone()
-            .into_iter()
-            .map(ff32)
-            .collect::<Vec<FF32>>();
-
-        group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| {
-            b.iter(|| vals.iter().copied().fold(f32::MAX, |acc, val| acc.min(val)));
-        });
-
-        group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| {
-            b.iter(|| {
-                vals.iter()
-                    .copied()
-                    .fold(FF32::MAX, |acc, val| acc.min(val))
-            });
-        });
-    }
-    group.finish();
-}
-
-criterion_group!(benches, min);
-criterion_main!(benches);
diff --git a/benches/operations.rs b/benches/operations.rs
index 32b6b92..3002682 100644
--- a/benches/operations.rs
+++ b/benches/operations.rs
@@ -3,12 +3,12 @@ use criterion::{
     BenchmarkId, Criterion, Throughput,
 };
 use fast_fp::{ff32, ff64, FF32, FF64};
+use ops::{Add, Div, Mul};
 use rand::{
     distributions::{self, Distribution},
     rngs::StdRng,
     Rng, SeedableRng,
 };
-use std::ops::{Add, Div, Mul};
 
 fn add(c: &mut Criterion) {
     let mut group = c.benchmark_group("add");
@@ -18,9 +18,9 @@ fn add(c: &mut Criterion) {
     let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);
 
     // clone the rng for each benched type to keep the generated values identical
-    fold(&mut group, "std::f32", f32::add, 0.0, rng.clone(), f32s);
+    fold(&mut group, "f32", f32::add, 0.0, rng.clone(), f32s);
     fold(&mut group, "FF32", FF32::add, ff32(0.0), rng.clone(), f32s);
-    fold(&mut group, "std::f64", f64::add, 0.0, rng.clone(), f64s);
+    fold(&mut group, "f64", f64::add, 0.0, rng.clone(), f64s);
     fold(&mut group, "FF64", FF64::add, ff64(0.0), rng.clone(), f64s);
 }
 
@@ -34,9 +34,9 @@ fn mul(c: &mut Criterion) {
     let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);
 
     // clone the rng for each benched type to keep the generated values identical
-    fold(&mut group, "std::f32", f32::mul, 0.0, rng.clone(), f32s);
+    fold(&mut group, "f32", f32::mul, 0.0, rng.clone(), f32s);
     fold(&mut group, "FF32", FF32::mul, ff32(0.0), rng.clone(), f32s);
-    fold(&mut group, "std::f64", f64::mul, 0.0, rng.clone(), f64s);
+    fold(&mut group, "f64", f64::mul, 0.0, rng.clone(), f64s);
     fold(&mut group, "FF64", FF64::mul, ff64(0.0), rng.clone(), f64s);
 }
 
@@ -50,9 +50,9 @@ fn div(c: &mut Criterion) {
     let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);
 
     // clone the rng for each benched type to keep the generated values identical
-    fold(&mut group, "std::f32", f32::div, 0.0, rng.clone(), f32s);
+    fold(&mut group, "f32", f32::div, 0.0, rng.clone(), f32s);
     fold(&mut group, "FF32", FF32::div, ff32(0.0), rng.clone(), f32s);
-    fold(&mut group, "std::f64", f64::div, 0.0, rng.clone(), f64s);
+    fold(&mut group, "f64", f64::div, 0.0, rng.clone(), f64s);
     fold(&mut group, "FF64", FF64::div, ff64(0.0), rng.clone(), f64s);
 }
 
@@ -64,9 +64,9 @@ fn min(c: &mut Criterion) {
     let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);
 
     // clone the rng for each benched type to keep the generated values identical
-    fold(&mut group, "std::f32", f32::min, 0.0, rng.clone(), f32s);
+    fold(&mut group, "f32", f32::min, 0.0, rng.clone(), f32s);
     fold(&mut group, "FF32", FF32::min, ff32(0.0), rng.clone(), f32s);
-    fold(&mut group, "std::f64", f64::min, 0.0, rng.clone(), f64s);
+    fold(&mut group, "f64", f64::min, 0.0, rng.clone(), f64s);
     fold(&mut group, "FF64", FF64::min, ff64(0.0), rng.clone(), f64s);
 }
 
diff --git a/build.rs b/build.rs
index c2fb8c3..532c48b 100644
--- a/build.rs
+++ b/build.rs
@@ -24,34 +24,21 @@ fn build_ll(mut builder: cc::Build) {
 fn build_c(mut builder: cc::Build) {
     builder.opt_level(3);
 
-    #[cfg(not(feature = "no-associative-math"))]
+    // TODO control flags with generics
     builder.flag("-fassociative-math");
-
-    #[cfg(not(feature = "no-reciprocal-math"))]
     builder.flag("-freciprocal-math");
-
-    #[cfg(not(feature = "signed-zeros"))]
     builder.flag("-fno-signed-zeros");
-
-    #[cfg(not(feature = "trapping-math"))]
     builder.flag("-fno-trapping-math");
-
-    #[cfg(not(feature = "fp-contract-on"))]
     builder.flag("-ffp-contract=fast");
-
     // -fapprox-func isn't currently available in the driver, but it is in clang itself
     // https://reviews.llvm.org/D106191
-    #[cfg(not(feature = "no-approx-func"))]
     builder.flag("-Xclang").flag("-fapprox-func");
-
-    #[cfg(not(feature = "math-errno"))]
     builder.flag("-fno-math-errno");
 
     // poison_unsafe must be compiled without finite-math-only
     // see its docs for details
     poison_unsafe(builder.clone());
 
-    #[cfg(not(feature = "no-finite-math-only"))]
     builder.flag("-ffinite-math-only");
 
     poison_safe(builder);
diff --git a/src/lib.rs b/src/lib.rs
index 8737bc0..c08a582 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -45,26 +45,37 @@ mod num_traits;
 mod poison;
 use poison::MaybePoison;
 
-/// The error returned by the checked constructors of [`FF32`] and [`FF64`]
-#[derive(Clone, Debug, PartialEq)]
-pub struct InvalidValueError {
-    _priv: (),
-}
-
-impl fmt::Display for InvalidValueError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.write_str("value may not be infinite or NaN")
-    }
-}
-
-impl std::error::Error for InvalidValueError {}
-
 // The big challenge with fast-math in general is avoiding UB, and to a lesser extent unspecified
 // values. LLVM's fast operations document "poison" behavior when given invalid inputs; poison
 // values have a relatively consistent behavior (stuff like transitivity), defined cases for UB,
 // and importantly can be limited in scope by freezing to a fixed value.
 //
-// FIXME more docs
+// This library manages these poison values to prevent UB. On the rust side, potentially-poison
+// values are stored in a `MaybePoison` type, similar to the std's `MaybeUninit`. This helps ensure
+// that the values would not trigger UB based on rust's semantics (for example, avoiding questions
+// of whether all bit patterns of a primitive are valid). On the C side, operations are split into
+// two groups: poison "safe" and poison "unsafe". Poison safe operations are ones which can accept
+// any input value without triggering any UB. The operation may produce a poison value, for example
+// `1.f / 0.f` with finite-math-only enabled, but not UB. Poison unsafe operations are ones which
+// could trigger UB for some input value(s). These two definitions follow LLVM's documentation on
+// poison, which explains poison can be relaxed to any value for a type, including `undef`.
+// Therefore, if poison is passed to an operation it could be relaxed to any value; if some value
+// could trigger UB, then so can poison.
+//
+// Poison safe operations are called with input values normally. They don't produce UB, so it's
+// safe to call no matter the input. The operation is assumed to potentially produce poison itself,
+// so the output is always wrapped in a `MaybePoison`.
+//
+// Poison unsafe operations must take certain precautions. First, any input arguments that are
+// `MaybePoison` are frozen using LLVM's `freeze` instruction. This produces a value with an
+// unspecified, but fixed, value which now won't be relaxed any further. Additionally, these
+// operations are compiled without any flags that potentially introduce poison, regardless of
+// enabled crate features. This ensures that the operation internally should not produce any poison
+// regardless of input value. These two steps together preclude any poison values, which should
+// prevent UB (assuming the operation was safe to call in the first place).
+//
+// All operations in rust are considered poison unsafe, and therefore must always freeze the value
+// before using it. Freezing produces a regular f32/f64.
 //
 // Prior art and references
 //
@@ -87,6 +98,20 @@ impl std::error::Error for InvalidValueError {}
 // https://github.com/rust-lang/unsafe-code-guidelines/issues/71
 // notes on the validity of primitive bit patterns
 
+/// The error returned by the checked constructors of [`FF32`] and [`FF64`]
+#[derive(Clone, Debug, PartialEq)]
+pub struct InvalidValueError {
+    _priv: (),
+}
+
+impl fmt::Display for InvalidValueError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.write_str("value may not be infinite or NaN")
+    }
+}
+
+impl std::error::Error for InvalidValueError {}
+
 /// A wrapper over `f32` which enables some fast-math optimizations.
 // TODO how best to document unspecified values, including witnessing possibly varying values
 #[derive(Clone, Copy)]
@@ -424,42 +449,6 @@ macro_rules! impls {
             }
         }
 
-        // FIXME feature conditional Eq/Ord
-        impl Eq for $fast_ty {}
-
-        impl Ord for $fast_ty {
-            #[inline(always)]
-            fn cmp(&self, other: &$fast_ty) -> cmp::Ordering {
-                let this = self.freeze_raw();
-                let that = other.freeze_raw();
-
-                // Note NaNs are not supported (and would break everything else anyway) so we ignore them
-                // and implement full Ord
-                if this < that {
-                    cmp::Ordering::Less
-                } else if this > that {
-                    cmp::Ordering::Greater
-                } else {
-                    cmp::Ordering::Equal
-                }
-            }
-
-            #[inline]
-            fn min(self, other: $fast_ty) -> $fast_ty {
-                <$fast_ty>::min(self, other)
-            }
-
-            #[inline]
-            fn max(self, other: $fast_ty) -> $fast_ty {
-                <$fast_ty>::max(self, other)
-            }
-
-            #[inline]
-            fn clamp(self, min: $fast_ty, max: $fast_ty) -> $fast_ty {
-                <$fast_ty>::clamp(self, min, max)
-            }
-        }
-
         impl From<$fast_ty> for $base_ty {
             #[inline(always)]
             fn from(from: $fast_ty) -> Self {
diff --git a/src/math/mod.rs b/src/math/mod.rs
index e044059..47c40fb 100644
--- a/src/math/mod.rs
+++ b/src/math/mod.rs
@@ -80,14 +80,16 @@ macro_rules! poison_safe_fns {
                     // primitive type, we can pass them directly over FFI
                     fn [<$fn _ $base_ty>](a: $fast_ty $(, $arg: $fast_ty)*) -> $fast_ty;
                 }
+            )*
 
-                impl $fast_ty {
+            impl $fast_ty {
+                $(
                     #[inline]
                     pub fn $fn(self $(, $arg: Self)*) -> Self {
                         unsafe { [<$fn _ $base_ty>](self $(, $arg)*) }
                     }
-                }
-            )*
+                )*
+            }
         }
     }
 }
@@ -104,13 +106,16 @@ macro_rules! poison_unsafe_fns {
                     fn [<$fn _ $base_ty>](a: $base_ty $(, $arg: $base_ty)*) -> $fast_ty;
                 }
 
-                impl $fast_ty {
+            )*
+
+            impl $fast_ty {
+                $(
                     #[inline]
                     pub fn $fn(self $(, $arg: Self)*) -> Self {
                         unsafe { [<$fn _ $base_ty>](self.freeze_raw() $(, $arg.freeze_raw())*) }
                     }
-                }
-            )*
+                )*
+            }
         }
     }
 }
diff --git a/src/num_traits.rs b/src/num_traits.rs
index 5cfa6c8..2c0081e 100644
--- a/src/num_traits.rs
+++ b/src/num_traits.rs
@@ -1,7 +1,7 @@
 #![cfg(feature = "num-traits")]
 #![cfg_attr(docsrs, doc(cfg(feature = "num-traits")))]
+
 use crate::{FF32, FF64};
-use core::num::FpCategory;
 
 macro_rules! forward_freeze_ty {
     ($fast_ty:ident, $base_ty:ident
@@ -189,46 +189,7 @@ macro_rules! impl_num_traits {
             }
         }
 
-        /// Because inf and nan are prohibited, the `fast_fp` types correspond more to the `Real`
-        /// trait than the `Float` trait. However in practice some libs require a Float bound when
-        /// they could really use a Real, which would restrict using the `fast_fp` types.
-        impl num_traits::Float for $fast_ty {
-            /// Panics because NaN values are not supported
-            #[inline]
-            fn nan() -> Self {
-                panic!(concat!(
-                    stringify!($fast_ty),
-                    " does not support NaN values"
-                ));
-            }
-
-            /// Panics because infinite values are not supported
-            ///
-            /// Consider using [`max_value`](num_traits::Float::max_value) as appropriate instead
-            #[inline]
-            fn infinity() -> Self {
-                panic!(concat!(
-                    stringify!($fast_ty),
-                    " does not support infinite values. Consider using `max_value` for comparisons"
-                ));
-            }
-
-            /// Panics because infinite values are not supported
-            ///
-            /// Consider using [`min_value`](num_traits::Float::min_value) as appropriate instead
-            #[inline]
-            fn neg_infinity() -> Self {
-                panic!(concat!(
-                    stringify!($fast_ty),
-                    " does not support infinite values. Consider using `min_value` for comparisons"
-                ));
-            }
-
-            #[inline]
-            fn neg_zero() -> Self {
-                -Self::ZERO
-            }
-
+        impl num_traits::real::Real for $fast_ty {
             #[inline]
             fn min_value() -> Self {
                 $fast_ty::MIN
@@ -249,25 +210,8 @@ macro_rules! impl_num_traits {
                 <$fast_ty>::new($base_ty::EPSILON)
             }
 
-            #[inline]
-            fn is_nan(self) -> bool {
-                false
-            }
-
-            #[inline]
-            fn is_infinite(self) -> bool {
-                false
-            }
-
-            #[inline]
-            fn is_finite(self) -> bool {
-                true
-            }
-
             forward_self! {
                 $fast_ty, $base_ty
-                fn is_normal(self) -> bool;
-                fn classify(self) -> FpCategory;
                 fn floor(self) -> Self;
                 fn ceil(self) -> Self;
                 fn round(self) -> Self;
@@ -317,11 +261,6 @@ macro_rules! impl_num_traits {
                 #[allow(deprecated)]
                 fn abs_sub(self, other: Self) -> Self;
             }
-
-            #[inline]
-            fn integer_decode(self) -> (u64, i16, i8) {
-                <$base_ty as num_traits::Float>::integer_decode(self.freeze_raw())
-            }
         }
     };
 }