Improve readme, ditch features, don't impl Float

2021-12-20 21:47:30 -08:00
parent 63647ad28d
commit 36605ab399
8 changed files with 126 additions and 218 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,25 +13,9 @@ bench = false
 name = "operations"
 harness = false
 [[bench]]
 name = "math"
 harness = false
 [features]
 default = ["num-traits"]
 # disable-able fast-math features
 no-finite-math-only = []
 no-associative-math = []
 no-reciprocal-math = []
 signed-zeros = []
 trapping-math = []
 fp-contract-on = []
 no-approx-func = []
 math-errno = []
 # TODO denormal-fp-math? can have cpu-wide consequences
 # optional trait implementations
 nalgebra-v021 = ["num-traits", "nalgebra_v021", "simba_v01", "approx_v03"]
 nalgebra-v029 = ["num-traits", "nalgebra_v029", "simba_v06", "approx_v05"]
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Fast Floating-Point Math
-`fast_fp` provides a set of primitive types that support [fast-math]
+`fast_fp` provides a set of primitive types that support [fast-math] compiler
 optimizations for many operations. These optimizations allow the compiler to
 potentially generate faster code by relaxing some of the requirements of [IEEE
 754] floating-point arithmetic.
@@ -12,29 +12,71 @@ is lost in the overall computation. Note that there are also cases where
 fast-math optimizations can _improve_ precision, such as contracting separate
 multiplication and addition into a fused multiply-add operation.
-## Limitations
+## Caveats
-In order to enable these optimizations safely, certain requirements must be
+### Performance
-observed:
+Use of this crate's primitives may not be faster than the standard primitives
 in all cases. That may be because the generated code is slower in practice, or
 because of certain measures taken by this crate to prevent Undefined Behavior
 (in particular for comparison-heavy code). Users should carefully measure and
 benchmark their code to understand whether they actually benefit from use of
 these types.
- Operations **MUST NOT** involve infinite or NaN values. If the arguments to an
+### Finite Math
-	operation are, or the results of an operation _would_ be, `+inf`, `-inf`,
+By default, the `finite-math-only` optimization flag is enabled. With this
-	or `NaN`, then the operation's result value is unspecified. This crate goes
+enabled, the user must ensure that operations on the fast types **do not**
-	to lengths to ensure that such an operation is not Undefined Behavior in the
+involve infinite or NaN values. If the arguments to an operation are, or the
-	strict sense, but the output is free to be any representable value of the
+results of an operation _would_ be, `+inf`, `-inf`, or `NaN`, then the
-	output type, and may not be a fixed value at all.
+operation's result value is unspecified. This crate goes to lengths to ensure
- Use of this crate's primitives may not be faster than the standard primitives
+that such an operation is not UB in the strict sense, but the output is free to
-	in all cases. That may be because the generated code is slower in practice,
+be any representable value of the output type, and may not be a fixed value at
-	or because of certain measures taken by this crate to prevent UB (in
+all.
 	particular for comparison heavy code). Users should carefully measure and
 	benchmark their code to understand whether they actually benefit from use of
 	these types.
 - The safety of this crate is only assessed against rustc's LLVM code
 	generation. This crate should not be used with alternative code generators
 	such as Cranelift or GCC.
 - The signedness of zeros may be treated as insignificant and may not be preserved.
-[TODO]: # (is there a way to detect the code generator at build time?)
+### Building
 `fast_fp` enables fast-math optimizations by calling C code which was compiled
 with these optimizations enabled; additionally, some LLVM IR is used to prevent
 triggering UB that is otherwise possible with these optimizations. As a
 consequence, building this crate requires `clang` to be installed _and_
 requires the final binary to be linked using cross-language LTO to achieve the
 performance benefits.
 This LTO requires a version of clang compatible with the LLVM version used by
 rustc. To find the necessary LLVM version, check rustc's version info in
 verbose mode:
 ```shell
 $ rustc -vV
 rustc 1.56.0 (09c42c458 2021-10-18)
 binary: rustc
 commit-hash: 09c42c45858d5f3aedfa670698275303a3d19afa
 commit-date: 2021-10-18
 host: x86_64-unknown-linux-gnu
 release: 1.56.0
 LLVM version: 13.0.0 # <--- see the version here
 ```
 Then build and link using a `clang` and `lld` with the corresponding version:
 ```shell
 $ CC="clang-13" \
 RUSTFLAGS="-Clinker-plugin-lto -Clinker=clang-13 -Clink-arg=-fuse-ld=lld-13" \
 cargo build
 ```
 For simplicity, these arguments can be stored in a [cargo config] file:
 ```toml
 [env]
 CC = "clang-13"
 [build]
 rustflags = ["-Clinker-plugin-lto", "-Clinker=clang-13", "-Clink-arg=-fuse-ld=lld-13"]
 ```
 Although rustc does not always use an official LLVM release version, it's
 typically close enough to be interoperable with the official clang and LLVM
 releases of the same version number.
 [fast-math]: https://llvm.org/docs/LangRef.html#fast-math-flags
 [IEEE 754]: https://en.wikipedia.org/wiki/IEEE_754
 [cargo config]: https://doc.rust-lang.org/cargo/reference/config.html
--- a/benches/math.rs
+++ b/benches/math.rs
@@ -1,38 +0,0 @@
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
 use fast_fp::{ff32, FF32};
 use rand::{distributions::Standard, thread_rng, Rng};
 /// Benchmarks a running-minimum fold over randomly sampled values, once for plain `f32` and
 /// once for `FF32`, at several input sizes.
 fn min(c: &mut Criterion) {
    let mut group = c.benchmark_group("min");
    for count in [2, 8, 32, 1024] {
        group.throughput(Throughput::Elements(count as u64));

        let f32_vals: Vec<f32> = thread_rng().sample_iter(Standard).take(count).collect();
        // mirror the exact same samples into the fast type so both benchmarks see identical data
        let ff32_vals: Vec<FF32> = f32_vals.iter().copied().map(ff32).collect();

        group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| {
            b.iter(|| vals.iter().copied().fold(f32::MAX, f32::min));
        });
        group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| {
            b.iter(|| vals.iter().copied().fold(FF32::MAX, FF32::min));
        });
    }
    group.finish();
 }
 criterion_group!(benches, min);
 criterion_main!(benches);
--- a/benches/operations.rs
+++ b/benches/operations.rs
@@ -3,12 +3,12 @@ use criterion::{
    BenchmarkId, Criterion, Throughput,
 };
 use fast_fp::{ff32, ff64, FF32, FF64};
 use ops::{Add, Div, Mul};
 use rand::{
    distributions::{self, Distribution},
    rngs::StdRng,
    Rng, SeedableRng,
 };
 use std::ops::{Add, Div, Mul};
 fn add(c: &mut Criterion) {
    let mut group = c.benchmark_group("add");
@@ -18,9 +18,9 @@ fn add(c: &mut Criterion) {
    let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);
    // clone the rng for each benched type to keep the generated values identical
-    fold(&mut group, "std::f32", f32::add, 0.0, rng.clone(), f32s);
+    fold(&mut group, "f32", f32::add, 0.0, rng.clone(), f32s);
    fold(&mut group, "FF32", FF32::add, ff32(0.0), rng.clone(), f32s);
-    fold(&mut group, "std::f64", f64::add, 0.0, rng.clone(), f64s);
+    fold(&mut group, "f64", f64::add, 0.0, rng.clone(), f64s);
    fold(&mut group, "FF64", FF64::add, ff64(0.0), rng.clone(), f64s);
 }
@@ -34,9 +34,9 @@ fn mul(c: &mut Criterion) {
    let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);
    // clone the rng for each benched type to keep the generated values identical
-    fold(&mut group, "std::f32", f32::mul, 0.0, rng.clone(), f32s);
+    fold(&mut group, "f32", f32::mul, 0.0, rng.clone(), f32s);
    fold(&mut group, "FF32", FF32::mul, ff32(0.0), rng.clone(), f32s);
-    fold(&mut group, "std::f64", f64::mul, 0.0, rng.clone(), f64s);
+    fold(&mut group, "f64", f64::mul, 0.0, rng.clone(), f64s);
    fold(&mut group, "FF64", FF64::mul, ff64(0.0), rng.clone(), f64s);
 }
@@ -50,9 +50,9 @@ fn div(c: &mut Criterion) {
    let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);
    // clone the rng for each benched type to keep the generated values identical
-    fold(&mut group, "std::f32", f32::div, 0.0, rng.clone(), f32s);
+    fold(&mut group, "f32", f32::div, 0.0, rng.clone(), f32s);
    fold(&mut group, "FF32", FF32::div, ff32(0.0), rng.clone(), f32s);
-    fold(&mut group, "std::f64", f64::div, 0.0, rng.clone(), f64s);
+    fold(&mut group, "f64", f64::div, 0.0, rng.clone(), f64s);
    fold(&mut group, "FF64", FF64::div, ff64(0.0), rng.clone(), f64s);
 }
@@ -64,9 +64,9 @@ fn min(c: &mut Criterion) {
    let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);
    // clone the rng for each benched type to keep the generated values identical
-    fold(&mut group, "std::f32", f32::min, 0.0, rng.clone(), f32s);
+    fold(&mut group, "f32", f32::min, 0.0, rng.clone(), f32s);
    fold(&mut group, "FF32", FF32::min, ff32(0.0), rng.clone(), f32s);
-    fold(&mut group, "std::f64", f64::min, 0.0, rng.clone(), f64s);
+    fold(&mut group, "f64", f64::min, 0.0, rng.clone(), f64s);
    fold(&mut group, "FF64", FF64::min, ff64(0.0), rng.clone(), f64s);
 }
--- a/build.rs
+++ b/build.rs
@@ -24,34 +24,21 @@ fn build_ll(mut builder: cc::Build) {
 fn build_c(mut builder: cc::Build) {
    builder.opt_level(3);
-    #[cfg(not(feature = "no-associative-math"))]
+    // TODO control flags with generics
    builder.flag("-fassociative-math");
    #[cfg(not(feature = "no-reciprocal-math"))]
    builder.flag("-freciprocal-math");
    #[cfg(not(feature = "signed-zeros"))]
    builder.flag("-fno-signed-zeros");
    #[cfg(not(feature = "trapping-math"))]
    builder.flag("-fno-trapping-math");
    #[cfg(not(feature = "fp-contract-on"))]
    builder.flag("-ffp-contract=fast");
    // -fapprox-func isn't currently available in the driver, but it is in clang itself
    // https://reviews.llvm.org/D106191
    #[cfg(not(feature = "no-approx-func"))]
    builder.flag("-Xclang").flag("-fapprox-func");
    #[cfg(not(feature = "math-errno"))]
    builder.flag("-fno-math-errno");
    // poison_unsafe must be compiled without finite-math-only
    // see its docs for details
    poison_unsafe(builder.clone());
    #[cfg(not(feature = "no-finite-math-only"))]
    builder.flag("-ffinite-math-only");
    poison_safe(builder);
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -45,26 +45,37 @@ mod num_traits;
 mod poison;
 use poison::MaybePoison;
 /// Error produced by the checked constructors of [`FF32`] and [`FF64`] when the supplied
 /// value is infinite or NaN.
 #[derive(Clone, Debug, PartialEq)]
 pub struct InvalidValueError {
    // zero-sized private field: prevents construction outside the defining module
    _priv: (),
 }
 impl fmt::Display for InvalidValueError {
    /// Writes the fixed, human-readable description of the error.
    fn fmt(&self, out: &mut fmt::Formatter) -> fmt::Result {
        write!(out, "value may not be infinite or NaN")
    }
 }
 // there is no underlying cause to report, so the default `Error` methods suffice
 impl std::error::Error for InvalidValueError {}
 // The big challenge with fast-math in general is avoiding UB, and to a lesser extent unspecified
 // values. LLVM's fast operations document "poison" behavior when given invalid inputs; poison
 // values have a relatively consistent behavior (stuff like transitivity), defined cases for UB,
 // and importantly can be limited in scope by freezing to a fixed value.
 //
-// FIXME more docs
+// This library manages these poison values to prevent UB. On the rust side, potentially-poison
 // values are stored in a `MaybePoison` type, similar to the std's `MaybeUninit`. This helps ensure
 // that the values would not trigger UB based on rust's semantics (for example, avoiding questions
 // of whether all bit patterns of a primitive are valid). On the C side, operations are split into
 // two groups: poison "safe" and poison "unsafe". Poison safe operations are ones which can accept
 // any input value without triggering any UB. The operation may produce a poison value, for example
 // `1.f / 0.f` with finite-math-only enabled, but not UB. Poison unsafe operations are ones which
 // could trigger UB for some input value(s). These two definitions follow LLVM's documentation on
 // poison, which explains poison can be relaxed to any value for a type, including `undef`.
 // Therefore, if poison is passed to an operation it could be relaxed to any value; if some value
 // could trigger UB, then so can poison.
 //
 // Poison safe operations are called with input values normally. They don't produce UB, so it's
 // safe to call no matter the input. The operation is assumed to potentially produce poison itself,
 // so the output is always wrapped in a `MaybePoison`.
 //
 // Poison unsafe operations must take certain precautions. First, any input arguments that are
 // `MaybePoison` are frozen using LLVM's `freeze` instruction. This produces a value with an
 // unspecified, but fixed, value which now won't be relaxed any further. Additionally, these
 // operations are compiled without any flags that potentially introduce poison, regardless of
 // enabled crate features. This ensures that the operation internally should not produce any poison
 // regardless of input value. These two steps together preclude any poison values, which should
 // prevent UB (assuming the operation was safe to call in the first place).
 //
 // All operations in rust are considered poison unsafe, and therefore must always freeze the value
 // before using it. Freezing produces a regular `f32`/`f64`.
 //
 // Prior art and references
 //
@@ -87,6 +98,20 @@ impl std::error::Error for InvalidValueError {}
 // https://github.com/rust-lang/unsafe-code-guidelines/issues/71
 // notes on the validity of primitive bit patterns
 /// Error produced by the checked constructors of [`FF32`] and [`FF64`] when the supplied
 /// value is infinite or NaN.
 #[derive(Clone, Debug, PartialEq)]
 pub struct InvalidValueError {
    // zero-sized private field: prevents construction outside the defining module
    _priv: (),
 }
 impl fmt::Display for InvalidValueError {
    /// Writes the fixed, human-readable description of the error.
    fn fmt(&self, out: &mut fmt::Formatter) -> fmt::Result {
        write!(out, "value may not be infinite or NaN")
    }
 }
 // there is no underlying cause to report, so the default `Error` methods suffice
 impl std::error::Error for InvalidValueError {}
 /// A wrapper over `f32` which enables some fast-math optimizations.
 // TODO how best to document unspecified values, including witnessing possibly varying values
 #[derive(Clone, Copy)]
@@ -424,42 +449,6 @@ macro_rules! impls {
            }
        }
        // FIXME feature conditional Eq/Ord
        impl Eq for $fast_ty {}
        impl Ord for $fast_ty {
            /// Orders two values by comparing their frozen raw representations.
            ///
            /// NaNs are not supported by these types (and would break everything else anyway),
            /// so no special handling is attempted: any pair that is neither less-than nor
            /// greater-than is reported as `Equal`.
            #[inline(always)]
            fn cmp(&self, other: &$fast_ty) -> cmp::Ordering {
                let lhs = self.freeze_raw();
                let rhs = other.freeze_raw();

                match (lhs < rhs, lhs > rhs) {
                    (true, _) => cmp::Ordering::Less,
                    (_, true) => cmp::Ordering::Greater,
                    _ => cmp::Ordering::Equal,
                }
            }

            /// Forwards to the type's own `min` instead of the default `Ord::min`.
            // NOTE(review): assumes an inherent `min` exists on the type; confirm.
            #[inline]
            fn min(self, other: $fast_ty) -> $fast_ty {
                <$fast_ty>::min(self, other)
            }

            /// Forwards to the type's own `max` instead of the default `Ord::max`.
            // NOTE(review): assumes an inherent `max` exists on the type; confirm.
            #[inline]
            fn max(self, other: $fast_ty) -> $fast_ty {
                <$fast_ty>::max(self, other)
            }

            /// Forwards to the type's own `clamp` instead of the default `Ord::clamp`.
            // NOTE(review): assumes an inherent `clamp` exists on the type; confirm.
            #[inline]
            fn clamp(self, min: $fast_ty, max: $fast_ty) -> $fast_ty {
                <$fast_ty>::clamp(self, min, max)
            }
        }
        impl From<$fast_ty> for $base_ty {
            #[inline(always)]
            fn from(from: $fast_ty) -> Self {
--- a/src/math/mod.rs
+++ b/src/math/mod.rs
@@ -80,14 +80,16 @@ macro_rules! poison_safe_fns {
                    // primitive type, we can pass them directly over FFI
                    fn [<$fn _ $base_ty>](a: $fast_ty $(, $arg: $fast_ty)*) -> $fast_ty;
                }
            )*
-                impl $fast_ty {
+            impl $fast_ty {
                $(
                    #[inline]
                    pub fn $fn(self $(, $arg: Self)*) -> Self {
                        unsafe { [<$fn _ $base_ty>](self $(, $arg)*) }
                    }
-                }
+                )*
-            )*
+            }
        }
    }
 }
@@ -104,13 +106,16 @@ macro_rules! poison_unsafe_fns {
                    fn [<$fn _ $base_ty>](a: $base_ty $(, $arg: $base_ty)*) -> $fast_ty;
                }
-                impl $fast_ty {
+            )*
            impl $fast_ty {
                $(
                    #[inline]
                    pub fn $fn(self $(, $arg: Self)*) -> Self {
                        unsafe { [<$fn _ $base_ty>](self.freeze_raw() $(, $arg.freeze_raw())*) }
                    }
-                }
+                )*
-            )*
+            }
        }
    }
 }
--- a/src/num_traits.rs
+++ b/src/num_traits.rs
@@ -1,7 +1,7 @@
 #![cfg(feature = "num-traits")]
 #![cfg_attr(docsrs, doc(cfg(feature = "num-traits")))]
 use crate::{FF32, FF64};
 use core::num::FpCategory;
 macro_rules! forward_freeze_ty {
    ($fast_ty:ident, $base_ty:ident
@@ -189,46 +189,7 @@ macro_rules! impl_num_traits {
            }
        }
-        /// Because inf and nan are prohibited, the `fast_fp` types correspond more to the `Real`
+        impl num_traits::real::Real for $fast_ty {
        /// trait than the `Float` trait. However in practice some libs require a Float bound when
        /// they could really use a Real, which would restrict using the `fast_fp` types.
        impl num_traits::Float for $fast_ty {
            /// Always panics, because NaN values are not supported by this type.
            ///
            /// # Panics
            ///
            /// Unconditionally; the message names the type.
            #[inline]
            fn nan() -> Self {
                panic!(concat!(
                    stringify!($fast_ty),
                    " does not support NaN values"
                ));
            }
            /// Always panics, because infinite values are not supported by this type.
            ///
            /// Consider using [`max_value`](num_traits::Float::max_value) as appropriate instead
            ///
            /// # Panics
            ///
            /// Unconditionally; the message names the type and suggests `max_value`.
            #[inline]
            fn infinity() -> Self {
                panic!(concat!(
                    stringify!($fast_ty),
                    " does not support infinite values. Consider using `max_value` for comparisons"
                ));
            }
            /// Always panics, because infinite values are not supported by this type.
            ///
            /// Consider using [`min_value`](num_traits::Float::min_value) as appropriate instead
            ///
            /// # Panics
            ///
            /// Unconditionally; the message names the type and suggests `min_value`.
            #[inline]
            fn neg_infinity() -> Self {
                panic!(concat!(
                    stringify!($fast_ty),
                    " does not support infinite values. Consider using `min_value` for comparisons"
                ));
            }
            /// Returns the negation of the type's `ZERO` constant.
            // NOTE(review): if signed zeros are treated as insignificant by the enabled
            // fast-math flags, the sign of this value may not be preserved — confirm.
            #[inline]
            fn neg_zero() -> Self {
                -Self::ZERO
            }
            #[inline]
            fn min_value() -> Self {
                $fast_ty::MIN
@@ -249,25 +210,8 @@ macro_rules! impl_num_traits {
                <$fast_ty>::new($base_ty::EPSILON)
            }
            /// Always returns `false` without inspecting the value, since NaN values are not
            /// supported by this type.
            #[inline]
            fn is_nan(self) -> bool {
                false
            }
            /// Always returns `false` without inspecting the value, since infinite values are
            /// not supported by this type.
            #[inline]
            fn is_infinite(self) -> bool {
                false
            }
            /// Always returns `true` without inspecting the value, since infinite and NaN
            /// values are not supported by this type.
            #[inline]
            fn is_finite(self) -> bool {
                true
            }
            forward_self! {
                $fast_ty, $base_ty
                fn is_normal(self) -> bool;
                fn classify(self) -> FpCategory;
                fn floor(self) -> Self;
                fn ceil(self) -> Self;
                fn round(self) -> Self;
@@ -317,11 +261,6 @@ macro_rules! impl_num_traits {
                #[allow(deprecated)]
                fn abs_sub(self, other: Self) -> Self;
            }
            /// Freezes the value to its raw primitive and delegates to that primitive's
            /// `num_traits::Float::integer_decode`, returning its result unchanged.
            #[inline]
            fn integer_decode(self) -> (u64, i16, i8) {
                <$base_ty as num_traits::Float>::integer_decode(self.freeze_raw())
            }
        }
    };
 }