From 36605ab3993c82ca5edaa39c73c135a1c345bff6 Mon Sep 17 00:00:00 2001 From: Renar Narubin Date: Mon, 20 Dec 2021 21:47:30 -0800 Subject: [PATCH] Improve readme, ditch features, don't impl Float --- Cargo.toml | 16 -------- README.md | 84 +++++++++++++++++++++++++++++---------- benches/math.rs | 38 ------------------ benches/operations.rs | 18 ++++----- build.rs | 15 +------ src/lib.rs | 91 +++++++++++++++++++------------------------ src/math/mod.rs | 17 +++++--- src/num_traits.rs | 65 +------------------------------ 8 files changed, 126 insertions(+), 218 deletions(-) delete mode 100644 benches/math.rs diff --git a/Cargo.toml b/Cargo.toml index cf2d9ed..37b4422 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,25 +13,9 @@ bench = false name = "operations" harness = false -[[bench]] -name = "math" -harness = false - [features] default = ["num-traits"] -# disable-able fast-math features -no-finite-math-only = [] -no-associative-math = [] -no-reciprocal-math = [] -signed-zeros = [] -trapping-math = [] -fp-contract-on = [] -no-approx-func = [] -math-errno = [] - -# TODO denormal-fp-math? can have cpu-wide consequences - # optional trait implementations nalgebra-v021 = ["num-traits", "nalgebra_v021", "simba_v01", "approx_v03"] nalgebra-v029 = ["num-traits", "nalgebra_v029", "simba_v06", "approx_v05"] diff --git a/README.md b/README.md index 8fd7ad9..32f38a5 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Fast Floating-Point Math -`fast_fp` provides a set of primitive types that support [fast-math] +`fast_fp` provides a set of primitive types that support [fast-math] compiler optimizations for many operations. These optimizations allow the compiler to potentially generate faster code by relaxing some of the requirements of [IEEE 754] floating-point arithmetic. @@ -12,29 +12,71 @@ is lost in the overall computation. 
Note that there are also cases where fast-math optimizations can _improve_ precision, such as contracting separate multiplication and addition into a fused multiply-add operation. -## Limitations +## Caveats -In order to enable these optimizations safely, certain requirements must be -observed: +### Performance +Use of this crate's primitives may not be faster than the standard primitives +in all cases. That may be because the generated code is slower in practice, or +because of certain measures taken by this crate to prevent Undefined Behavior +(in particular for comparison-heavy code). Users should carefully measure and +benchmark their code to understand whether they actually benefit from use of +these types. -- Operations **MUST NOT** involve infinite or NaN values. If the arguments to an - operation are, or the results of an operation _would_ be, `+inf`, `-inf`, - or `NaN`, then the operation's result value is unspecified. This crate goes - to lengths to ensure that such an operation is not Undefined Behavior in the - strict sense, but the output is free to be any representable value of the - output type, and may not be a fixed value at all. -- Use of this crate's primitives may not be faster than the standard primitives - in all cases. That may be because the generated code is slower in practice, - or because of certain measures taken by this crate to prevent UB (in - particular for comparison heavy code). Users should carefully measure and - benchmark their code to understand whether they actually benefit from use of - these types. -- The safety of this crate is only assessed against rustc's LLVM code - generation. This crate should not be used with alternative code generators - such as cranelift or GCC -- Signed-ness of zeros may be treated as insignificant and not preserved +### Finite Math +By default, the `finite-math-only` optimization flag is enabled. 
With this +enabled, the user must ensure that operations on the fast types **do not** +involve infinite or NaN values. If the arguments to an operation are, or the +results of an operation _would_ be, `+inf`, `-inf`, or `NaN`, then the +operation's result value is unspecified. This crate goes to lengths to ensure +that such an operation is not UB in the strict sense, but the output is free to +be any representable value of the output type, and may not be a fixed value at +all. -[TODO]: # (is there a way to detect the code generator at build time?) +### Building +`fast_fp` enables fast-math optimizations by calling C code which was compiled +with these optimizations enabled; additionally, some LLVM IR is used to prevent +triggering UB that is otherwise possible with these optimizations. As a +consequence, building this crate requires `clang` to be installed _and_ +requires the final binary to be linked using cross-language LTO to achieve the +performance benefits. + +This LTO requires a version of clang compatible with the LLVM version used by +rustc. 
To find the necessary LLVM version, check rustc's version info in +verbose mode: + +```shell +$ rustc -vV +rustc 1.56.0 (09c42c458 2021-10-18) +binary: rustc +commit-hash: 09c42c45858d5f3aedfa670698275303a3d19afa +commit-date: 2021-10-18 +host: x86_64-unknown-linux-gnu +release: 1.56.0 +LLVM version: 13.0.0 # <--- see the version here +``` + +Then build and link using a `clang` and `lld` with the corresponding version: + +```shell +$ CC="clang-13" \ +RUSTFLAGS="-Clinker-plugin-lto -Clinker=clang-13 -Clink-arg=-fuse-ld=lld-13" \ +cargo build +``` + +For simplicity, these arguments can be stored in a [cargo config] file: + +```toml +[env] +CC = "clang-13" + +[build] +rustflags = ["-Clinker-plugin-lto", "-Clinker=clang-13", "-Clink-arg=-fuse-ld=lld-13"] +``` + +Although rustc does not always use an official LLVM release version, it's +typically close enough to be interoperable with the official clang and LLVM +releases of the same version number. [fast-math]: https://llvm.org/docs/LangRef.html#fast-math-flags [IEEE 754]: https://en.wikipedia.org/wiki/IEEE_754 +[cargo config]: https://doc.rust-lang.org/cargo/reference/config.html diff --git a/benches/math.rs b/benches/math.rs deleted file mode 100644 index 9dbc7a1..0000000 --- a/benches/math.rs +++ /dev/null @@ -1,38 +0,0 @@ -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -use fast_fp::{ff32, FF32}; -use rand::{distributions::Standard, thread_rng, Rng}; - -fn min(c: &mut Criterion) { - let mut group = c.benchmark_group("min"); - for count in [2, 8, 32, 1024] { - group.throughput(Throughput::Elements(count as u64)); - - let f32_vals = thread_rng() - .sample_iter(Standard) - .take(count) - .collect::<Vec<_>>(); - - // use the same values for both benchmarks - let ff32_vals = f32_vals - .clone() - .into_iter() - .map(ff32) - .collect::<Vec<_>>(); - - group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| { - b.iter(|| vals.iter().copied().fold(f32::MAX, |acc, val| 
acc.min(val))); - }); - - group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| { - b.iter(|| { - vals.iter() - .copied() - .fold(FF32::MAX, |acc, val| acc.min(val)) - }); - }); - } - group.finish(); -} - -criterion_group!(benches, min); -criterion_main!(benches); diff --git a/benches/operations.rs b/benches/operations.rs index 32b6b92..3002682 100644 --- a/benches/operations.rs +++ b/benches/operations.rs @@ -3,12 +3,12 @@ use criterion::{ BenchmarkId, Criterion, Throughput, }; use fast_fp::{ff32, ff64, FF32, FF64}; +use ops::{Add, Div, Mul}; use rand::{ distributions::{self, Distribution}, rngs::StdRng, Rng, SeedableRng, }; -use std::ops::{Add, Div, Mul}; fn add(c: &mut Criterion) { let mut group = c.benchmark_group("add"); @@ -18,9 +18,9 @@ fn add(c: &mut Criterion) { let f64s = distributions::Uniform::<f64>::new(0.0, 1.0); // clone the rng for each benched type to keep the generated values identical - fold(&mut group, "std::f32", f32::add, 0.0, rng.clone(), f32s); + fold(&mut group, "f32", f32::add, 0.0, rng.clone(), f32s); fold(&mut group, "FF32", FF32::add, ff32(0.0), rng.clone(), f32s); - fold(&mut group, "std::f64", f64::add, 0.0, rng.clone(), f64s); + fold(&mut group, "f64", f64::add, 0.0, rng.clone(), f64s); fold(&mut group, "FF64", FF64::add, ff64(0.0), rng.clone(), f64s); } @@ -34,9 +34,9 @@ fn mul(c: &mut Criterion) { let f64s = distributions::Uniform::<f64>::new(0.9, 1.1); // clone the rng for each benched type to keep the generated values identical - fold(&mut group, "std::f32", f32::mul, 0.0, rng.clone(), f32s); + fold(&mut group, "f32", f32::mul, 0.0, rng.clone(), f32s); fold(&mut group, "FF32", FF32::mul, ff32(0.0), rng.clone(), f32s); - fold(&mut group, "std::f64", f64::mul, 0.0, rng.clone(), f64s); + fold(&mut group, "f64", f64::mul, 0.0, rng.clone(), f64s); fold(&mut group, "FF64", FF64::mul, ff64(0.0), rng.clone(), f64s); } @@ -50,9 +50,9 @@ fn div(c: &mut Criterion) { let f64s = distributions::Uniform::<f64>::new(0.9, 1.1); // clone 
the rng for each benched type to keep the generated values identical - fold(&mut group, "std::f32", f32::div, 0.0, rng.clone(), f32s); + fold(&mut group, "f32", f32::div, 0.0, rng.clone(), f32s); fold(&mut group, "FF32", FF32::div, ff32(0.0), rng.clone(), f32s); - fold(&mut group, "std::f64", f64::div, 0.0, rng.clone(), f64s); + fold(&mut group, "f64", f64::div, 0.0, rng.clone(), f64s); fold(&mut group, "FF64", FF64::div, ff64(0.0), rng.clone(), f64s); } @@ -64,9 +64,9 @@ fn min(c: &mut Criterion) { let f64s = distributions::Uniform::<f64>::new(0.0, 1.0); // clone the rng for each benched type to keep the generated values identical - fold(&mut group, "std::f32", f32::min, 0.0, rng.clone(), f32s); + fold(&mut group, "f32", f32::min, 0.0, rng.clone(), f32s); fold(&mut group, "FF32", FF32::min, ff32(0.0), rng.clone(), f32s); - fold(&mut group, "std::f64", f64::min, 0.0, rng.clone(), f64s); + fold(&mut group, "f64", f64::min, 0.0, rng.clone(), f64s); fold(&mut group, "FF64", FF64::min, ff64(0.0), rng.clone(), f64s); } diff --git a/build.rs b/build.rs index c2fb8c3..532c48b 100644 --- a/build.rs +++ b/build.rs @@ -24,34 +24,21 @@ fn build_ll(mut builder: cc::Build) { fn build_c(mut builder: cc::Build) { builder.opt_level(3); - #[cfg(not(feature = "no-associative-math"))] + // TODO control flags with generics builder.flag("-fassociative-math"); - - #[cfg(not(feature = "no-reciprocal-math"))] builder.flag("-freciprocal-math"); - - #[cfg(not(feature = "signed-zeros"))] builder.flag("-fno-signed-zeros"); - - #[cfg(not(feature = "trapping-math"))] builder.flag("-fno-trapping-math"); - - #[cfg(not(feature = "fp-contract-on"))] builder.flag("-ffp-contract=fast"); - // -fapprox-func isn't currently available in the driver, but it is in clang itself // https://reviews.llvm.org/D106191 - #[cfg(not(feature = "no-approx-func"))] builder.flag("-Xclang").flag("-fapprox-func"); - - #[cfg(not(feature = "math-errno"))] builder.flag("-fno-math-errno"); // poison_unsafe must be compiled 
without finite-math-only // see its docs for details poison_unsafe(builder.clone()); - #[cfg(not(feature = "no-finite-math-only"))] builder.flag("-ffinite-math-only"); poison_safe(builder); diff --git a/src/lib.rs b/src/lib.rs index 8737bc0..c08a582 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -45,26 +45,37 @@ mod num_traits; mod poison; use poison::MaybePoison; -/// The error returned by the checked constructors of [`FF32`] and [`FF64`] -#[derive(Clone, Debug, PartialEq)] -pub struct InvalidValueError { - _priv: (), -} - -impl fmt::Display for InvalidValueError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_str("value may not be infinite or NaN") - } -} - -impl std::error::Error for InvalidValueError {} - // The big challenge with fast-math in general is avoiding UB, and to a lesser extent unspecified // values. LLVM's fast operations document "poison" behavior when given invalid inputs; poison // values have a relatively consistent behavior (stuff like transitivity), defined cases for UB, // and importantly can be limited in scope by freezing to a fixed value. // -// FIXME more docs +// This library manages these poison values to prevent UB. On the rust side, potentially-poison +// values are stored in a `MaybePoison` type, similar to the std's `MaybeUninit`. This helps ensure +// that the values would not trigger UB based on rust's semantics (for example, avoiding questions +// of whether all bit patterns of a primitive are valid). On the C side, operations are split into +// two groups: poison "safe" and poison "unsafe". Poison safe operations are ones which can accept +// any input value without triggering any UB. The operation may produce a poison value, for example +// `1.f / 0.f` with finite-math-only enabled, but not UB. Poison unsafe operations are ones which +// could trigger UB for some input value(s). 
These two definitions follow LLVM's documentation on +// poison, which explains that poison can be relaxed to any value for a type, including `undef`. +// Therefore, if poison is passed to an operation it could be relaxed to any value; if some value +// could trigger UB, then so can poison. +// +// Poison safe operations are called with input values normally. They don't produce UB, so they're +// safe to call no matter the input. The operation is assumed to potentially produce poison itself, +// so the output is always wrapped in a `MaybePoison`. +// +// Poison unsafe operations must take certain precautions. First, any input arguments that are +// `MaybePoison` are frozen using LLVM's `freeze` instruction. This produces a result with an +// unspecified, but fixed, value which now won't be relaxed any further. Additionally, these +// operations are compiled without any flags that potentially introduce poison, regardless of +// enabled crate features. This ensures that the operation internally should not produce any poison +// regardless of input value. These two steps together preclude any poison values, which should +// prevent UB (assuming the operation was safe to call in the first place). +// +// All operations in rust are considered poison unsafe, and therefore must always freeze the value +// before using it. 
Freezing produces a regular f32/f64 // // Prior art and references // @@ -87,6 +98,20 @@ impl std::error::Error for InvalidValueError {} // https://github.com/rust-lang/unsafe-code-guidelines/issues/71 // notes on the validity of primitive bit patterns +/// The error returned by the checked constructors of [`FF32`] and [`FF64`] +#[derive(Clone, Debug, PartialEq)] +pub struct InvalidValueError { + _priv: (), +} + +impl fmt::Display for InvalidValueError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("value may not be infinite or NaN") + } +} + +impl std::error::Error for InvalidValueError {} + /// A wrapper over `f32` which enables some fast-math optimizations. // TODO how best to document unspecified values, including witnessing possibly varying values #[derive(Clone, Copy)] @@ -424,42 +449,6 @@ macro_rules! impls { } } - // FIXME feature conditional Eq/Ord - impl Eq for $fast_ty {} - - impl Ord for $fast_ty { - #[inline(always)] - fn cmp(&self, other: &$fast_ty) -> cmp::Ordering { - let this = self.freeze_raw(); - let that = other.freeze_raw(); - - // Note NaNs are not supported (and would break everything else anyway) so we ignore them - // and implement full Ord - if this < that { - cmp::Ordering::Less - } else if this > that { - cmp::Ordering::Greater - } else { - cmp::Ordering::Equal - } - } - - #[inline] - fn min(self, other: $fast_ty) -> $fast_ty { - <$fast_ty>::min(self, other) - } - - #[inline] - fn max(self, other: $fast_ty) -> $fast_ty { - <$fast_ty>::max(self, other) - } - - #[inline] - fn clamp(self, min: $fast_ty, max: $fast_ty) -> $fast_ty { - <$fast_ty>::clamp(self, min, max) - } - } - impl From<$fast_ty> for $base_ty { #[inline(always)] fn from(from: $fast_ty) -> Self { diff --git a/src/math/mod.rs b/src/math/mod.rs index e044059..47c40fb 100644 --- a/src/math/mod.rs +++ b/src/math/mod.rs @@ -80,14 +80,16 @@ macro_rules! 
poison_safe_fns { // primitive type, we can pass them directly over FFI fn [<$fn _ $base_ty>](a: $fast_ty $(, $arg: $fast_ty)*) -> $fast_ty; } + )* - impl $fast_ty { + impl $fast_ty { + $( #[inline] pub fn $fn(self $(, $arg: Self)*) -> Self { unsafe { [<$fn _ $base_ty>](self $(, $arg)*) } } - } - )* + )* + } } } } @@ -104,13 +106,16 @@ macro_rules! poison_unsafe_fns { fn [<$fn _ $base_ty>](a: $base_ty $(, $arg: $base_ty)*) -> $fast_ty; } - impl $fast_ty { + )* + + impl $fast_ty { + $( #[inline] pub fn $fn(self $(, $arg: Self)*) -> Self { unsafe { [<$fn _ $base_ty>](self.freeze_raw() $(, $arg.freeze_raw())*) } } - } - )* + )* + } } } } diff --git a/src/num_traits.rs b/src/num_traits.rs index 5cfa6c8..2c0081e 100644 --- a/src/num_traits.rs +++ b/src/num_traits.rs @@ -1,7 +1,7 @@ #![cfg(feature = "num-traits")] #![cfg_attr(docsrs, doc(cfg(feature = "num-traits")))] + use crate::{FF32, FF64}; -use core::num::FpCategory; macro_rules! forward_freeze_ty { ($fast_ty:ident, $base_ty:ident @@ -189,46 +189,7 @@ macro_rules! impl_num_traits { } } - /// Because inf and nan are prohibited, the `fast_fp` types correspond more to the `Real` - /// trait than the `Float` trait. However in practice some libs require a Float bound when - /// they could really use a Real, which would restrict using the `fast_fp` types. - impl num_traits::Float for $fast_ty { - /// Panics because NaN values are not supported - #[inline] - fn nan() -> Self { - panic!(concat!( - stringify!($fast_ty), - " does not support NaN values" - )); - } - - /// Panics because infinite values are not supported - /// - /// Consider using [`max_value`](num_traits::Float::max_value) as appropriate instead - #[inline] - fn infinity() -> Self { - panic!(concat!( - stringify!($fast_ty), - " does not support infinite values. 
Consider using `max_value` for comparisons" - )); - } - - /// Panics because infinite values are not supported - /// - /// Consider using [`min_value`](num_traits::Float::min_value) as appropriate instead - #[inline] - fn neg_infinity() -> Self { - panic!(concat!( - stringify!($fast_ty), - " does not support infinite values. Consider using `min_value` for comparisons" - )); - } - - #[inline] - fn neg_zero() -> Self { - -Self::ZERO - } - + impl num_traits::real::Real for $fast_ty { #[inline] fn min_value() -> Self { $fast_ty::MIN @@ -249,25 +210,8 @@ macro_rules! impl_num_traits { <$fast_ty>::new($base_ty::EPSILON) } - #[inline] - fn is_nan(self) -> bool { - false - } - - #[inline] - fn is_infinite(self) -> bool { - false - } - - #[inline] - fn is_finite(self) -> bool { - true - } - forward_self! { $fast_ty, $base_ty - fn is_normal(self) -> bool; - fn classify(self) -> FpCategory; fn floor(self) -> Self; fn ceil(self) -> Self; fn round(self) -> Self; @@ -317,11 +261,6 @@ macro_rules! impl_num_traits { #[allow(deprecated)] fn abs_sub(self, other: Self) -> Self; } - - #[inline] - fn integer_decode(self) -> (u64, i16, i8) { - <$base_ty as num_traits::Float>::integer_decode(self.freeze_raw()) - } } }; }