diff --git a/Cargo.toml b/Cargo.toml index 2132a06..61e1e5a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,10 +6,17 @@ edition = "2018" readme = "README.md" license = "MIT OR Apache-2.0" +[lib] +bench = false + [[bench]] name = "operations" harness = false +[[bench]] +name = "math" +harness = false + [features] default = ["num-traits"] @@ -17,6 +24,8 @@ nalgebra-v021 = ["num-traits", "nalgebra_v021", "simba_v01", "approx_v03"] nalgebra-v029 = ["num-traits", "nalgebra_v029", "simba_v06", "approx_v05"] [dependencies] +paste = "1" + num-traits = { version = "0.2", optional = true } approx_v03 = { package = "approx", version = "0.3", optional = true } @@ -27,6 +36,9 @@ approx_v05 = { package = "approx", version = "0.5", optional = true } nalgebra_v029 = { package = "nalgebra", version = "0.29", optional = true } simba_v06 = { package = "simba", version = "0.6", optional = true } +[build-dependencies] +cc = "1" + [dev-dependencies] criterion = { version = "0.3", features = ["html_reports"] } rand = "0.8" @@ -34,3 +46,7 @@ rand = "0.8" [profile.test] # run tests at high optimization to exercise typical codegen opt-level = 3 + +[profile.release] +lto = "fat" +codegen-units = 1 diff --git a/benches/math.rs b/benches/math.rs new file mode 100644 index 0000000..9dbc7a1 --- /dev/null +++ b/benches/math.rs @@ -0,0 +1,38 @@ +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use fast_fp::{ff32, FF32}; +use rand::{distributions::Standard, thread_rng, Rng}; + +fn min(c: &mut Criterion) { + let mut group = c.benchmark_group("min"); + for count in [2, 8, 32, 1024] { + group.throughput(Throughput::Elements(count as u64)); + + let f32_vals = thread_rng() + .sample_iter(Standard) + .take(count) + .collect::>(); + + // use the same values for both benchmarks + let ff32_vals = f32_vals + .clone() + .into_iter() + .map(ff32) + .collect::>(); + + group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| { + b.iter(|| vals.iter().copied().fold(f32::MAX, |acc, val| acc.min(val))); + }); + + group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| { + b.iter(|| { + vals.iter() + .copied() + .fold(FF32::MAX, |acc, val| acc.min(val)) + }); + }); + } + group.finish(); +} + +criterion_group!(benches, min); +criterion_main!(benches); diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..7ac7c0f --- /dev/null +++ b/build.rs @@ -0,0 +1,20 @@ +fn main() { + let mut builder = cc::Build::new(); + + if !builder.get_compiler().is_like_clang() { + // if the default/configured cc is not clang, try to call clang manually + builder.compiler("clang"); + } + + builder + .file("src/math/math.c") + .flag("-O3") + .flag("-flto=thin") + .flag("-ffinite-math-only") + .flag("-fassociative-math") + .flag("-freciprocal-math") + .flag("-fno-signed-zeros") + .flag("-fno-trapping-math") + .flag("-ffp-contract=fast") + .compile("math") +} diff --git a/src/lib.rs b/src/lib.rs index 8e8be61..e886e9a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,8 @@ #![feature(core_intrinsics)] // intrinsics for the fast math #![feature(asm)] // asm used to emulate freeze #![feature(doc_cfg)] +#![feature(link_llvm_intrinsics)] + use core::{ cmp, fmt, intrinsics::{fadd_fast, fdiv_fast, fmul_fast, frem_fast, fsub_fast}, @@ -40,6 +42,7 @@ macro_rules! forward_freeze_self { }; } +mod math; mod nalgebra; mod num_traits; @@ -347,7 +350,6 @@ macro_rules! impls { // TODO migrate these to native implementations to freeze less and fast-math more forward_freeze_self! { $fast_ty, $base_ty - pub fn abs(self) -> Self; pub fn acos(self) -> Self; pub fn acosh(self) -> Self; pub fn asin(self) -> Self; @@ -358,7 +360,6 @@ macro_rules! impls { pub fn cbrt(self) -> Self; pub fn ceil(self) -> Self; pub fn clamp(self, min: Self, max: Self) -> Self; - pub fn copysign(self, sign: Self) -> Self; pub fn cos(self) -> Self; pub fn cosh(self) -> Self; pub fn div_euclid(self, rhs: Self) -> Self; @@ -367,20 +368,17 @@ macro_rules! impls { pub fn exp_m1(self) -> Self; pub fn floor(self) -> Self; pub fn fract(self) -> Self; - pub fn hypot(self, other: Self) -> Self; pub fn ln(self) -> Self; pub fn ln_1p(self) -> Self; pub fn log(self, base: Self) -> Self; pub fn log10(self) -> Self; pub fn log2(self) -> Self; - pub fn max(self, other: Self) -> Self; - pub fn min(self, other: Self) -> Self; + //pub fn max(self, other: Self) -> Self; + //pub fn min(self, other: Self) -> Self; pub fn mul_add(self, a: Self, b: Self) -> Self; pub fn powf(self, n: Self) -> Self; - pub fn recip(self) -> Self; pub fn rem_euclid(self, rhs: Self) -> Self; pub fn round(self) -> Self; - pub fn signum(self) -> Self; pub fn sin(self) -> Self; pub fn sinh(self) -> Self; pub fn sqrt(self) -> Self; diff --git a/src/math/math.c b/src/math/math.c new file mode 100644 index 0000000..8eec462 --- /dev/null +++ b/src/math/math.c @@ -0,0 +1,76 @@ +#include +#include + +#define IMPL_OPERATIONS(C_TYPE, RUST_TYPE) \ + /* TODO figure out why these don't inline */ \ + __attribute__((always_inline)) \ + C_TYPE add_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \ + return a + b; \ + } \ + \ + __attribute__((always_inline)) \ + C_TYPE sub_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \ + return a - b; \ + } \ + \ + __attribute__((always_inline)) \ + C_TYPE mul_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \ + return a * b; \ + } \ + \ + __attribute__((always_inline)) \ + C_TYPE div_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \ + return a / b; \ + } \ + \ + __attribute__((always_inline)) \ + bool eq_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \ + return a == b; \ + } \ + \ + __attribute__((always_inline)) \ + bool lt_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \ + return a < b; \ + } \ + \ + __attribute__((always_inline)) \ + bool le_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \ + return a <= b; \ + } \ + \ + __attribute__((always_inline)) \ + bool gt_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \ + return a > b; \ + } \ + \ + __attribute__((always_inline)) \ + bool ge_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \ + return a >= b; \ + } \ + +#define IMPL_UNARY_FUNCTION(C_TYPE, RUST_TYPE, FN_NAME, FN_IMPL) \ + __attribute__((always_inline)) \ + C_TYPE FN_NAME ## _ ## RUST_TYPE(C_TYPE a) { \ + return FN_IMPL(a); \ + } \ + +#define IMPL_BINARY_FUNCTION(C_TYPE, RUST_TYPE, FN_NAME, FN_IMPL) \ + __attribute__((always_inline)) \ + C_TYPE FN_NAME ## _ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \ + return FN_IMPL(a, b); \ + } \ + +IMPL_OPERATIONS(float, f32) +IMPL_OPERATIONS(double, f64) + +IMPL_UNARY_FUNCTION(float, f32, sqrt, sqrtf) +IMPL_UNARY_FUNCTION(double, f64, sqrt, sqrt) + +IMPL_BINARY_FUNCTION(float, f32, rem, fmodf) +IMPL_BINARY_FUNCTION(double, f64, rem, fmod) + +IMPL_BINARY_FUNCTION(float, f32, max, fmaxf) +IMPL_BINARY_FUNCTION(double, f64, max, fmax) + +IMPL_BINARY_FUNCTION(float, f32, min, fminf) +IMPL_BINARY_FUNCTION(double, f64, min, fmin) diff --git a/src/math/mod.rs b/src/math/mod.rs new file mode 100644 index 0000000..93313cb --- /dev/null +++ b/src/math/mod.rs @@ -0,0 +1,105 @@ +use crate::{poison::MaybePoison, FF32, FF64}; +use paste::paste; + +impl FF32 { + const SIGN_BIT: u32 = 0x8000_0000; + const UNSIGNED_MASK: u32 = 0x7fff_ffff; +} + +impl FF64 { + const SIGN_BIT: u64 = 0x8000_0000_0000_0000; + const UNSIGNED_MASK: u64 = 0x7fff_ffff_ffff_ffff; +} + +macro_rules! impl_generic_math { + ($fast_ty:ident, $base_ty:ident, $base_int:ident) => { + impl $fast_ty { + #[inline] + fn to_bits(self) -> MaybePoison<$base_int> { + // Safety: + // + // - `to_bits` should be valid for any input bits + // - poison propagation is controlled with MaybePoison + MaybePoison::new(unsafe { <$base_ty>::to_bits(*self.0.maybe_poison().as_ptr()) }) + } + + #[inline] + fn from_bits(bits: MaybePoison<$base_int>) -> Self { + // Safety: + // + // - `from_bits` should be valid for any input bits + // - poison propagation is controlled with MaybePoison + Self(MaybePoison::new(unsafe { + <$base_ty>::from_bits(*bits.maybe_poison().as_ptr()) + })) + } + + #[inline] + pub fn abs(self) -> Self { + let bits = self.to_bits(); + <$fast_ty>::from_bits(MaybePoison::new(unsafe { + *bits.maybe_poison().as_ptr() & Self::UNSIGNED_MASK + })) + } + + #[inline] + pub fn copysign(self, other: Self) -> Self { + let this = self.to_bits(); + let that = other.to_bits(); + + // Safety: + // + // - & of poison is safe because & does not produce UB for any input values + // - poison propagation is handled by wrapping in maybe poison + <$fast_ty>::from_bits(MaybePoison::new(unsafe { + (*this.maybe_poison().as_ptr() & Self::UNSIGNED_MASK) + | (*that.maybe_poison().as_ptr() & Self::SIGN_BIT) + })) + } + + #[inline] + pub fn hypot(self, other: Self) -> Self { + (self * self + other * other).sqrt() + } + + #[inline] + pub fn signum(self) -> Self { + Self::ONE.copysign(self) + } + + #[inline] + pub fn recip(self) -> Self { + Self::ONE / self + } + } + }; +} + +macro_rules! impl_extern_math { + ($fast_ty:ident, $base_ty:ident) => { + paste! { + extern "C" { + fn [](a: $fast_ty, b: $fast_ty) -> $fast_ty; + fn [](a: $fast_ty, b: $fast_ty) -> $fast_ty; + } + + impl $fast_ty { + #[inline] + pub fn max(self, other: Self) -> Self { + unsafe { [](self, other) } + } + + #[inline] + pub fn min(self, other: Self) -> Self { + unsafe { [](self, other) } + } + } + } + }; +} + +impl_generic_math! { FF32, f32, u32 } +impl_generic_math! { FF64, f64, u64 } + +impl_extern_math! { FF32, f32 } +impl_extern_math! { FF64, f64 }