diff --git a/Cargo.toml b/Cargo.toml index 61e1e5a..927888b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "fast_fp" version = "0.1.0" authors = ["Renar Narubin "] -edition = "2018" +edition = "2021" readme = "README.md" license = "MIT OR Apache-2.0" @@ -18,8 +18,30 @@ name = "math" harness = false [features] -default = ["num-traits"] +default = [ + "num-traits", + "finite-math-only", + "associative-math", + "reciprocal-math", + "no-signed-zeros", + "no-trapping-math", + "fp-contract-fast", + "approx-func", +] +# default fast-math features +finite-math-only = [] +associative-math = [] +reciprocal-math = [] +no-signed-zeros = [] +no-trapping-math = [] +fp-contract-fast = [] +approx-func = [] + +# non-default fast-math-like features +denormal-fp-math-preserve-sign = [] + +# optional trait implementations nalgebra-v021 = ["num-traits", "nalgebra_v021", "simba_v01", "approx_v03"] nalgebra-v029 = ["num-traits", "nalgebra_v029", "simba_v06", "approx_v05"] @@ -48,5 +70,5 @@ rand = "0.8" opt-level = 3 [profile.release] -lto = "fat" -codegen-units = 1 +lto="thin" +codegen-units=1 diff --git a/benches/operations.rs b/benches/operations.rs index 34dd228..32b6b92 100644 --- a/benches/operations.rs +++ b/benches/operations.rs @@ -1,54 +1,122 @@ -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use criterion::{ + criterion_group, criterion_main, measurement::Measurement, BatchSize, BenchmarkGroup, + BenchmarkId, Criterion, Throughput, +}; use fast_fp::{ff32, ff64, FF32, FF64}; -use rand::{distributions::Standard, thread_rng, Rng}; +use rand::{ + distributions::{self, Distribution}, + rngs::StdRng, + Rng, SeedableRng, +}; +use std::ops::{Add, Div, Mul}; -fn sum(c: &mut Criterion) { - let mut group = c.benchmark_group("sum"); - for count in [2, 4, 8, 16, 64, 1024, 1 << 15] { - group.throughput(Throughput::Elements(count as u64)); +fn add(c: &mut Criterion) { + let mut group = c.benchmark_group("add"); - let f32_vals = thread_rng() - .sample_iter(Standard) - .take(count) - .collect::>(); + let rng = StdRng::from_entropy(); + let f32s = distributions::Uniform::::new(0.0, 1.0); + let f64s = distributions::Uniform::::new(0.0, 1.0); - // use the same values for both benchmarks - let ff32_vals = f32_vals - .clone() - .into_iter() - .map(ff32) - .collect::>(); - - group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| { - b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val)); - }); - - group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| { - b.iter(|| vals.iter().copied().fold(ff32(0.0), |acc, val| acc + val)); - }); - - let f64_vals = thread_rng() - .sample_iter(Standard) - .take(count) - .collect::>(); - - // use the same values for both benchmarks - let ff64_vals = f64_vals - .clone() - .into_iter() - .map(ff64) - .collect::>(); - - group.bench_with_input(BenchmarkId::new("std::f64", count), &f64_vals, |b, vals| { - b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val)); - }); - - group.bench_with_input(BenchmarkId::new("FF64", count), &ff64_vals, |b, vals| { - b.iter(|| vals.iter().copied().fold(ff64(0.0), |acc, val| acc + val)); - }); - } - group.finish(); + // clone the rng for each benched type to keep the generated values identical + fold(&mut group, "std::f32", f32::add, 0.0, rng.clone(), f32s); + fold(&mut group, "FF32", FF32::add, ff32(0.0), rng.clone(), f32s); + fold(&mut group, "std::f64", f64::add, 0.0, rng.clone(), f64s); + fold(&mut group, "FF64", FF64::add, ff64(0.0), rng.clone(), f64s); } -criterion_group!(benches, sum); +fn mul(c: &mut Criterion) { + let mut group = c.benchmark_group("mul"); + + let rng = StdRng::from_entropy(); + + // try to avoid subnormals/explosions by limiting the values near 1 + let f32s = distributions::Uniform::::new(0.9, 1.1); + let f64s = distributions::Uniform::::new(0.9, 1.1); + + // clone the rng for each benched type to keep the generated values identical + fold(&mut group, "std::f32", f32::mul, 0.0, rng.clone(), f32s); + fold(&mut group, "FF32", FF32::mul, ff32(0.0), rng.clone(), f32s); + fold(&mut group, "std::f64", f64::mul, 0.0, rng.clone(), f64s); + fold(&mut group, "FF64", FF64::mul, ff64(0.0), rng.clone(), f64s); +} + +fn div(c: &mut Criterion) { + let mut group = c.benchmark_group("div"); + + let rng = StdRng::from_entropy(); + + // try to avoid subnormals/explosions by limiting the values near 1 + let f32s = distributions::Uniform::::new(0.9, 1.1); + let f64s = distributions::Uniform::::new(0.9, 1.1); + + // clone the rng for each benched type to keep the generated values identical + fold(&mut group, "std::f32", f32::div, 0.0, rng.clone(), f32s); + fold(&mut group, "FF32", FF32::div, ff32(0.0), rng.clone(), f32s); + fold(&mut group, "std::f64", f64::div, 0.0, rng.clone(), f64s); + fold(&mut group, "FF64", FF64::div, ff64(0.0), rng.clone(), f64s); +} + +fn min(c: &mut Criterion) { + let mut group = c.benchmark_group("min"); + + let rng = StdRng::from_entropy(); + let f32s = distributions::Uniform::::new(0.0, 1.0); + let f64s = distributions::Uniform::::new(0.0, 1.0); + + // clone the rng for each benched type to keep the generated values identical + fold(&mut group, "std::f32", f32::min, 0.0, rng.clone(), f32s); + fold(&mut group, "FF32", FF32::min, ff32(0.0), rng.clone(), f32s); + fold(&mut group, "std::f64", f64::min, 0.0, rng.clone(), f64s); + fold(&mut group, "FF64", FF64::min, ff64(0.0), rng.clone(), f64s); +} + +fn fold( + group: &mut BenchmarkGroup<'_, impl Measurement>, + id: &str, + op: impl Fn(T, T) -> T + Copy, + init: T, + mut rng: impl Rng, + vals: impl Distribution + Copy, +) where + T: From + Copy, +{ + fold_count([init; 1], group, id, op, init, &mut rng, vals); + fold_count([init; 2], group, id, op, init, &mut rng, vals); + fold_count([init; 4], group, id, op, init, &mut rng, vals); + fold_count([init; 8], group, id, op, init, &mut rng, vals); + fold_count([init; 64], group, id, op, init, &mut rng, vals); + fold_count([init; 256], group, id, op, init, &mut rng, vals); + fold_count([init; 1024], group, id, op, init, &mut rng, vals); +} + +fn fold_count( + arr: [T; N], + group: &mut BenchmarkGroup<'_, impl Measurement>, + id: &str, + op: impl Fn(T, T) -> T + Copy, + init: T, + mut rng: impl Rng, + vals: impl Distribution + Copy, +) where + T: From + Copy, +{ + group.throughput(Throughput::Elements(N as u64)); + + group.bench_function(BenchmarkId::new(id, N), |b| { + b.iter_batched_ref( + || { + let mut inputs = arr; + inputs + .iter_mut() + .zip((&mut rng).sample_iter(&vals)) + .for_each(|(dst, val)| *dst = T::from(val)); + inputs + }, + |vals| vals.iter().copied().fold(init, op), + BatchSize::SmallInput, + ); + }); +} + +criterion_group!(benches, add, mul, div, min); criterion_main!(benches); diff --git a/build.rs b/build.rs index 343fbf3..04eee83 100644 --- a/build.rs +++ b/build.rs @@ -6,7 +6,7 @@ fn main() { builder.compiler("clang"); } - builder.flag("-O3").flag("-flto=thin"); + builder.flag("-flto=thin"); build_ll(builder.clone()); build_c(builder); @@ -21,13 +21,32 @@ fn build_ll(mut builder: cc::Build) { } fn build_c(mut builder: cc::Build) { - builder - .file("src/math/math.c") - .flag("-ffinite-math-only") - .flag("-fassociative-math") - .flag("-freciprocal-math") - .flag("-fno-signed-zeros") - .flag("-fno-trapping-math") - .flag("-ffp-contract=fast") - .compile("math") + builder.flag("-O3"); + + #[cfg(feature = "finite-math-only")] + builder.flag("-ffinite-math-only"); + + #[cfg(feature = "associative-math")] + builder.flag("-fassociative-math"); + + #[cfg(feature = "reciprocal-math")] + builder.flag("-freciprocal-math"); + + #[cfg(feature = "no-signed-zeros")] + builder.flag("-fno-signed-zeros"); + + #[cfg(feature = "no-trapping-math")] + builder.flag("-fno-trapping-math"); + + #[cfg(feature = "fp-contract-fast")] + builder.flag("-ffp-contract=fast"); + + // TODO figure out if this works + //#[cfg(feature = "approx-func")] + //builder.flag("-Xclang -fapprox-func"); + + #[cfg(feature = "denormal-fp-math-preserve-sign")] + builder.flag("-fdenormal-fp-math=preserve-sign"); + + builder.file("src/math/math.c").compile("math") } diff --git a/src/lib.rs b/src/lib.rs index 5c0dba2..cbbefff 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,12 +1,8 @@ #![doc = include_str!("../README.md")] -#![feature(core_intrinsics)] // intrinsics for the fast math -#![feature(asm)] // asm used to emulate freeze #![feature(doc_cfg)] -#![feature(link_llvm_intrinsics)] use core::{ cmp, fmt, - intrinsics::{fadd_fast, fdiv_fast, fmul_fast, frem_fast, fsub_fast}, iter::{Product, Sum}, num::FpCategory, ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Rem, RemAssign, Sub, SubAssign}, @@ -137,82 +133,6 @@ pub fn ff64(f: f64) -> FF64 { FF64::new(f) } -macro_rules! impl_binary_refs { - ($lhs:ident, $rhs:ident, $op_trait:ident, $op_fn:ident) => { - impl $op_trait<$rhs> for &$lhs { - type Output = <$lhs as $op_trait<$rhs>>::Output; - - #[inline] - fn $op_fn(self, other: $rhs) -> Self::Output { - (*self).$op_fn(other) - } - } - impl $op_trait<&$rhs> for $lhs { - type Output = <$lhs as $op_trait<$rhs>>::Output; - - #[inline] - fn $op_fn(self, other: &$rhs) -> Self::Output { - self.$op_fn(*other) - } - } - impl $op_trait<&$rhs> for &$lhs { - type Output = <$lhs as $op_trait<$rhs>>::Output; - - #[inline] - fn $op_fn(self, other: &$rhs) -> Self::Output { - (*self).$op_fn(*other) - } - } - }; -} - -macro_rules! impl_fast_ops { - ($fast_ty:ident, $base_ty: ident: $($op_trait:ident, $op_fn:ident, $op_impl:ident,)*) => { - $( - impl $op_trait <$fast_ty> for $fast_ty { - type Output = $fast_ty; - - #[inline(always)] - fn $op_fn(self, other: $fast_ty) -> Self::Output { - // Safety: - // - // - encountering poison operands is safe because LLVM's fast ops documents not producing - // UB on any inputs; it may produce poison on inf/nan (or if the sum is inf/nan), but these - // are then wrapped in the MaybePoison to control propagation - <$fast_ty>::new(unsafe { - $op_impl( - self.0.maybe_poison(), - other.0.maybe_poison(), - ) - }) - } - } - - impl $op_trait <$base_ty> for $fast_ty { - type Output = $fast_ty; - - #[inline(always)] - fn $op_fn(self, other: $base_ty) -> Self::Output { - self.$op_fn(<$fast_ty>::new(other)) - } - } - - impl $op_trait <$fast_ty> for $base_ty { - type Output = $fast_ty; - - #[inline(always)] - fn $op_fn(self, other: $fast_ty) -> Self::Output { - <$fast_ty>::new(self).$op_fn(other) - } - } - - impl_binary_refs! { $fast_ty, $fast_ty, $op_trait, $op_fn } - impl_binary_refs! { $fast_ty, $base_ty, $op_trait, $op_fn } - impl_binary_refs! { $base_ty, $fast_ty, $op_trait, $op_fn } - )* - }; -} - macro_rules! impl_assign_ops { ($fast_ty:ident, $base_ty: ident: $($op_trait:ident, $op_fn:ident, $op:ident,)*) => { $( @@ -375,7 +295,7 @@ macro_rules! impls { pub fn round(self) -> Self; pub fn sin(self) -> Self; pub fn sinh(self) -> Self; - pub fn sqrt(self) -> Self; + //pub fn sqrt(self) -> Self; pub fn tan(self) -> Self; pub fn tanh(self) -> Self; pub fn to_degrees(self) -> Self; @@ -442,15 +362,6 @@ macro_rules! impls { fmt::Debug, fmt::Display, fmt::LowerExp, fmt::UpperExp, } - impl_fast_ops! { - $fast_ty, $base_ty: - Add, add, fadd_fast, - Sub, sub, fsub_fast, - Mul, mul, fmul_fast, - Div, div, fdiv_fast, - Rem, rem, frem_fast, - } - impl_assign_ops! { $fast_ty, $base_ty: AddAssign, add_assign, add, diff --git a/src/math/math.c b/src/math/math.c index 8eec462..db6f9e5 100644 --- a/src/math/math.c +++ b/src/math/math.c @@ -2,7 +2,6 @@ #include #define IMPL_OPERATIONS(C_TYPE, RUST_TYPE) \ - /* TODO figure out why these don't inline */ \ __attribute__((always_inline)) \ C_TYPE add_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \ return a + b; \ diff --git a/src/math/mod.rs b/src/math/mod.rs index e5f7553..8cef6bf 100644 --- a/src/math/mod.rs +++ b/src/math/mod.rs @@ -1,4 +1,5 @@ use crate::{poison::MaybePoison, FF32, FF64}; +use core::ops::{Add, Div, Mul, Rem, Sub}; use paste::paste; impl FF32 { @@ -75,12 +76,95 @@ macro_rules! impl_generic_math { }; } +macro_rules! impl_binary_refs { + ($lhs:ident, $rhs:ident, $op_trait:ident, $op_fn:ident) => { + impl $op_trait<$rhs> for &$lhs { + type Output = <$lhs as $op_trait<$rhs>>::Output; + + #[inline] + fn $op_fn(self, other: $rhs) -> Self::Output { + (*self).$op_fn(other) + } + } + impl $op_trait<&$rhs> for $lhs { + type Output = <$lhs as $op_trait<$rhs>>::Output; + + #[inline] + fn $op_fn(self, other: &$rhs) -> Self::Output { + self.$op_fn(*other) + } + } + impl $op_trait<&$rhs> for &$lhs { + type Output = <$lhs as $op_trait<$rhs>>::Output; + + #[inline] + fn $op_fn(self, other: &$rhs) -> Self::Output { + (*self).$op_fn(*other) + } + } + }; +} + +macro_rules! impl_fast_ops { + ($fast_ty:ident, $base_ty: ident: $($op_trait:ident, $op_fn:ident, $op_impl:ident,)*) => { + $( + impl $op_trait <$fast_ty> for $fast_ty { + type Output = $fast_ty; + + #[inline(always)] + fn $op_fn(self, other: $fast_ty) -> Self::Output { + unsafe { $op_impl(self, other) } + } + } + + impl $op_trait <$base_ty> for $fast_ty { + type Output = $fast_ty; + + #[inline(always)] + fn $op_fn(self, other: $base_ty) -> Self::Output { + self.$op_fn(<$fast_ty>::new(other)) + } + } + + impl $op_trait <$fast_ty> for $base_ty { + type Output = $fast_ty; + + #[inline(always)] + fn $op_fn(self, other: $fast_ty) -> Self::Output { + <$fast_ty>::new(self).$op_fn(other) + } + } + + impl_binary_refs! { $fast_ty, $fast_ty, $op_trait, $op_fn } + impl_binary_refs! { $fast_ty, $base_ty, $op_trait, $op_fn } + impl_binary_refs! { $base_ty, $fast_ty, $op_trait, $op_fn } + )* + }; +} + macro_rules! impl_extern_math { ($fast_ty:ident, $base_ty:ident) => { paste! { extern "C" { + fn [](a: $fast_ty, b: $fast_ty) -> $fast_ty; + fn [](a: $fast_ty, b: $fast_ty) -> $fast_ty; + fn [](a: $fast_ty, b: $fast_ty) -> $fast_ty; + fn [](a: $fast_ty, b: $fast_ty) -> $fast_ty; + fn [](a: $fast_ty, b: $fast_ty) -> $fast_ty; + fn [](a: $fast_ty, b: $fast_ty) -> $fast_ty; fn [](a: $fast_ty, b: $fast_ty) -> $fast_ty; + + fn [](a: $fast_ty) -> $fast_ty; + } + + impl_fast_ops! { + $fast_ty, $base_ty: + Add, add, [], + Sub, sub, [], + Mul, mul, [], + Div, div, [], + Rem, rem, [], } impl $fast_ty { @@ -93,6 +177,11 @@ macro_rules! impl_extern_math { pub fn min(self, other: Self) -> Self { unsafe { [](self, other) } } + + #[inline] + pub fn sqrt(self) -> Self { + unsafe { [](self) } + } } } };