extern arithmetic

2021-11-23 16:52:04 -08:00
parent 43dc1419a8
commit 5a5289f43e
6 changed files with 260 additions and 152 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@
 name = "fast_fp"
 version = "0.1.0"
 authors = ["Renar Narubin <renar@standard.ai>"]
-edition = "2018"
+edition = "2021"
 readme = "README.md"
 license = "MIT OR Apache-2.0"
@@ -18,8 +18,30 @@ name = "math"
 harness = false
 [features]
-default = ["num-traits"]
+default = [
  "num-traits",
  "finite-math-only",
  "associative-math",
  "reciprocal-math",
  "no-signed-zeros",
  "no-trapping-math",
  "fp-contract-fast",
  "approx-func",
 ]
 # default fast-math features
 finite-math-only = []
 associative-math = []
 reciprocal-math = []
 no-signed-zeros = []
 no-trapping-math = []
 fp-contract-fast = []
 approx-func = []
 # non-default fast-math-like features
 denormal-fp-math-preserve-sign = []
 # optional trait implementations
 nalgebra-v021 = ["num-traits", "nalgebra_v021", "simba_v01", "approx_v03"]
 nalgebra-v029 = ["num-traits", "nalgebra_v029", "simba_v06", "approx_v05"]
@@ -48,5 +70,5 @@ rand = "0.8"
 opt-level = 3
 [profile.release]
-lto = "fat"
+lto="thin"
-codegen-units = 1
+codegen-units=1
--- a/benches/operations.rs
+++ b/benches/operations.rs
@@ -1,54 +1,122 @@
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use criterion::{
    criterion_group, criterion_main, measurement::Measurement, BatchSize, BenchmarkGroup,
    BenchmarkId, Criterion, Throughput,
 };
 use fast_fp::{ff32, ff64, FF32, FF64};
-use rand::{distributions::Standard, thread_rng, Rng};
+use rand::{
    distributions::{self, Distribution},
    rngs::StdRng,
    Rng, SeedableRng,
 };
 use std::ops::{Add, Div, Mul};
-fn sum(c: &mut Criterion) {
+fn add(c: &mut Criterion) {
-    let mut group = c.benchmark_group("sum");
+    let mut group = c.benchmark_group("add");
    for count in [2, 4, 8, 16, 64, 1024, 1 << 15] {
        group.throughput(Throughput::Elements(count as u64));
-        let f32_vals = thread_rng()
+    let rng = StdRng::from_entropy();
-            .sample_iter(Standard)
+    let f32s = distributions::Uniform::<f32>::new(0.0, 1.0);
-            .take(count)
+    let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);
            .collect::<Vec<f32>>();
-        // use the same values for both benchmarks
+    // clone the rng for each benched type to keep the generated values identical
-        let ff32_vals = f32_vals
+    fold(&mut group, "std::f32", f32::add, 0.0, rng.clone(), f32s);
-            .clone()
+    fold(&mut group, "FF32", FF32::add, ff32(0.0), rng.clone(), f32s);
-            .into_iter()
+    fold(&mut group, "std::f64", f64::add, 0.0, rng.clone(), f64s);
-            .map(ff32)
+    fold(&mut group, "FF64", FF64::add, ff64(0.0), rng.clone(), f64s);
            .collect::<Vec<FF32>>();
        group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| {
            b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val));
        });
        group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| {
            b.iter(|| vals.iter().copied().fold(ff32(0.0), |acc, val| acc + val));
        });
        let f64_vals = thread_rng()
            .sample_iter(Standard)
            .take(count)
            .collect::<Vec<f64>>();
        // use the same values for both benchmarks
        let ff64_vals = f64_vals
            .clone()
            .into_iter()
            .map(ff64)
            .collect::<Vec<FF64>>();
        group.bench_with_input(BenchmarkId::new("std::f64", count), &f64_vals, |b, vals| {
            b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val));
        });
        group.bench_with_input(BenchmarkId::new("FF64", count), &ff64_vals, |b, vals| {
            b.iter(|| vals.iter().copied().fold(ff64(0.0), |acc, val| acc + val));
        });
    }
    group.finish();
 }
-criterion_group!(benches, sum);
+fn mul(c: &mut Criterion) {
    // Benchmark a left fold of `*` over random values for the std floats and their fast
    // counterparts, registered under the "mul" group.
    let mut group = c.benchmark_group("mul");
    let rng = StdRng::from_entropy();
    // try to avoid subnormals/explosions by limiting the values near 1
    let f32s = distributions::Uniform::<f32>::new(0.9, 1.1);
    let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);
    // Seed the fold with the multiplicative identity. A 0.0 seed would pin every partial
    // product at zero, so the fold would never multiply two meaningful values.
    // clone the rng for each benched type to keep the generated values identical
    fold(&mut group, "std::f32", f32::mul, 1.0, rng.clone(), f32s);
    fold(&mut group, "FF32", FF32::mul, ff32(1.0), rng.clone(), f32s);
    fold(&mut group, "std::f64", f64::mul, 1.0, rng.clone(), f64s);
    fold(&mut group, "FF64", FF64::mul, ff64(1.0), rng.clone(), f64s);
    // finish the group explicitly rather than relying on drop
    group.finish();
 }
 /// Benchmarks a left fold of `/` over random values for the std floats and their fast
 /// counterparts, registered under the "div" group.
 fn div(c: &mut Criterion) {
    let mut group = c.benchmark_group("div");
    let rng = StdRng::from_entropy();
    // try to avoid subnormals/explosions by limiting the values near 1
    let f32s = distributions::Uniform::<f32>::new(0.9, 1.1);
    let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);
    // Seed the fold with 1.0 rather than 0.0: with a zero seed every step computes
    // `0.0 / x`, so the accumulator never leaves zero and no meaningful division is measured.
    // clone the rng for each benched type to keep the generated values identical
    fold(&mut group, "std::f32", f32::div, 1.0, rng.clone(), f32s);
    fold(&mut group, "FF32", FF32::div, ff32(1.0), rng.clone(), f32s);
    fold(&mut group, "std::f64", f64::div, 1.0, rng.clone(), f64s);
    fold(&mut group, "FF64", FF64::div, ff64(1.0), rng.clone(), f64s);
    // finish the group explicitly rather than relying on drop
    group.finish();
 }
 /// Benchmarks a left fold of `min` over random values for the std floats and their fast
 /// counterparts, registered under the "min" group.
 fn min(c: &mut Criterion) {
    let mut group = c.benchmark_group("min");
    let rng = StdRng::from_entropy();
    let f32s = distributions::Uniform::<f32>::new(0.0, 1.0);
    let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);
    // Samples are drawn from [0, 1), so a 0.0 seed would never be replaced and the
    // accumulator would stay constant. Seed with 1.0, which lies above every sample.
    // clone the rng for each benched type to keep the generated values identical
    fold(&mut group, "std::f32", f32::min, 1.0, rng.clone(), f32s);
    fold(&mut group, "FF32", FF32::min, ff32(1.0), rng.clone(), f32s);
    fold(&mut group, "std::f64", f64::min, 1.0, rng.clone(), f64s);
    fold(&mut group, "FF64", FF64::min, ff64(1.0), rng.clone(), f64s);
    // finish the group explicitly rather than relying on drop
    group.finish();
 }
 fn fold<T, S>(
    group: &mut BenchmarkGroup<'_, impl Measurement>,
    id: &str,
    op: impl Fn(T, T) -> T + Copy,
    init: T,
    mut rng: impl Rng,
    vals: impl Distribution<S> + Copy,
 ) where
    T: From<S> + Copy,
 {
    fold_count([init; 1], group, id, op, init, &mut rng, vals);
    fold_count([init; 2], group, id, op, init, &mut rng, vals);
    fold_count([init; 4], group, id, op, init, &mut rng, vals);
    fold_count([init; 8], group, id, op, init, &mut rng, vals);
    fold_count([init; 64], group, id, op, init, &mut rng, vals);
    fold_count([init; 256], group, id, op, init, &mut rng, vals);
    fold_count([init; 1024], group, id, op, init, &mut rng, vals);
 }
 /// Benchmarks folding `op` over `N` freshly sampled values, starting from `init`.
 ///
 /// `arr` only supplies the input buffer (and thereby fixes `N`); its contents are
 /// overwritten with values sampled from `vals` via `rng` before each measured batch.
 /// The benchmark is registered in `group` as `id`/`N` with a throughput of `N` elements.
 fn fold_count<T, S, const N: usize>(
    arr: [T; N],
    group: &mut BenchmarkGroup<'_, impl Measurement>,
    id: &str,
    op: impl Fn(T, T) -> T + Copy,
    init: T,
    mut rng: impl Rng,
    vals: impl Distribution<S> + Copy,
 ) where
    T: From<S> + Copy,
 {
    group.throughput(Throughput::Elements(N as u64));

    let bench_id = BenchmarkId::new(id, N);
    group.bench_function(bench_id, |bencher| {
        bencher.iter_batched_ref(
            // setup: copy the template array and fill every slot with a new sample
            || {
                let mut inputs = arr;
                for slot in inputs.iter_mut() {
                    let sample: S = rng.sample(&vals);
                    *slot = T::from(sample);
                }
                inputs
            },
            // routine: the fold being measured
            |batch| batch.iter().copied().fold(init, op),
            BatchSize::SmallInput,
        );
    });
 }
 criterion_group!(benches, add, mul, div, min);
 criterion_main!(benches);
--- a/build.rs
+++ b/build.rs
@@ -6,7 +6,7 @@ fn main() {
        builder.compiler("clang");
    }
-    builder.flag("-O3").flag("-flto=thin");
+    builder.flag("-flto=thin");
    build_ll(builder.clone());
    build_c(builder);
@@ -21,13 +21,32 @@ fn build_ll(mut builder: cc::Build) {
 }
 fn build_c(mut builder: cc::Build) {
-    builder
+    builder.flag("-O3");
-        .file("src/math/math.c")
+
-        .flag("-ffinite-math-only")
+    #[cfg(feature = "finite-math-only")]
-        .flag("-fassociative-math")
+    builder.flag("-ffinite-math-only");
-        .flag("-freciprocal-math")
+
-        .flag("-fno-signed-zeros")
+    #[cfg(feature = "associative-math")]
-        .flag("-fno-trapping-math")
+    builder.flag("-fassociative-math");
-        .flag("-ffp-contract=fast")
+
-        .compile("math")
+    #[cfg(feature = "reciprocal-math")]
    builder.flag("-freciprocal-math");
    #[cfg(feature = "no-signed-zeros")]
    builder.flag("-fno-signed-zeros");
    #[cfg(feature = "no-trapping-math")]
    builder.flag("-fno-trapping-math");
    #[cfg(feature = "fp-contract-fast")]
    builder.flag("-ffp-contract=fast");
    // TODO figure out if this works
    //#[cfg(feature = "approx-func")]
    //builder.flag("-Xclang -fapprox-func");
    #[cfg(feature = "denormal-fp-math-preserve-sign")]
    builder.flag("-fdenormal-fp-math=preserve-sign");
    builder.file("src/math/math.c").compile("math")
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,12 +1,8 @@
 #![doc = include_str!("../README.md")]
 #![feature(core_intrinsics)] // intrinsics for the fast math
 #![feature(asm)] // asm used to emulate freeze
 #![feature(doc_cfg)]
 #![feature(link_llvm_intrinsics)]
 use core::{
    cmp, fmt,
    intrinsics::{fadd_fast, fdiv_fast, fmul_fast, frem_fast, fsub_fast},
    iter::{Product, Sum},
    num::FpCategory,
    ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Rem, RemAssign, Sub, SubAssign},
@@ -137,82 +133,6 @@ pub fn ff64(f: f64) -> FF64 {
    FF64::new(f)
 }
 // Generates the by-reference variants of a binary operator impl.
 //
 // Given an existing `impl $op_trait<$rhs> for $lhs`, this emits the three remaining
 // combinations (`&lhs op rhs`, `lhs op &rhs`, `&lhs op &rhs`). Each one dereferences its
 // reference operand(s) and forwards to the by-value impl, reusing that impl's `Output`.
 // Operands are moved out of the references with `*`, so both types must be `Copy`.
 macro_rules! impl_binary_refs {
    ($lhs:ident, $rhs:ident, $op_trait:ident, $op_fn:ident) => {
        // &lhs op rhs
        impl $op_trait<$rhs> for &$lhs {
            type Output = <$lhs as $op_trait<$rhs>>::Output;
            #[inline]
            fn $op_fn(self, other: $rhs) -> Self::Output {
                (*self).$op_fn(other)
            }
        }
        // lhs op &rhs
        impl $op_trait<&$rhs> for $lhs {
            type Output = <$lhs as $op_trait<$rhs>>::Output;
            #[inline]
            fn $op_fn(self, other: &$rhs) -> Self::Output {
                self.$op_fn(*other)
            }
        }
        // &lhs op &rhs
        impl $op_trait<&$rhs> for &$lhs {
            type Output = <$lhs as $op_trait<$rhs>>::Output;
            #[inline]
            fn $op_fn(self, other: &$rhs) -> Self::Output {
                (*self).$op_fn(*other)
            }
        }
    };
 }
 // Implements the binary arithmetic operator traits for a fast float type.
 //
 // Takes the fast type, its base primitive type, and a list of
 // `(operator trait, trait method, implementing function)` triples. For every triple it emits
 // `fast op fast` (calling the implementing function on the unwrapped operands),
 // `fast op base` and `base op fast` (wrapping the primitive with `new` and forwarding), and
 // the by-reference variants of all three through `impl_binary_refs!`.
 macro_rules! impl_fast_ops {
    ($fast_ty:ident, $base_ty: ident: $($op_trait:ident, $op_fn:ident, $op_impl:ident,)*) => {
        $(
            impl $op_trait <$fast_ty> for $fast_ty {
                type Output = $fast_ty;
                #[inline(always)]
                fn $op_fn(self, other: $fast_ty) -> Self::Output {
                    // Safety:
                    //
                    // - encountering poison operands is safe because LLVM's fast ops are documented
                    // as not producing UB on any input; they may produce poison on inf/nan operands
                    // (or if the result is inf/nan), but the result is then wrapped in MaybePoison
                    // to control propagation
                    <$fast_ty>::new(unsafe {
                        $op_impl(
                            self.0.maybe_poison(),
                            other.0.maybe_poison(),
                        )
                    })
                }
            }
            // fast op base: wrap the primitive operand and reuse the fast/fast impl
            impl $op_trait <$base_ty> for $fast_ty {
                type Output = $fast_ty;
                #[inline(always)]
                fn $op_fn(self, other: $base_ty) -> Self::Output {
                    self.$op_fn(<$fast_ty>::new(other))
                }
            }
            // base op fast: wrap the primitive operand and reuse the fast/fast impl
            impl $op_trait <$fast_ty> for $base_ty {
                type Output = $fast_ty;
                #[inline(always)]
                fn $op_fn(self, other: $fast_ty) -> Self::Output {
                    <$fast_ty>::new(self).$op_fn(other)
                }
            }
            // by-reference variants of the three impls above
            impl_binary_refs! { $fast_ty, $fast_ty, $op_trait, $op_fn }
            impl_binary_refs! { $fast_ty, $base_ty, $op_trait, $op_fn }
            impl_binary_refs! { $base_ty, $fast_ty, $op_trait, $op_fn }
        )*
    };
 }
 macro_rules! impl_assign_ops {
    ($fast_ty:ident, $base_ty: ident: $($op_trait:ident, $op_fn:ident, $op:ident,)*) => {
        $(
@@ -375,7 +295,7 @@ macro_rules! impls {
                pub fn round(self) -> Self;
                pub fn sin(self) -> Self;
                pub fn sinh(self) -> Self;
-                pub fn sqrt(self) -> Self;
+                //pub fn sqrt(self) -> Self;
                pub fn tan(self) -> Self;
                pub fn tanh(self) -> Self;
                pub fn to_degrees(self) -> Self;
@@ -442,15 +362,6 @@ macro_rules! impls {
            fmt::Debug, fmt::Display, fmt::LowerExp, fmt::UpperExp,
        }
        impl_fast_ops! {
            $fast_ty, $base_ty:
            Add, add, fadd_fast,
            Sub, sub, fsub_fast,
            Mul, mul, fmul_fast,
            Div, div, fdiv_fast,
            Rem, rem, frem_fast,
        }
        impl_assign_ops! {
            $fast_ty, $base_ty:
            AddAssign, add_assign, add,
--- a/src/math/math.c
+++ b/src/math/math.c
@@ -2,7 +2,6 @@
 #include <math.h>
 #define IMPL_OPERATIONS(C_TYPE, RUST_TYPE)       \
  /* TODO figure out why these don't inline */   \
  __attribute__((always_inline))                 \
  C_TYPE add_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \
    return a + b;                                \
--- a/src/math/mod.rs
+++ b/src/math/mod.rs
@@ -1,4 +1,5 @@
 use crate::{poison::MaybePoison, FF32, FF64};
 use core::ops::{Add, Div, Mul, Rem, Sub};
 use paste::paste;
 impl FF32 {
@@ -75,12 +76,95 @@ macro_rules! impl_generic_math {
    };
 }
 // Generates the by-reference variants of a binary operator impl.
 //
 // Given an existing `impl $op_trait<$rhs> for $lhs`, this emits the three remaining
 // combinations (`&lhs op rhs`, `lhs op &rhs`, `&lhs op &rhs`). Each one dereferences its
 // reference operand(s) and forwards to the by-value impl, reusing that impl's `Output`.
 // Operands are moved out of the references with `*`, so both types must be `Copy`.
 macro_rules! impl_binary_refs {
    ($lhs:ident, $rhs:ident, $op_trait:ident, $op_fn:ident) => {
        // &lhs op rhs
        impl $op_trait<$rhs> for &$lhs {
            type Output = <$lhs as $op_trait<$rhs>>::Output;
            #[inline]
            fn $op_fn(self, other: $rhs) -> Self::Output {
                (*self).$op_fn(other)
            }
        }
        // lhs op &rhs
        impl $op_trait<&$rhs> for $lhs {
            type Output = <$lhs as $op_trait<$rhs>>::Output;
            #[inline]
            fn $op_fn(self, other: &$rhs) -> Self::Output {
                self.$op_fn(*other)
            }
        }
        // &lhs op &rhs
        impl $op_trait<&$rhs> for &$lhs {
            type Output = <$lhs as $op_trait<$rhs>>::Output;
            #[inline]
            fn $op_fn(self, other: &$rhs) -> Self::Output {
                (*self).$op_fn(*other)
            }
        }
    };
 }
 // Implements the binary arithmetic operator traits for a fast float type.
 //
 // Takes the fast type, its base primitive type, and a list of
 // `(operator trait, trait method, implementing function)` triples. For every triple it emits
 // `fast op fast` (a direct call to the implementing function), `fast op base` and
 // `base op fast` (wrapping the primitive with `new` and forwarding), and the by-reference
 // variants of all three through `impl_binary_refs!`.
 macro_rules! impl_fast_ops {
    ($fast_ty:ident, $base_ty: ident: $($op_trait:ident, $op_fn:ident, $op_impl:ident,)*) => {
        $(
            impl $op_trait <$fast_ty> for $fast_ty {
                type Output = $fast_ty;
                #[inline(always)]
                fn $op_fn(self, other: $fast_ty) -> Self::Output {
                    // `$op_impl` is called with the fast values themselves and must be an
                    // unsafe-to-call function of shape `(fast, fast) -> fast`.
                    // NOTE(review): when `$op_impl` is an `extern "C"` function, soundness
                    // presumably relies on `$fast_ty` being ABI-compatible with the matching C
                    // float type (e.g. `#[repr(transparent)]`) — confirm at the type definition.
                    unsafe { $op_impl(self, other) }
                }
            }
            // fast op base: wrap the primitive operand and reuse the fast/fast impl
            impl $op_trait <$base_ty> for $fast_ty {
                type Output = $fast_ty;
                #[inline(always)]
                fn $op_fn(self, other: $base_ty) -> Self::Output {
                    self.$op_fn(<$fast_ty>::new(other))
                }
            }
            // base op fast: wrap the primitive operand and reuse the fast/fast impl
            impl $op_trait <$fast_ty> for $base_ty {
                type Output = $fast_ty;
                #[inline(always)]
                fn $op_fn(self, other: $fast_ty) -> Self::Output {
                    <$fast_ty>::new(self).$op_fn(other)
                }
            }
            // by-reference variants of the three impls above
            impl_binary_refs! { $fast_ty, $fast_ty, $op_trait, $op_fn }
            impl_binary_refs! { $fast_ty, $base_ty, $op_trait, $op_fn }
            impl_binary_refs! { $base_ty, $fast_ty, $op_trait, $op_fn }
        )*
    };
 }
 macro_rules! impl_extern_math {
    ($fast_ty:ident, $base_ty:ident) => {
        paste! {
            extern "C" {
                fn [<add_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
                fn [<sub_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
                fn [<mul_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
                fn [<div_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
                fn [<rem_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
                fn [<min_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
                fn [<max_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
                fn [<sqrt_ $base_ty>](a: $fast_ty) -> $fast_ty;
            }
            impl_fast_ops! {
                $fast_ty, $base_ty:
                Add, add, [<add_ $base_ty>],
                Sub, sub, [<sub_ $base_ty>],
                Mul, mul, [<mul_ $base_ty>],
                Div, div, [<div_ $base_ty>],
                Rem, rem, [<rem_ $base_ty>],
            }
            impl $fast_ty {
@@ -93,6 +177,11 @@ macro_rules! impl_extern_math {
                pub fn min(self, other: Self) -> Self {
                    unsafe { [<min_ $base_ty>](self, other) }
                }
                #[inline]
                pub fn sqrt(self) -> Self {
                    unsafe { [<sqrt_ $base_ty>](self) }
                }
            }
        }
    };