extern arithmetic

This commit is contained in:
Renar Narubin
2021-11-23 16:52:04 -08:00
parent 43dc1419a8
commit 5a5289f43e
6 changed files with 260 additions and 152 deletions

View File

@@ -2,7 +2,7 @@
name = "fast_fp"
version = "0.1.0"
authors = ["Renar Narubin <renar@standard.ai>"]
edition = "2018"
edition = "2021"
readme = "README.md"
license = "MIT OR Apache-2.0"
@@ -18,8 +18,30 @@ name = "math"
harness = false
[features]
default = ["num-traits"]
default = [
"num-traits",
"finite-math-only",
"associative-math",
"reciprocal-math",
"no-signed-zeros",
"no-trapping-math",
"fp-contract-fast",
"approx-func",
]
# default fast-math features
finite-math-only = []
associative-math = []
reciprocal-math = []
no-signed-zeros = []
no-trapping-math = []
fp-contract-fast = []
approx-func = []
# non-default fast-math-like features
denormal-fp-math-preserve-sign = []
# optional trait implementations
nalgebra-v021 = ["num-traits", "nalgebra_v021", "simba_v01", "approx_v03"]
nalgebra-v029 = ["num-traits", "nalgebra_v029", "simba_v06", "approx_v05"]
@@ -48,5 +70,5 @@ rand = "0.8"
opt-level = 3
[profile.release]
lto = "fat"
codegen-units = 1
lto="thin"
codegen-units=1

View File

@@ -1,54 +1,122 @@
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use criterion::{
criterion_group, criterion_main, measurement::Measurement, BatchSize, BenchmarkGroup,
BenchmarkId, Criterion, Throughput,
};
use fast_fp::{ff32, ff64, FF32, FF64};
use rand::{distributions::Standard, thread_rng, Rng};
use rand::{
distributions::{self, Distribution},
rngs::StdRng,
Rng, SeedableRng,
};
use std::ops::{Add, Div, Mul};
fn sum(c: &mut Criterion) {
let mut group = c.benchmark_group("sum");
for count in [2, 4, 8, 16, 64, 1024, 1 << 15] {
group.throughput(Throughput::Elements(count as u64));
fn add(c: &mut Criterion) {
let mut group = c.benchmark_group("add");
let f32_vals = thread_rng()
.sample_iter(Standard)
.take(count)
.collect::<Vec<f32>>();
let rng = StdRng::from_entropy();
let f32s = distributions::Uniform::<f32>::new(0.0, 1.0);
let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);
// use the same values for both benchmarks
let ff32_vals = f32_vals
.clone()
.into_iter()
.map(ff32)
.collect::<Vec<FF32>>();
group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| {
b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val));
});
group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| {
b.iter(|| vals.iter().copied().fold(ff32(0.0), |acc, val| acc + val));
});
let f64_vals = thread_rng()
.sample_iter(Standard)
.take(count)
.collect::<Vec<f64>>();
// use the same values for both benchmarks
let ff64_vals = f64_vals
.clone()
.into_iter()
.map(ff64)
.collect::<Vec<FF64>>();
group.bench_with_input(BenchmarkId::new("std::f64", count), &f64_vals, |b, vals| {
b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val));
});
group.bench_with_input(BenchmarkId::new("FF64", count), &ff64_vals, |b, vals| {
b.iter(|| vals.iter().copied().fold(ff64(0.0), |acc, val| acc + val));
});
}
group.finish();
// clone the rng for each benched type to keep the generated values identical
fold(&mut group, "std::f32", f32::add, 0.0, rng.clone(), f32s);
fold(&mut group, "FF32", FF32::add, ff32(0.0), rng.clone(), f32s);
fold(&mut group, "std::f64", f64::add, 0.0, rng.clone(), f64s);
fold(&mut group, "FF64", FF64::add, ff64(0.0), rng.clone(), f64s);
}
criterion_group!(benches, sum);
/// Benchmarks a running product (`fold` with multiplication) for `f32`, `FF32`,
/// `f64` and `FF64` under the `"mul"` benchmark group.
fn mul(c: &mut Criterion) {
    let mut group = c.benchmark_group("mul");

    let rng = StdRng::from_entropy();
    // try to avoid subnormals/explosions by limiting the values near 1
    let f32s = distributions::Uniform::<f32>::new(0.9, 1.1);
    let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);

    // Seed the fold with the multiplicative identity. Seeding with 0.0 would pin
    // every partial product at zero, so the benchmark would only ever measure
    // `0 * x` instead of a product of values near 1.
    //
    // clone the rng for each benched type to keep the generated values identical
    fold(&mut group, "std::f32", f32::mul, 1.0, rng.clone(), f32s);
    fold(&mut group, "FF32", FF32::mul, ff32(1.0), rng.clone(), f32s);
    fold(&mut group, "std::f64", f64::mul, 1.0, rng.clone(), f64s);
    fold(&mut group, "FF64", FF64::mul, ff64(1.0), rng.clone(), f64s);

    // Close the group explicitly instead of relying on it being dropped.
    group.finish();
}
/// Benchmarks a running quotient (`fold` with division) for `f32`, `FF32`,
/// `f64` and `FF64` under the `"div"` benchmark group.
fn div(c: &mut Criterion) {
    let mut group = c.benchmark_group("div");

    let rng = StdRng::from_entropy();
    // try to avoid subnormals/explosions by limiting the values near 1
    let f32s = distributions::Uniform::<f32>::new(0.9, 1.1);
    let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);

    // Seed the fold with 1.0. A 0.0 seed makes every step compute `0 / x`, which
    // is always zero, so the accumulator would never carry a value near 1 as
    // the distribution above intends.
    //
    // clone the rng for each benched type to keep the generated values identical
    fold(&mut group, "std::f32", f32::div, 1.0, rng.clone(), f32s);
    fold(&mut group, "FF32", FF32::div, ff32(1.0), rng.clone(), f32s);
    fold(&mut group, "std::f64", f64::div, 1.0, rng.clone(), f64s);
    fold(&mut group, "FF64", FF64::div, ff64(1.0), rng.clone(), f64s);

    // Close the group explicitly instead of relying on it being dropped.
    group.finish();
}
/// Benchmarks a running minimum (`fold` with `min`) for `f32`, `FF32`, `f64`
/// and `FF64` under the `"min"` benchmark group.
fn min(c: &mut Criterion) {
    let mut group = c.benchmark_group("min");

    let rng = StdRng::from_entropy();
    let f32s = distributions::Uniform::<f32>::new(0.0, 1.0);
    let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);

    // Seed the fold with the upper end of the sampled range. A 0.0 seed is
    // already the smallest value that can occur, so the accumulator would never
    // change and the benchmark would not compute a minimum of the inputs. A
    // finite seed is used on purpose rather than an infinity.
    //
    // clone the rng for each benched type to keep the generated values identical
    fold(&mut group, "std::f32", f32::min, 1.0, rng.clone(), f32s);
    fold(&mut group, "FF32", FF32::min, ff32(1.0), rng.clone(), f32s);
    fold(&mut group, "std::f64", f64::min, 1.0, rng.clone(), f64s);
    fold(&mut group, "FF64", FF64::min, ff64(1.0), rng.clone(), f64s);

    // Close the group explicitly instead of relying on it being dropped.
    group.finish();
}
fn fold<T, S>(
group: &mut BenchmarkGroup<'_, impl Measurement>,
id: &str,
op: impl Fn(T, T) -> T + Copy,
init: T,
mut rng: impl Rng,
vals: impl Distribution<S> + Copy,
) where
T: From<S> + Copy,
{
fold_count([init; 1], group, id, op, init, &mut rng, vals);
fold_count([init; 2], group, id, op, init, &mut rng, vals);
fold_count([init; 4], group, id, op, init, &mut rng, vals);
fold_count([init; 8], group, id, op, init, &mut rng, vals);
fold_count([init; 64], group, id, op, init, &mut rng, vals);
fold_count([init; 256], group, id, op, init, &mut rng, vals);
fold_count([init; 1024], group, id, op, init, &mut rng, vals);
}
/// Registers a single benchmark that folds `op` over `N` sampled values.
///
/// `arr` serves as the `[T; N]` template: every batch starts from a copy of it
/// and overwrites each element with a fresh sample from `vals` converted into
/// `T`. The timed routine then folds the batch with `op`, starting from `init`.
/// Throughput is reported as `N` elements.
fn fold_count<T, S, const N: usize>(
    arr: [T; N],
    group: &mut BenchmarkGroup<'_, impl Measurement>,
    id: &str,
    op: impl Fn(T, T) -> T + Copy,
    init: T,
    mut rng: impl Rng,
    vals: impl Distribution<S> + Copy,
) where
    T: From<S> + Copy,
{
    let elements = Throughput::Elements(N as u64);
    group.throughput(elements);

    let bench_id = BenchmarkId::new(id, N);
    group.bench_function(bench_id, |bencher| {
        bencher.iter_batched_ref(
            // setup: build one batch of inputs, drawing front to back
            || {
                let mut batch = arr;
                batch.fill_with(|| T::from(rng.sample::<S, _>(&vals)));
                batch
            },
            // routine: the fold being measured
            |batch| batch.iter().copied().fold(init, op),
            BatchSize::SmallInput,
        );
    });
}
criterion_group!(benches, add, mul, div, min);
criterion_main!(benches);

View File

@@ -6,7 +6,7 @@ fn main() {
builder.compiler("clang");
}
builder.flag("-O3").flag("-flto=thin");
builder.flag("-flto=thin");
build_ll(builder.clone());
build_c(builder);
@@ -21,13 +21,32 @@ fn build_ll(mut builder: cc::Build) {
}
fn build_c(mut builder: cc::Build) {
builder
.file("src/math/math.c")
.flag("-ffinite-math-only")
.flag("-fassociative-math")
.flag("-freciprocal-math")
.flag("-fno-signed-zeros")
.flag("-fno-trapping-math")
.flag("-ffp-contract=fast")
.compile("math")
builder.flag("-O3");
#[cfg(feature = "finite-math-only")]
builder.flag("-ffinite-math-only");
#[cfg(feature = "associative-math")]
builder.flag("-fassociative-math");
#[cfg(feature = "reciprocal-math")]
builder.flag("-freciprocal-math");
#[cfg(feature = "no-signed-zeros")]
builder.flag("-fno-signed-zeros");
#[cfg(feature = "no-trapping-math")]
builder.flag("-fno-trapping-math");
#[cfg(feature = "fp-contract-fast")]
builder.flag("-ffp-contract=fast");
// TODO figure out if this works
//#[cfg(feature = "approx-func")]
//builder.flag("-Xclang -fapprox-func");
#[cfg(feature = "denormal-fp-math-preserve-sign")]
builder.flag("-fdenormal-fp-math=preserve-sign");
builder.file("src/math/math.c").compile("math")
}

View File

@@ -1,12 +1,8 @@
#![doc = include_str!("../README.md")]
#![feature(core_intrinsics)] // intrinsics for the fast math
#![feature(asm)] // asm used to emulate freeze
#![feature(doc_cfg)]
#![feature(link_llvm_intrinsics)]
use core::{
cmp, fmt,
intrinsics::{fadd_fast, fdiv_fast, fmul_fast, frem_fast, fsub_fast},
iter::{Product, Sum},
num::FpCategory,
ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Rem, RemAssign, Sub, SubAssign},
@@ -137,82 +133,6 @@ pub fn ff64(f: f64) -> FF64 {
FF64::new(f)
}
/// Derives the by-reference forms of a binary operator from an existing
/// by-value implementation.
///
/// Given that `$left: $trait_name<$right>` already exists, this adds
/// `&$left ∘ $right`, `$left ∘ &$right` and `&$left ∘ &$right`. Each one
/// dereferences its reference operand(s) and forwards to the by-value
/// implementation, reusing its `Output` type. Both types must therefore be
/// `Copy`.
macro_rules! impl_binary_refs {
    ($left:ident, $right:ident, $trait_name:ident, $method:ident) => {
        // &left ∘ right
        impl $trait_name<$right> for &$left {
            type Output = <$left as $trait_name<$right>>::Output;

            #[inline]
            fn $method(self, other: $right) -> Self::Output {
                <$left as $trait_name<$right>>::$method(*self, other)
            }
        }

        // left ∘ &right
        impl $trait_name<&$right> for $left {
            type Output = <$left as $trait_name<$right>>::Output;

            #[inline]
            fn $method(self, other: &$right) -> Self::Output {
                <$left as $trait_name<$right>>::$method(self, *other)
            }
        }

        // &left ∘ &right
        impl $trait_name<&$right> for &$left {
            type Output = <$left as $trait_name<$right>>::Output;

            #[inline]
            fn $method(self, other: &$right) -> Self::Output {
                <$left as $trait_name<$right>>::$method(*self, *other)
            }
        }
    };
}
/// Implements a list of binary operator traits for a fast-math wrapper type.
///
/// For every `(trait, method, intrinsic)` triple this generates:
/// * `$fast ∘ $fast`, evaluated by calling `intrinsic` on the two inner values;
/// * `$fast ∘ $base` and `$base ∘ $fast`, which wrap the primitive operand with
///   `$fast::new` and defer to the `$fast ∘ $fast` implementation;
/// * the by-reference forms of all three, via `impl_binary_refs!`.
macro_rules! impl_fast_ops {
    ($fast:ident, $base:ident: $($trait_name:ident, $method:ident, $intrinsic:ident,)*) => {
        $(
            impl $trait_name<$fast> for $fast {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, other: $fast) -> Self::Output {
                    // Safety:
                    //
                    // - poison operands are acceptable here: LLVM documents its fast ops as
                    //   not producing UB on any input. They may yield poison for inf/nan
                    //   operands (or an inf/nan result), and that result is wrapped in the
                    //   MaybePoison again so its propagation stays controlled
                    let raw = unsafe {
                        $intrinsic(self.0.maybe_poison(), other.0.maybe_poison())
                    };
                    <$fast>::new(raw)
                }
            }

            // fast ∘ primitive: lift the right-hand side, then reuse fast ∘ fast
            impl $trait_name<$base> for $fast {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, other: $base) -> Self::Output {
                    <$fast as $trait_name<$fast>>::$method(self, <$fast>::new(other))
                }
            }

            // primitive ∘ fast: lift the left-hand side, then reuse fast ∘ fast
            impl $trait_name<$fast> for $base {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, other: $fast) -> Self::Output {
                    <$fast as $trait_name<$fast>>::$method(<$fast>::new(self), other)
                }
            }

            impl_binary_refs! { $fast, $fast, $trait_name, $method }
            impl_binary_refs! { $fast, $base, $trait_name, $method }
            impl_binary_refs! { $base, $fast, $trait_name, $method }
        )*
    };
}
macro_rules! impl_assign_ops {
($fast_ty:ident, $base_ty: ident: $($op_trait:ident, $op_fn:ident, $op:ident,)*) => {
$(
@@ -375,7 +295,7 @@ macro_rules! impls {
pub fn round(self) -> Self;
pub fn sin(self) -> Self;
pub fn sinh(self) -> Self;
pub fn sqrt(self) -> Self;
//pub fn sqrt(self) -> Self;
pub fn tan(self) -> Self;
pub fn tanh(self) -> Self;
pub fn to_degrees(self) -> Self;
@@ -442,15 +362,6 @@ macro_rules! impls {
fmt::Debug, fmt::Display, fmt::LowerExp, fmt::UpperExp,
}
impl_fast_ops! {
$fast_ty, $base_ty:
Add, add, fadd_fast,
Sub, sub, fsub_fast,
Mul, mul, fmul_fast,
Div, div, fdiv_fast,
Rem, rem, frem_fast,
}
impl_assign_ops! {
$fast_ty, $base_ty:
AddAssign, add_assign, add,

View File

@@ -2,7 +2,6 @@
#include <math.h>
#define IMPL_OPERATIONS(C_TYPE, RUST_TYPE) \
/* TODO figure out why these don't inline */ \
__attribute__((always_inline)) \
C_TYPE add_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \
return a + b; \

View File

@@ -1,4 +1,5 @@
use crate::{poison::MaybePoison, FF32, FF64};
use core::ops::{Add, Div, Mul, Rem, Sub};
use paste::paste;
impl FF32 {
@@ -75,12 +76,95 @@ macro_rules! impl_generic_math {
};
}
/// Derives the by-reference forms of a binary operator from an existing
/// by-value implementation.
///
/// Given that `$left: $trait_name<$right>` already exists, this adds
/// `&$left ∘ $right`, `$left ∘ &$right` and `&$left ∘ &$right`. Each one
/// dereferences its reference operand(s) and forwards to the by-value
/// implementation, reusing its `Output` type. Both types must therefore be
/// `Copy`.
macro_rules! impl_binary_refs {
    ($left:ident, $right:ident, $trait_name:ident, $method:ident) => {
        // &left ∘ right
        impl $trait_name<$right> for &$left {
            type Output = <$left as $trait_name<$right>>::Output;

            #[inline]
            fn $method(self, other: $right) -> Self::Output {
                <$left as $trait_name<$right>>::$method(*self, other)
            }
        }

        // left ∘ &right
        impl $trait_name<&$right> for $left {
            type Output = <$left as $trait_name<$right>>::Output;

            #[inline]
            fn $method(self, other: &$right) -> Self::Output {
                <$left as $trait_name<$right>>::$method(self, *other)
            }
        }

        // &left ∘ &right
        impl $trait_name<&$right> for &$left {
            type Output = <$left as $trait_name<$right>>::Output;

            #[inline]
            fn $method(self, other: &$right) -> Self::Output {
                <$left as $trait_name<$right>>::$method(*self, *other)
            }
        }
    };
}
/// Implements a list of binary operator traits for a fast-math wrapper type.
///
/// For every `(trait, method, routine)` triple this generates:
/// * `$fast ∘ $fast`, evaluated by calling `routine` with both operands;
/// * `$fast ∘ $base` and `$base ∘ $fast`, which wrap the primitive operand with
///   `$fast::new` and defer to the `$fast ∘ $fast` implementation;
/// * the by-reference forms of all three, via `impl_binary_refs!`.
macro_rules! impl_fast_ops {
    ($fast:ident, $base:ident: $($trait_name:ident, $method:ident, $routine:ident,)*) => {
        $(
            impl $trait_name<$fast> for $fast {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, other: $fast) -> Self::Output {
                    // NOTE(review): this call is only sound if `$routine` has no
                    // undefined behaviour for any pair of operands and, where it is a
                    // foreign function, `$fast` is passed with a compatible ABI.
                    // Neither is visible from this macro - confirm at the declaration.
                    let result = unsafe { $routine(self, other) };
                    result
                }
            }

            // fast ∘ primitive: lift the right-hand side, then reuse fast ∘ fast
            impl $trait_name<$base> for $fast {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, other: $base) -> Self::Output {
                    <$fast as $trait_name<$fast>>::$method(self, <$fast>::new(other))
                }
            }

            // primitive ∘ fast: lift the left-hand side, then reuse fast ∘ fast
            impl $trait_name<$fast> for $base {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, other: $fast) -> Self::Output {
                    <$fast as $trait_name<$fast>>::$method(<$fast>::new(self), other)
                }
            }

            impl_binary_refs! { $fast, $fast, $trait_name, $method }
            impl_binary_refs! { $fast, $base, $trait_name, $method }
            impl_binary_refs! { $base, $fast, $trait_name, $method }
        )*
    };
}
macro_rules! impl_extern_math {
($fast_ty:ident, $base_ty:ident) => {
paste! {
extern "C" {
fn [<add_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
fn [<sub_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
fn [<mul_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
fn [<div_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
fn [<rem_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
fn [<min_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
fn [<max_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
fn [<sqrt_ $base_ty>](a: $fast_ty) -> $fast_ty;
}
impl_fast_ops! {
$fast_ty, $base_ty:
Add, add, [<add_ $base_ty>],
Sub, sub, [<sub_ $base_ty>],
Mul, mul, [<mul_ $base_ty>],
Div, div, [<div_ $base_ty>],
Rem, rem, [<rem_ $base_ty>],
}
impl $fast_ty {
@@ -93,6 +177,11 @@ macro_rules! impl_extern_math {
pub fn min(self, other: Self) -> Self {
unsafe { [<min_ $base_ty>](self, other) }
}
#[inline]
pub fn sqrt(self) -> Self {
unsafe { [<sqrt_ $base_ty>](self) }
}
}
}
};