extern arithmetic
This commit is contained in:
30
Cargo.toml
30
Cargo.toml
@@ -2,7 +2,7 @@
|
||||
name = "fast_fp"
|
||||
version = "0.1.0"
|
||||
authors = ["Renar Narubin <renar@standard.ai>"]
|
||||
edition = "2018"
|
||||
edition = "2021"
|
||||
readme = "README.md"
|
||||
license = "MIT OR Apache-2.0"
|
||||
|
||||
@@ -18,8 +18,30 @@ name = "math"
|
||||
harness = false
|
||||
|
||||
[features]
|
||||
default = ["num-traits"]
|
||||
default = [
|
||||
"num-traits",
|
||||
"finite-math-only",
|
||||
"associative-math",
|
||||
"reciprocal-math",
|
||||
"no-signed-zeros",
|
||||
"no-trapping-math",
|
||||
"fp-contract-fast",
|
||||
"approx-func",
|
||||
]
|
||||
|
||||
# default fast-math features
|
||||
finite-math-only = []
|
||||
associative-math = []
|
||||
reciprocal-math = []
|
||||
no-signed-zeros = []
|
||||
no-trapping-math = []
|
||||
fp-contract-fast = []
|
||||
approx-func = []
|
||||
|
||||
# non-default fast-math-like features
|
||||
denormal-fp-math-preserve-sign = []
|
||||
|
||||
# optional trait implementations
|
||||
nalgebra-v021 = ["num-traits", "nalgebra_v021", "simba_v01", "approx_v03"]
|
||||
nalgebra-v029 = ["num-traits", "nalgebra_v029", "simba_v06", "approx_v05"]
|
||||
|
||||
@@ -48,5 +70,5 @@ rand = "0.8"
|
||||
opt-level = 3
|
||||
|
||||
[profile.release]
|
||||
lto = "fat"
|
||||
codegen-units = 1
|
||||
lto="thin"
|
||||
codegen-units=1
|
||||
|
||||
@@ -1,54 +1,122 @@
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use criterion::{
|
||||
criterion_group, criterion_main, measurement::Measurement, BatchSize, BenchmarkGroup,
|
||||
BenchmarkId, Criterion, Throughput,
|
||||
};
|
||||
use fast_fp::{ff32, ff64, FF32, FF64};
|
||||
use rand::{distributions::Standard, thread_rng, Rng};
|
||||
use rand::{
|
||||
distributions::{self, Distribution},
|
||||
rngs::StdRng,
|
||||
Rng, SeedableRng,
|
||||
};
|
||||
use std::ops::{Add, Div, Mul};
|
||||
|
||||
fn sum(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("sum");
|
||||
for count in [2, 4, 8, 16, 64, 1024, 1 << 15] {
|
||||
group.throughput(Throughput::Elements(count as u64));
|
||||
fn add(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("add");
|
||||
|
||||
let f32_vals = thread_rng()
|
||||
.sample_iter(Standard)
|
||||
.take(count)
|
||||
.collect::<Vec<f32>>();
|
||||
let rng = StdRng::from_entropy();
|
||||
let f32s = distributions::Uniform::<f32>::new(0.0, 1.0);
|
||||
let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);
|
||||
|
||||
// use the same values for both benchmarks
|
||||
let ff32_vals = f32_vals
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(ff32)
|
||||
.collect::<Vec<FF32>>();
|
||||
|
||||
group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| {
|
||||
b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val));
|
||||
});
|
||||
|
||||
group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| {
|
||||
b.iter(|| vals.iter().copied().fold(ff32(0.0), |acc, val| acc + val));
|
||||
});
|
||||
|
||||
let f64_vals = thread_rng()
|
||||
.sample_iter(Standard)
|
||||
.take(count)
|
||||
.collect::<Vec<f64>>();
|
||||
|
||||
// use the same values for both benchmarks
|
||||
let ff64_vals = f64_vals
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(ff64)
|
||||
.collect::<Vec<FF64>>();
|
||||
|
||||
group.bench_with_input(BenchmarkId::new("std::f64", count), &f64_vals, |b, vals| {
|
||||
b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val));
|
||||
});
|
||||
|
||||
group.bench_with_input(BenchmarkId::new("FF64", count), &ff64_vals, |b, vals| {
|
||||
b.iter(|| vals.iter().copied().fold(ff64(0.0), |acc, val| acc + val));
|
||||
});
|
||||
}
|
||||
group.finish();
|
||||
// clone the rng for each benched type to keep the generated values identical
|
||||
fold(&mut group, "std::f32", f32::add, 0.0, rng.clone(), f32s);
|
||||
fold(&mut group, "FF32", FF32::add, ff32(0.0), rng.clone(), f32s);
|
||||
fold(&mut group, "std::f64", f64::add, 0.0, rng.clone(), f64s);
|
||||
fold(&mut group, "FF64", FF64::add, ff64(0.0), rng.clone(), f64s);
|
||||
}
|
||||
|
||||
criterion_group!(benches, sum);
|
||||
fn mul(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("mul");
|
||||
|
||||
let rng = StdRng::from_entropy();
|
||||
|
||||
// try to avoid subnormals/explosions by limiting the values near 1
|
||||
let f32s = distributions::Uniform::<f32>::new(0.9, 1.1);
|
||||
let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);
|
||||
|
||||
// clone the rng for each benched type to keep the generated values identical
|
||||
fold(&mut group, "std::f32", f32::mul, 0.0, rng.clone(), f32s);
|
||||
fold(&mut group, "FF32", FF32::mul, ff32(0.0), rng.clone(), f32s);
|
||||
fold(&mut group, "std::f64", f64::mul, 0.0, rng.clone(), f64s);
|
||||
fold(&mut group, "FF64", FF64::mul, ff64(0.0), rng.clone(), f64s);
|
||||
}
|
||||
|
||||
fn div(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("div");
|
||||
|
||||
let rng = StdRng::from_entropy();
|
||||
|
||||
// try to avoid subnormals/explosions by limiting the values near 1
|
||||
let f32s = distributions::Uniform::<f32>::new(0.9, 1.1);
|
||||
let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);
|
||||
|
||||
// clone the rng for each benched type to keep the generated values identical
|
||||
fold(&mut group, "std::f32", f32::div, 0.0, rng.clone(), f32s);
|
||||
fold(&mut group, "FF32", FF32::div, ff32(0.0), rng.clone(), f32s);
|
||||
fold(&mut group, "std::f64", f64::div, 0.0, rng.clone(), f64s);
|
||||
fold(&mut group, "FF64", FF64::div, ff64(0.0), rng.clone(), f64s);
|
||||
}
|
||||
|
||||
fn min(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("min");
|
||||
|
||||
let rng = StdRng::from_entropy();
|
||||
let f32s = distributions::Uniform::<f32>::new(0.0, 1.0);
|
||||
let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);
|
||||
|
||||
// clone the rng for each benched type to keep the generated values identical
|
||||
fold(&mut group, "std::f32", f32::min, 0.0, rng.clone(), f32s);
|
||||
fold(&mut group, "FF32", FF32::min, ff32(0.0), rng.clone(), f32s);
|
||||
fold(&mut group, "std::f64", f64::min, 0.0, rng.clone(), f64s);
|
||||
fold(&mut group, "FF64", FF64::min, ff64(0.0), rng.clone(), f64s);
|
||||
}
|
||||
|
||||
fn fold<T, S>(
|
||||
group: &mut BenchmarkGroup<'_, impl Measurement>,
|
||||
id: &str,
|
||||
op: impl Fn(T, T) -> T + Copy,
|
||||
init: T,
|
||||
mut rng: impl Rng,
|
||||
vals: impl Distribution<S> + Copy,
|
||||
) where
|
||||
T: From<S> + Copy,
|
||||
{
|
||||
fold_count([init; 1], group, id, op, init, &mut rng, vals);
|
||||
fold_count([init; 2], group, id, op, init, &mut rng, vals);
|
||||
fold_count([init; 4], group, id, op, init, &mut rng, vals);
|
||||
fold_count([init; 8], group, id, op, init, &mut rng, vals);
|
||||
fold_count([init; 64], group, id, op, init, &mut rng, vals);
|
||||
fold_count([init; 256], group, id, op, init, &mut rng, vals);
|
||||
fold_count([init; 1024], group, id, op, init, &mut rng, vals);
|
||||
}
|
||||
|
||||
fn fold_count<T, S, const N: usize>(
|
||||
arr: [T; N],
|
||||
group: &mut BenchmarkGroup<'_, impl Measurement>,
|
||||
id: &str,
|
||||
op: impl Fn(T, T) -> T + Copy,
|
||||
init: T,
|
||||
mut rng: impl Rng,
|
||||
vals: impl Distribution<S> + Copy,
|
||||
) where
|
||||
T: From<S> + Copy,
|
||||
{
|
||||
group.throughput(Throughput::Elements(N as u64));
|
||||
|
||||
group.bench_function(BenchmarkId::new(id, N), |b| {
|
||||
b.iter_batched_ref(
|
||||
|| {
|
||||
let mut inputs = arr;
|
||||
inputs
|
||||
.iter_mut()
|
||||
.zip((&mut rng).sample_iter(&vals))
|
||||
.for_each(|(dst, val)| *dst = T::from(val));
|
||||
inputs
|
||||
},
|
||||
|vals| vals.iter().copied().fold(init, op),
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group!(benches, add, mul, div, min);
|
||||
criterion_main!(benches);
|
||||
|
||||
39
build.rs
39
build.rs
@@ -6,7 +6,7 @@ fn main() {
|
||||
builder.compiler("clang");
|
||||
}
|
||||
|
||||
builder.flag("-O3").flag("-flto=thin");
|
||||
builder.flag("-flto=thin");
|
||||
|
||||
build_ll(builder.clone());
|
||||
build_c(builder);
|
||||
@@ -21,13 +21,32 @@ fn build_ll(mut builder: cc::Build) {
|
||||
}
|
||||
|
||||
fn build_c(mut builder: cc::Build) {
|
||||
builder
|
||||
.file("src/math/math.c")
|
||||
.flag("-ffinite-math-only")
|
||||
.flag("-fassociative-math")
|
||||
.flag("-freciprocal-math")
|
||||
.flag("-fno-signed-zeros")
|
||||
.flag("-fno-trapping-math")
|
||||
.flag("-ffp-contract=fast")
|
||||
.compile("math")
|
||||
builder.flag("-O3");
|
||||
|
||||
#[cfg(feature = "finite-math-only")]
|
||||
builder.flag("-ffinite-math-only");
|
||||
|
||||
#[cfg(feature = "associative-math")]
|
||||
builder.flag("-fassociative-math");
|
||||
|
||||
#[cfg(feature = "reciprocal-math")]
|
||||
builder.flag("-freciprocal-math");
|
||||
|
||||
#[cfg(feature = "no-signed-zeros")]
|
||||
builder.flag("-fno-signed-zeros");
|
||||
|
||||
#[cfg(feature = "no-trapping-math")]
|
||||
builder.flag("-fno-trapping-math");
|
||||
|
||||
#[cfg(feature = "fp-contract-fast")]
|
||||
builder.flag("-ffp-contract=fast");
|
||||
|
||||
// TODO figure out if this works
|
||||
//#[cfg(feature = "approx-func")]
|
||||
//builder.flag("-Xclang -fapprox-func");
|
||||
|
||||
#[cfg(feature = "denormal-fp-math-preserve-sign")]
|
||||
builder.flag("-fdenormal-fp-math=preserve-sign");
|
||||
|
||||
builder.file("src/math/math.c").compile("math")
|
||||
}
|
||||
|
||||
91
src/lib.rs
91
src/lib.rs
@@ -1,12 +1,8 @@
|
||||
#![doc = include_str!("../README.md")]
|
||||
#![feature(core_intrinsics)] // intrinsics for the fast math
|
||||
#![feature(asm)] // asm used to emulate freeze
|
||||
#![feature(doc_cfg)]
|
||||
#![feature(link_llvm_intrinsics)]
|
||||
|
||||
use core::{
|
||||
cmp, fmt,
|
||||
intrinsics::{fadd_fast, fdiv_fast, fmul_fast, frem_fast, fsub_fast},
|
||||
iter::{Product, Sum},
|
||||
num::FpCategory,
|
||||
ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Rem, RemAssign, Sub, SubAssign},
|
||||
@@ -137,82 +133,6 @@ pub fn ff64(f: f64) -> FF64 {
|
||||
FF64::new(f)
|
||||
}
|
||||
|
||||
/// Derives the by-reference forms of a binary operator from its by-value form.
///
/// Given an existing `impl $op_trait<$right> for $left`, this emits the impls
/// for `&$left op $right`, `$left op &$right` and `&$left op &$right`. Each one
/// dereferences its reference operand(s) and forwards to the by-value impl, so
/// whichever type is taken by reference must be `Copy`.
macro_rules! impl_binary_refs {
    ($left:ident, $right:ident, $op_trait:ident, $op_fn:ident) => {
        // &left op right
        impl $op_trait<$right> for &$left {
            type Output = <$left as $op_trait<$right>>::Output;

            #[inline]
            fn $op_fn(self, rhs: $right) -> Self::Output {
                <$left as $op_trait<$right>>::$op_fn(*self, rhs)
            }
        }

        // left op &right
        impl $op_trait<&$right> for $left {
            type Output = <$left as $op_trait<$right>>::Output;

            #[inline]
            fn $op_fn(self, rhs: &$right) -> Self::Output {
                <$left as $op_trait<$right>>::$op_fn(self, *rhs)
            }
        }

        // &left op &right
        impl $op_trait<&$right> for &$left {
            type Output = <$left as $op_trait<$right>>::Output;

            #[inline]
            fn $op_fn(self, rhs: &$right) -> Self::Output {
                <$left as $op_trait<$right>>::$op_fn(*self, *rhs)
            }
        }
    };
}
|
||||
|
||||
/// Implements a list of binary operator traits for a fast-math wrapper type.
///
/// For every `Trait, method, intrinsic,` triple this emits:
/// * `fast op fast`, computed by `intrinsic` on the two wrapped values,
/// * `fast op base` and `base op fast`, which wrap the base operand with
///   `new` and forward to `fast op fast`,
/// * the by-reference forms of all three, via `impl_binary_refs!`.
macro_rules! impl_fast_ops {
    ($fast:ident, $base: ident: $($op_trait:ident, $method:ident, $intrinsic:ident,)*) => {
        $(
            // fast op fast
            impl $op_trait<$fast> for $fast {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, rhs: $fast) -> Self::Output {
                    // Safety:
                    //
                    // - poison operands are acceptable here: LLVM's fast ops are documented as
                    //   not producing UB on any inputs. They may yield poison for inf/nan
                    //   operands (or an inf/nan result), and that result is immediately
                    //   re-wrapped in the MaybePoison to control propagation.
                    let raw = unsafe {
                        $intrinsic(self.0.maybe_poison(), rhs.0.maybe_poison())
                    };
                    <$fast>::new(raw)
                }
            }

            // fast op base
            impl $op_trait<$base> for $fast {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, rhs: $base) -> Self::Output {
                    <$fast as $op_trait<$fast>>::$method(self, <$fast>::new(rhs))
                }
            }

            // base op fast
            impl $op_trait<$fast> for $base {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, rhs: $fast) -> Self::Output {
                    <$fast as $op_trait<$fast>>::$method(<$fast>::new(self), rhs)
                }
            }

            impl_binary_refs! { $fast, $fast, $op_trait, $method }
            impl_binary_refs! { $fast, $base, $op_trait, $method }
            impl_binary_refs! { $base, $fast, $op_trait, $method }
        )*
    };
}
|
||||
|
||||
macro_rules! impl_assign_ops {
|
||||
($fast_ty:ident, $base_ty: ident: $($op_trait:ident, $op_fn:ident, $op:ident,)*) => {
|
||||
$(
|
||||
@@ -375,7 +295,7 @@ macro_rules! impls {
|
||||
pub fn round(self) -> Self;
|
||||
pub fn sin(self) -> Self;
|
||||
pub fn sinh(self) -> Self;
|
||||
pub fn sqrt(self) -> Self;
|
||||
//pub fn sqrt(self) -> Self;
|
||||
pub fn tan(self) -> Self;
|
||||
pub fn tanh(self) -> Self;
|
||||
pub fn to_degrees(self) -> Self;
|
||||
@@ -442,15 +362,6 @@ macro_rules! impls {
|
||||
fmt::Debug, fmt::Display, fmt::LowerExp, fmt::UpperExp,
|
||||
}
|
||||
|
||||
impl_fast_ops! {
|
||||
$fast_ty, $base_ty:
|
||||
Add, add, fadd_fast,
|
||||
Sub, sub, fsub_fast,
|
||||
Mul, mul, fmul_fast,
|
||||
Div, div, fdiv_fast,
|
||||
Rem, rem, frem_fast,
|
||||
}
|
||||
|
||||
impl_assign_ops! {
|
||||
$fast_ty, $base_ty:
|
||||
AddAssign, add_assign, add,
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
#include <math.h>
|
||||
|
||||
#define IMPL_OPERATIONS(C_TYPE, RUST_TYPE) \
|
||||
/* TODO figure out why these don't inline */ \
|
||||
__attribute__((always_inline)) \
|
||||
C_TYPE add_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \
|
||||
return a + b; \
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use crate::{poison::MaybePoison, FF32, FF64};
|
||||
use core::ops::{Add, Div, Mul, Rem, Sub};
|
||||
use paste::paste;
|
||||
|
||||
impl FF32 {
|
||||
@@ -75,12 +76,95 @@ macro_rules! impl_generic_math {
|
||||
};
|
||||
}
|
||||
|
||||
/// Derives the by-reference forms of a binary operator from its by-value form.
///
/// Given an existing `impl $op_trait<$right> for $left`, this emits the impls
/// for `&$left op $right`, `$left op &$right` and `&$left op &$right`. Each one
/// dereferences its reference operand(s) and forwards to the by-value impl, so
/// whichever type is taken by reference must be `Copy`.
macro_rules! impl_binary_refs {
    ($left:ident, $right:ident, $op_trait:ident, $op_fn:ident) => {
        // &left op right
        impl $op_trait<$right> for &$left {
            type Output = <$left as $op_trait<$right>>::Output;

            #[inline]
            fn $op_fn(self, rhs: $right) -> Self::Output {
                <$left as $op_trait<$right>>::$op_fn(*self, rhs)
            }
        }

        // left op &right
        impl $op_trait<&$right> for $left {
            type Output = <$left as $op_trait<$right>>::Output;

            #[inline]
            fn $op_fn(self, rhs: &$right) -> Self::Output {
                <$left as $op_trait<$right>>::$op_fn(self, *rhs)
            }
        }

        // &left op &right
        impl $op_trait<&$right> for &$left {
            type Output = <$left as $op_trait<$right>>::Output;

            #[inline]
            fn $op_fn(self, rhs: &$right) -> Self::Output {
                <$left as $op_trait<$right>>::$op_fn(*self, *rhs)
            }
        }
    };
}
|
||||
|
||||
/// Implements a list of binary operator traits for a fast-math wrapper type.
///
/// For every `Trait, method, function,` triple this emits:
/// * `fast op fast`, computed by calling `function` on the two operands,
/// * `fast op base` and `base op fast`, which wrap the base operand with
///   `new` and forward to `fast op fast`,
/// * the by-reference forms of all three, via `impl_binary_refs!`.
macro_rules! impl_fast_ops {
    ($fast:ident, $base: ident: $($op_trait:ident, $method:ident, $extern_fn:ident,)*) => {
        $(
            // fast op fast
            impl $op_trait<$fast> for $fast {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, rhs: $fast) -> Self::Output {
                    // NOTE(review): no safety argument is stated for this call. It
                    // presumably relies on `$extern_fn` being defined for every value
                    // of its operands and on `$fast` being ABI-compatible with the
                    // callee's parameter type. Confirm against the extern declaration.
                    unsafe { $extern_fn(self, rhs) }
                }
            }

            // fast op base
            impl $op_trait<$base> for $fast {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, rhs: $base) -> Self::Output {
                    <$fast as $op_trait<$fast>>::$method(self, <$fast>::new(rhs))
                }
            }

            // base op fast
            impl $op_trait<$fast> for $base {
                type Output = $fast;

                #[inline(always)]
                fn $method(self, rhs: $fast) -> Self::Output {
                    <$fast as $op_trait<$fast>>::$method(<$fast>::new(self), rhs)
                }
            }

            impl_binary_refs! { $fast, $fast, $op_trait, $method }
            impl_binary_refs! { $fast, $base, $op_trait, $method }
            impl_binary_refs! { $base, $fast, $op_trait, $method }
        )*
    };
}
|
||||
|
||||
macro_rules! impl_extern_math {
|
||||
($fast_ty:ident, $base_ty:ident) => {
|
||||
paste! {
|
||||
extern "C" {
|
||||
fn [<add_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
|
||||
fn [<sub_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
|
||||
fn [<mul_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
|
||||
fn [<div_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
|
||||
fn [<rem_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
|
||||
|
||||
fn [<min_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
|
||||
fn [<max_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
|
||||
|
||||
fn [<sqrt_ $base_ty>](a: $fast_ty) -> $fast_ty;
|
||||
}
|
||||
|
||||
impl_fast_ops! {
|
||||
$fast_ty, $base_ty:
|
||||
Add, add, [<add_ $base_ty>],
|
||||
Sub, sub, [<sub_ $base_ty>],
|
||||
Mul, mul, [<mul_ $base_ty>],
|
||||
Div, div, [<div_ $base_ty>],
|
||||
Rem, rem, [<rem_ $base_ty>],
|
||||
}
|
||||
|
||||
impl $fast_ty {
|
||||
@@ -93,6 +177,11 @@ macro_rules! impl_extern_math {
|
||||
pub fn min(self, other: Self) -> Self {
|
||||
unsafe { [<min_ $base_ty>](self, other) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn sqrt(self) -> Self {
|
||||
unsafe { [<sqrt_ $base_ty>](self) }
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user