extern arithmetic

2021-11-23 16:52:04 -08:00
parent 43dc1419a8
commit 5a5289f43e
6 changed files with 260 additions and 152 deletions
--- a/benches/operations.rs
+++ b/benches/operations.rs
@@ -1,54 +1,122 @@
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use criterion::{
+    criterion_group, criterion_main, measurement::Measurement, BatchSize, BenchmarkGroup,
+    BenchmarkId, Criterion, Throughput,
+};
 use fast_fp::{ff32, ff64, FF32, FF64};
-use rand::{distributions::Standard, thread_rng, Rng};
+use rand::{
+    distributions::{self, Distribution},
+    rngs::StdRng,
+    Rng, SeedableRng,
+};
+use std::ops::{Add, Div, Mul};

-fn sum(c: &mut Criterion) {
-    let mut group = c.benchmark_group("sum");
-    for count in [2, 4, 8, 16, 64, 1024, 1 << 15] {
-        group.throughput(Throughput::Elements(count as u64));
+/// Benchmarks an addition fold for `f32`, `FF32`, `f64` and `FF64`.
+///
+/// Inputs are drawn from `Uniform::new(0.0, 1.0)` and accumulated starting from
+/// zero, the additive identity, so every sampled value contributes to the result.
+fn add(c: &mut Criterion) {
+    let mut group = c.benchmark_group("add");

-        let f32_vals = thread_rng()
-            .sample_iter(Standard)
-            .take(count)
-            .collect::<Vec<f32>>();
+    let rng = StdRng::from_entropy();
+    let f32s = distributions::Uniform::<f32>::new(0.0, 1.0);
+    let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);

-        // use the same values for both benchmarks
-        let ff32_vals = f32_vals
-            .clone()
-            .into_iter()
-            .map(ff32)
-            .collect::<Vec<FF32>>();
-
-        group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| {
-            b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val));
-        });
-
-        group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| {
-            b.iter(|| vals.iter().copied().fold(ff32(0.0), |acc, val| acc + val));
-        });
-
-        let f64_vals = thread_rng()
-            .sample_iter(Standard)
-            .take(count)
-            .collect::<Vec<f64>>();
-
-        // use the same values for both benchmarks
-        let ff64_vals = f64_vals
-            .clone()
-            .into_iter()
-            .map(ff64)
-            .collect::<Vec<FF64>>();
-
-        group.bench_with_input(BenchmarkId::new("std::f64", count), &f64_vals, |b, vals| {
-            b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val));
-        });
-
-        group.bench_with_input(BenchmarkId::new("FF64", count), &ff64_vals, |b, vals| {
-            b.iter(|| vals.iter().copied().fold(ff64(0.0), |acc, val| acc + val));
-        });
-    }
-    group.finish();
+    // clone the rng for each benched type so every run starts from the same
+    // generator state and therefore sees the same random stream
+    fold(&mut group, "std::f32", f32::add, 0.0, rng.clone(), f32s);
+    fold(&mut group, "FF32", FF32::add, ff32(0.0), rng.clone(), f32s);
+    fold(&mut group, "std::f64", f64::add, 0.0, rng.clone(), f64s);
+    fold(&mut group, "FF64", FF64::add, ff64(0.0), rng.clone(), f64s);

+    // finish explicitly, as the replaced `sum` benchmark did, instead of
+    // relying on the group being dropped
+    group.finish();
 }

-criterion_group!(benches, sum);
+/// Benchmarks a multiplication fold for `f32`, `FF32`, `f64` and `FF64`.
+///
+/// Inputs are drawn from `Uniform::new(0.9, 1.1)` and the running product is
+/// seeded with one, the multiplicative identity.
+fn mul(c: &mut Criterion) {
+    let mut group = c.benchmark_group("mul");

+    let rng = StdRng::from_entropy();

+    // try to avoid subnormals/explosions by limiting the values near 1
+    let f32s = distributions::Uniform::<f32>::new(0.9, 1.1);
+    let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);

+    // seed with 1 rather than 0: a zero seed pins every product at zero, so the
+    // sampled values never influence the result.
+    // NOTE(review): the FF types presumably permit fast-math style folding, in
+    // which case `0 * x` could be optimized away entirely -- confirm.
+    //
+    // clone the rng for each benched type so every run starts from the same
+    // generator state and therefore sees the same random stream
+    fold(&mut group, "std::f32", f32::mul, 1.0, rng.clone(), f32s);
+    fold(&mut group, "FF32", FF32::mul, ff32(1.0), rng.clone(), f32s);
+    fold(&mut group, "std::f64", f64::mul, 1.0, rng.clone(), f64s);
+    fold(&mut group, "FF64", FF64::mul, ff64(1.0), rng.clone(), f64s);

+    // finish explicitly instead of relying on the group being dropped
+    group.finish();
+}
+
+/// Benchmarks a division fold for `f32`, `FF32`, `f64` and `FF64`.
+///
+/// Inputs are drawn from `Uniform::new(0.9, 1.1)` and the running quotient is
+/// seeded with one, so it stays a nonzero value that depends on every divisor.
+fn div(c: &mut Criterion) {
+    let mut group = c.benchmark_group("div");

+    let rng = StdRng::from_entropy();

+    // try to avoid subnormals/explosions by limiting the values near 1
+    let f32s = distributions::Uniform::<f32>::new(0.9, 1.1);
+    let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);

+    // seed with 1 rather than 0: a zero dividend keeps every quotient at zero,
+    // so the sampled divisors never influence the result.
+    // NOTE(review): the FF types presumably permit fast-math style folding, in
+    // which case `0 / x` could be optimized away entirely -- confirm.
+    //
+    // clone the rng for each benched type so every run starts from the same
+    // generator state and therefore sees the same random stream
+    fold(&mut group, "std::f32", f32::div, 1.0, rng.clone(), f32s);
+    fold(&mut group, "FF32", FF32::div, ff32(1.0), rng.clone(), f32s);
+    fold(&mut group, "std::f64", f64::div, 1.0, rng.clone(), f64s);
+    fold(&mut group, "FF64", FF64::div, ff64(1.0), rng.clone(), f64s);

+    // finish explicitly instead of relying on the group being dropped
+    group.finish();
+}
+
+/// Benchmarks a minimum fold for `f32`, `FF32`, `f64` and `FF64`.
+///
+/// Inputs are drawn from `Uniform::new(0.0, 1.0)` and the running minimum is
+/// seeded with one, which is not below any sampled value.
+fn min(c: &mut Criterion) {
+    let mut group = c.benchmark_group("min");

+    let rng = StdRng::from_entropy();
+    let f32s = distributions::Uniform::<f32>::new(0.0, 1.0);
+    let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);

+    // seed with the upper bound of the sampled range rather than 0: no sample
+    // is below zero, so a zero seed would always win and the inputs would
+    // never affect the result
+    //
+    // clone the rng for each benched type so every run starts from the same
+    // generator state and therefore sees the same random stream
+    fold(&mut group, "std::f32", f32::min, 1.0, rng.clone(), f32s);
+    fold(&mut group, "FF32", FF32::min, ff32(1.0), rng.clone(), f32s);
+    fold(&mut group, "std::f64", f64::min, 1.0, rng.clone(), f64s);
+    fold(&mut group, "FF64", FF64::min, ff64(1.0), rng.clone(), f64s);

+    // finish explicitly instead of relying on the group being dropped
+    group.finish();
+}
+
+/// Runs `fold_count` once for each benchmarked input length.
+///
+/// * `group` - benchmark group the individual benchmarks are added to
+/// * `id` - label passed through to each benchmark id
+/// * `op` - binary operation used to reduce the inputs
+/// * `init` - starting accumulator, also used to fill the length-carrying arrays
+/// * `rng` - random source, shared by mutable reference across all lengths
+/// * `vals` - distribution the inputs are sampled from
+fn fold<T, S>(
+    group: &mut BenchmarkGroup<'_, impl Measurement>,
+    id: &str,
+    op: impl Fn(T, T) -> T + Copy,
+    init: T,
+    mut rng: impl Rng,
+    vals: impl Distribution<S> + Copy,
+) where
+    T: From<S> + Copy,
+{
+    // the input length is a const generic of `fold_count`, inferred from the
+    // array argument, so every length needs its own call rather than a loop
+    macro_rules! bench_lengths {
+        ($($len:literal),+ $(,)?) => {
+            $(fold_count([init; $len], group, id, op, init, &mut rng, vals);)+
+        };
+    }

+    bench_lengths!(1, 2, 4, 8, 64, 256, 1024);
+}
+
+/// Adds one benchmark to `group` that folds `N` sampled inputs with `op`.
+///
+/// The group's throughput is set to `N` elements and the benchmark id is built
+/// from `id` and `N`. Each batch input is a copy of `arr` whose elements are
+/// overwritten with values sampled from `vals` via `rng` and converted with
+/// `T::from`; the measured routine reduces that array starting from `init`.
+fn fold_count<T, S, const N: usize>(
+    arr: [T; N],
+    group: &mut BenchmarkGroup<'_, impl Measurement>,
+    id: &str,
+    op: impl Fn(T, T) -> T + Copy,
+    init: T,
+    mut rng: impl Rng,
+    vals: impl Distribution<S> + Copy,
+) where
+    T: From<S> + Copy,
+{
+    group.throughput(Throughput::Elements(N as u64));

+    // setup step: produce a freshly sampled input array for the routine
+    let mut fill = || {
+        let mut inputs = arr;
+        let samples = (&mut rng).sample_iter(&vals);
+        for (slot, sample) in inputs.iter_mut().zip(samples) {
+            *slot = T::from(sample);
+        }
+        inputs
+    };

+    group.bench_function(BenchmarkId::new(id, N), |b| {
+        b.iter_batched_ref(
+            &mut fill,
+            |inputs| inputs.iter().copied().fold(init, op),
+            BatchSize::SmallInput,
+        );
+    });
+}
+
+criterion_group!(benches, add, mul, div, min);
 criterion_main!(benches);