diff --git a/Cargo.toml b/Cargo.toml
index 61e1e5a..927888b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@
 name = "fast_fp"
 version = "0.1.0"
 authors = ["Renar Narubin <renar@standard.ai>"]
-edition = "2018"
+edition = "2021"
 readme = "README.md"
 license = "MIT OR Apache-2.0"
 
@@ -18,8 +18,30 @@ name = "math"
 harness = false
 
 [features]
-default = ["num-traits"]
+default = [
+  "num-traits",
+  "finite-math-only",
+  "associative-math",
+  "reciprocal-math",
+  "no-signed-zeros",
+  "no-trapping-math",
+  "fp-contract-fast",
+  "approx-func",
+]
 
+# default fast-math features
+finite-math-only = []
+associative-math = []
+reciprocal-math = []
+no-signed-zeros = []
+no-trapping-math = []
+fp-contract-fast = []
+approx-func = []
+
+# non-default fast-math-like features
+denormal-fp-math-preserve-sign = []
+
+# optional trait implementations
 nalgebra-v021 = ["num-traits", "nalgebra_v021", "simba_v01", "approx_v03"]
 nalgebra-v029 = ["num-traits", "nalgebra_v029", "simba_v06", "approx_v05"]
 
@@ -48,5 +70,5 @@ rand = "0.8"
 opt-level = 3
 
 [profile.release]
-lto = "fat"
-codegen-units = 1
+lto="thin"
+codegen-units=1
diff --git a/benches/operations.rs b/benches/operations.rs
index 34dd228..32b6b92 100644
--- a/benches/operations.rs
+++ b/benches/operations.rs
@@ -1,54 +1,122 @@
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use criterion::{
+    criterion_group, criterion_main, measurement::Measurement, BatchSize, BenchmarkGroup,
+    BenchmarkId, Criterion, Throughput,
+};
 use fast_fp::{ff32, ff64, FF32, FF64};
-use rand::{distributions::Standard, thread_rng, Rng};
+use rand::{
+    distributions::{self, Distribution},
+    rngs::StdRng,
+    Rng, SeedableRng,
+};
+use std::ops::{Add, Div, Mul};
 
-fn sum(c: &mut Criterion) {
-    let mut group = c.benchmark_group("sum");
-    for count in [2, 4, 8, 16, 64, 1024, 1 << 15] {
-        group.throughput(Throughput::Elements(count as u64));
+fn add(c: &mut Criterion) {
+    let mut group = c.benchmark_group("add");
 
-        let f32_vals = thread_rng()
-            .sample_iter(Standard)
-            .take(count)
-            .collect::<Vec<f32>>();
+    let rng = StdRng::from_entropy();
+    let f32s = distributions::Uniform::<f32>::new(0.0, 1.0);
+    let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);
 
-        // use the same values for both benchmarks
-        let ff32_vals = f32_vals
-            .clone()
-            .into_iter()
-            .map(ff32)
-            .collect::<Vec<FF32>>();
-
-        group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| {
-            b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val));
-        });
-
-        group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| {
-            b.iter(|| vals.iter().copied().fold(ff32(0.0), |acc, val| acc + val));
-        });
-
-        let f64_vals = thread_rng()
-            .sample_iter(Standard)
-            .take(count)
-            .collect::<Vec<f64>>();
-
-        // use the same values for both benchmarks
-        let ff64_vals = f64_vals
-            .clone()
-            .into_iter()
-            .map(ff64)
-            .collect::<Vec<FF64>>();
-
-        group.bench_with_input(BenchmarkId::new("std::f64", count), &f64_vals, |b, vals| {
-            b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val));
-        });
-
-        group.bench_with_input(BenchmarkId::new("FF64", count), &ff64_vals, |b, vals| {
-            b.iter(|| vals.iter().copied().fold(ff64(0.0), |acc, val| acc + val));
-        });
-    }
-    group.finish();
+    // clone the rng for each benched type to keep the generated values identical
+    fold(&mut group, "std::f32", f32::add, 0.0, rng.clone(), f32s);
+    fold(&mut group, "FF32", FF32::add, ff32(0.0), rng.clone(), f32s);
+    fold(&mut group, "std::f64", f64::add, 0.0, rng.clone(), f64s);
+    fold(&mut group, "FF64", FF64::add, ff64(0.0), rng.clone(), f64s);
 }
 
-criterion_group!(benches, sum);
+/// Benchmarks a running product over random inputs for `f32`, `FF32`, `f64`, and `FF64`.
+fn mul(c: &mut Criterion) {
+    let mut group = c.benchmark_group("mul");
+    let rng = StdRng::from_entropy();
+
+    // try to avoid subnormals/explosions by limiting the values near 1
+    let f32s = distributions::Uniform::<f32>::new(0.9, 1.1);
+    let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);
+
+    // seed with 1.0 (0.0 pins the product at zero); rng clones give every type the same inputs
+    fold(&mut group, "std::f32", f32::mul, 1.0, rng.clone(), f32s);
+    fold(&mut group, "FF32", FF32::mul, ff32(1.0), rng.clone(), f32s);
+    fold(&mut group, "std::f64", f64::mul, 1.0, rng.clone(), f64s);
+    fold(&mut group, "FF64", FF64::mul, ff64(1.0), rng.clone(), f64s);
+}
+
+/// Benchmarks a running quotient (`acc / x`) over random inputs for `f32`, `FF32`, `f64`, `FF64`.
+fn div(c: &mut Criterion) {
+    let mut group = c.benchmark_group("div");
+    let rng = StdRng::from_entropy();
+
+    // try to avoid subnormals/explosions by limiting the values near 1
+    let f32s = distributions::Uniform::<f32>::new(0.9, 1.1);
+    let f64s = distributions::Uniform::<f64>::new(0.9, 1.1);
+
+    // seed with 1.0 (0.0 pins the quotient at zero); rng clones give every type the same inputs
+    fold(&mut group, "std::f32", f32::div, 1.0, rng.clone(), f32s);
+    fold(&mut group, "FF32", FF32::div, ff32(1.0), rng.clone(), f32s);
+    fold(&mut group, "std::f64", f64::div, 1.0, rng.clone(), f64s);
+    fold(&mut group, "FF64", FF64::div, ff64(1.0), rng.clone(), f64s);
+}
+
+/// Benchmarks a running minimum, seeded at the sampled range's upper bound so inputs can win.
+fn min(c: &mut Criterion) {
+    let mut group = c.benchmark_group("min");
+    let rng = StdRng::from_entropy();
+    let f32s = distributions::Uniform::<f32>::new(0.0, 1.0);
+    let f64s = distributions::Uniform::<f64>::new(0.0, 1.0);
+
+    // clone the rng for each benched type to keep the generated values identical
+    fold(&mut group, "std::f32", f32::min, 1.0, rng.clone(), f32s);
+    fold(&mut group, "FF32", FF32::min, ff32(1.0), rng.clone(), f32s);
+    fold(&mut group, "std::f64", f64::min, 1.0, rng.clone(), f64s);
+    fold(&mut group, "FF64", FF64::min, ff64(1.0), rng.clone(), f64s);
+}
+
+/// Runs `fold_count` at each benchmarked length: 1, 2, 4, 8, 64, 256, and 1024 elements.
+/// Every length draws from the same `rng`, one after another, and shares the `id` label.
+fn fold<T: From<S> + Copy, S>(
+    group: &mut BenchmarkGroup<'_, impl Measurement>,
+    id: &str,
+    op: impl Fn(T, T) -> T + Copy,
+    init: T,
+    mut rng: impl Rng,
+    vals: impl Distribution<S> + Copy,
+) {
+    fold_count([init; 1], group, id, op, init, &mut rng, vals);
+    fold_count([init; 2], group, id, op, init, &mut rng, vals);
+    fold_count([init; 4], group, id, op, init, &mut rng, vals);
+    fold_count([init; 8], group, id, op, init, &mut rng, vals);
+    fold_count([init; 64], group, id, op, init, &mut rng, vals);
+    fold_count([init; 256], group, id, op, init, &mut rng, vals);
+    fold_count([init; 1024], group, id, op, init, &mut rng, vals);
+}
+
+/// Registers one benchmark labelled `id`/`N` that folds `N` sampled values with `op` from `init`.
+/// Inputs are regenerated in the batch setup closure by overwriting a copy of `arr`.
+fn fold_count<T: From<S> + Copy, S, const N: usize>(
+    arr: [T; N],
+    group: &mut BenchmarkGroup<'_, impl Measurement>,
+    id: &str,
+    op: impl Fn(T, T) -> T + Copy,
+    init: T,
+    mut rng: impl Rng,
+    vals: impl Distribution<S> + Copy,
+) {
+    group.throughput(Throughput::Elements(N as u64));
+
+    group.bench_function(BenchmarkId::new(id, N), |b| {
+        b.iter_batched_ref(
+            || {
+                // overwrite a copy of the template array with fresh samples, one per slot
+                let mut inputs = arr;
+                for slot in inputs.iter_mut() {
+                    *slot = T::from(vals.sample(&mut rng));
+                }
+                inputs
+            },
+            |inputs| inputs.iter().copied().fold(init, op),
+            BatchSize::SmallInput,
+        );
+    });
+}
+
+criterion_group!(benches, add, mul, div, min);
 criterion_main!(benches);
diff --git a/build.rs b/build.rs
index 343fbf3..04eee83 100644
--- a/build.rs
+++ b/build.rs
@@ -6,7 +6,7 @@ fn main() {
         builder.compiler("clang");
     }
 
-    builder.flag("-O3").flag("-flto=thin");
+    builder.flag("-flto=thin");
 
     build_ll(builder.clone());
     build_c(builder);
@@ -21,13 +21,32 @@ fn build_ll(mut builder: cc::Build) {
 }
 
 fn build_c(mut builder: cc::Build) {
-    builder
-        .file("src/math/math.c")
-        .flag("-ffinite-math-only")
-        .flag("-fassociative-math")
-        .flag("-freciprocal-math")
-        .flag("-fno-signed-zeros")
-        .flag("-fno-trapping-math")
-        .flag("-ffp-contract=fast")
-        .compile("math")
+    builder.flag("-O3");
+
+    #[cfg(feature = "finite-math-only")]
+    builder.flag("-ffinite-math-only");
+
+    #[cfg(feature = "associative-math")]
+    builder.flag("-fassociative-math");
+
+    #[cfg(feature = "reciprocal-math")]
+    builder.flag("-freciprocal-math");
+
+    #[cfg(feature = "no-signed-zeros")]
+    builder.flag("-fno-signed-zeros");
+
+    #[cfg(feature = "no-trapping-math")]
+    builder.flag("-fno-trapping-math");
+
+    #[cfg(feature = "fp-contract-fast")]
+    builder.flag("-ffp-contract=fast");
+
+    // TODO figure out if this works; NOTE(review): `-Xclang` and its argument presumably need separate flag() calls - confirm
+    //#[cfg(feature = "approx-func")]
+    //builder.flag("-Xclang -fapprox-func");
+
+    #[cfg(feature = "denormal-fp-math-preserve-sign")]
+    builder.flag("-fdenormal-fp-math=preserve-sign");
+
+    builder.file("src/math/math.c").compile("math")
 }
diff --git a/src/lib.rs b/src/lib.rs
index 5c0dba2..cbbefff 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,12 +1,8 @@
 #![doc = include_str!("../README.md")]
-#![feature(core_intrinsics)] // intrinsics for the fast math
-#![feature(asm)] // asm used to emulate freeze
 #![feature(doc_cfg)]
-#![feature(link_llvm_intrinsics)]
 
 use core::{
     cmp, fmt,
-    intrinsics::{fadd_fast, fdiv_fast, fmul_fast, frem_fast, fsub_fast},
     iter::{Product, Sum},
     num::FpCategory,
     ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Rem, RemAssign, Sub, SubAssign},
@@ -137,82 +133,6 @@ pub fn ff64(f: f64) -> FF64 {
     FF64::new(f)
 }
 
-macro_rules! impl_binary_refs {
-    ($lhs:ident, $rhs:ident, $op_trait:ident, $op_fn:ident) => {
-        impl $op_trait<$rhs> for &$lhs {
-            type Output = <$lhs as $op_trait<$rhs>>::Output;
-
-            #[inline]
-            fn $op_fn(self, other: $rhs) -> Self::Output {
-                (*self).$op_fn(other)
-            }
-        }
-        impl $op_trait<&$rhs> for $lhs {
-            type Output = <$lhs as $op_trait<$rhs>>::Output;
-
-            #[inline]
-            fn $op_fn(self, other: &$rhs) -> Self::Output {
-                self.$op_fn(*other)
-            }
-        }
-        impl $op_trait<&$rhs> for &$lhs {
-            type Output = <$lhs as $op_trait<$rhs>>::Output;
-
-            #[inline]
-            fn $op_fn(self, other: &$rhs) -> Self::Output {
-                (*self).$op_fn(*other)
-            }
-        }
-    };
-}
-
-macro_rules! impl_fast_ops {
-    ($fast_ty:ident, $base_ty: ident: $($op_trait:ident, $op_fn:ident, $op_impl:ident,)*) => {
-        $(
-            impl $op_trait <$fast_ty> for $fast_ty {
-                type Output = $fast_ty;
-
-                #[inline(always)]
-                fn $op_fn(self, other: $fast_ty) -> Self::Output {
-                    // Safety:
-                    //
-                    // - encountering poison operands is safe because LLVM's fast ops documents not producing
-                    // UB on any inputs; it may produce poison on inf/nan (or if the sum is inf/nan), but these
-                    // are then wrapped in the MaybePoison to control propagation
-                    <$fast_ty>::new(unsafe {
-                        $op_impl(
-                            self.0.maybe_poison(),
-                            other.0.maybe_poison(),
-                        )
-                    })
-                }
-            }
-
-            impl $op_trait <$base_ty> for $fast_ty {
-                type Output = $fast_ty;
-
-                #[inline(always)]
-                fn $op_fn(self, other: $base_ty) -> Self::Output {
-                    self.$op_fn(<$fast_ty>::new(other))
-                }
-            }
-
-            impl $op_trait <$fast_ty> for $base_ty {
-                type Output = $fast_ty;
-
-                #[inline(always)]
-                fn $op_fn(self, other: $fast_ty) -> Self::Output {
-                    <$fast_ty>::new(self).$op_fn(other)
-                }
-            }
-
-            impl_binary_refs! { $fast_ty, $fast_ty, $op_trait, $op_fn }
-            impl_binary_refs! { $fast_ty, $base_ty, $op_trait, $op_fn }
-            impl_binary_refs! { $base_ty, $fast_ty, $op_trait, $op_fn }
-        )*
-    };
-}
-
 macro_rules! impl_assign_ops {
     ($fast_ty:ident, $base_ty: ident: $($op_trait:ident, $op_fn:ident, $op:ident,)*) => {
         $(
@@ -375,7 +295,7 @@ macro_rules! impls {
                 pub fn round(self) -> Self;
                 pub fn sin(self) -> Self;
                 pub fn sinh(self) -> Self;
-                pub fn sqrt(self) -> Self;
+                //pub fn sqrt(self) -> Self;
                 pub fn tan(self) -> Self;
                 pub fn tanh(self) -> Self;
                 pub fn to_degrees(self) -> Self;
@@ -442,15 +362,6 @@ macro_rules! impls {
             fmt::Debug, fmt::Display, fmt::LowerExp, fmt::UpperExp,
         }
 
-        impl_fast_ops! {
-            $fast_ty, $base_ty:
-            Add, add, fadd_fast,
-            Sub, sub, fsub_fast,
-            Mul, mul, fmul_fast,
-            Div, div, fdiv_fast,
-            Rem, rem, frem_fast,
-        }
-
         impl_assign_ops! {
             $fast_ty, $base_ty:
             AddAssign, add_assign, add,
diff --git a/src/math/math.c b/src/math/math.c
index 8eec462..db6f9e5 100644
--- a/src/math/math.c
+++ b/src/math/math.c
@@ -2,7 +2,6 @@
 #include <math.h>
 
 #define IMPL_OPERATIONS(C_TYPE, RUST_TYPE)       \
-  /* TODO figure out why these don't inline */   \
   __attribute__((always_inline))                 \
   C_TYPE add_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \
     return a + b;                                \
diff --git a/src/math/mod.rs b/src/math/mod.rs
index e5f7553..8cef6bf 100644
--- a/src/math/mod.rs
+++ b/src/math/mod.rs
@@ -1,4 +1,5 @@
 use crate::{poison::MaybePoison, FF32, FF64};
+use core::ops::{Add, Div, Mul, Rem, Sub};
 use paste::paste;
 
 impl FF32 {
@@ -75,12 +76,95 @@ macro_rules! impl_generic_math {
     };
 }
 
+macro_rules! impl_binary_refs {
+    ($lhs:ident, $rhs:ident, $op_trait:ident, $op_fn:ident) => {
+        impl $op_trait<$rhs> for &$lhs {
+            type Output = <$lhs as $op_trait<$rhs>>::Output;
+            // `&lhs op rhs`: dereference the left operand, then call `$op_fn` on the value
+            #[inline]
+            fn $op_fn(self, other: $rhs) -> Self::Output {
+                (*self).$op_fn(other)
+            }
+        }
+        impl $op_trait<&$rhs> for $lhs {
+            type Output = <$lhs as $op_trait<$rhs>>::Output;
+            // `lhs op &rhs`: dereference the right operand, then call `$op_fn` on the values
+            #[inline]
+            fn $op_fn(self, other: &$rhs) -> Self::Output {
+                self.$op_fn(*other)
+            }
+        }
+        impl $op_trait<&$rhs> for &$lhs {
+            type Output = <$lhs as $op_trait<$rhs>>::Output;
+            // `&lhs op &rhs`: dereference both operands, then call `$op_fn` on the values
+            #[inline]
+            fn $op_fn(self, other: &$rhs) -> Self::Output {
+                (*self).$op_fn(*other)
+            }
+        }
+    };
+}
+
+macro_rules! impl_fast_ops {
+    ($fast_ty:ident, $base_ty: ident: $($op_trait:ident, $op_fn:ident, $op_impl:ident,)*) => {
+        $(
+            impl $op_trait <$fast_ty> for $fast_ty {
+                type Output = $fast_ty;
+                // fast op fast: calls `$op_impl`; NOTE(review): the unsafe block lacks a SAFETY rationale
+                #[inline(always)]
+                fn $op_fn(self, other: $fast_ty) -> Self::Output {
+                    unsafe { $op_impl(self, other) }
+                }
+            }
+
+            impl $op_trait <$base_ty> for $fast_ty {
+                type Output = $fast_ty;
+                // fast op base: wrap the base operand with `<$fast_ty>::new`, then reuse `$op_fn`
+                #[inline(always)]
+                fn $op_fn(self, other: $base_ty) -> Self::Output {
+                    self.$op_fn(<$fast_ty>::new(other))
+                }
+            }
+
+            impl $op_trait <$fast_ty> for $base_ty {
+                type Output = $fast_ty;
+                // base op fast: wrap the base operand with `<$fast_ty>::new`, then reuse `$op_fn`
+                #[inline(always)]
+                fn $op_fn(self, other: $fast_ty) -> Self::Output {
+                    <$fast_ty>::new(self).$op_fn(other)
+                }
+            }
+
+            impl_binary_refs! { $fast_ty, $fast_ty, $op_trait, $op_fn }
+            impl_binary_refs! { $fast_ty, $base_ty, $op_trait, $op_fn }
+            impl_binary_refs! { $base_ty, $fast_ty, $op_trait, $op_fn }
+        )*
+    };
+}
+
 macro_rules! impl_extern_math {
     ($fast_ty:ident, $base_ty:ident) => {
         paste! {
             extern "C" {
+                fn [<add_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
+                fn [<sub_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
+                fn [<mul_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
+                fn [<div_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
+                fn [<rem_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
+
                 fn [<min_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
                 fn [<max_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
+
+                fn [<sqrt_ $base_ty>](a: $fast_ty) -> $fast_ty;
+            }
+
+            impl_fast_ops! {
+                $fast_ty, $base_ty:
+                Add, add, [<add_ $base_ty>],
+                Sub, sub, [<sub_ $base_ty>],
+                Mul, mul, [<mul_ $base_ty>],
+                Div, div, [<div_ $base_ty>],
+                Rem, rem, [<rem_ $base_ty>],
             }
 
             impl $fast_ty {
@@ -93,6 +177,11 @@ macro_rules! impl_extern_math {
                 pub fn min(self, other: Self) -> Self {
                     unsafe { [<min_ $base_ty>](self, other) }
                 }
+
+                #[inline]
+                pub fn sqrt(self) -> Self {
+                    unsafe { [<sqrt_ $base_ty>](self) }
+                }
             }
         }
     };