diff --git a/Cargo.toml b/Cargo.toml
index 2132a06..61e1e5a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,10 +6,17 @@ edition = "2018"
 readme = "README.md"
 license = "MIT OR Apache-2.0"
 
+[lib]
+bench = false
+
 [[bench]]
 name = "operations"
 harness = false
 
+[[bench]]
+name = "math"
+harness = false
+
 [features]
 default = ["num-traits"]
 
@@ -17,6 +24,8 @@ nalgebra-v021 = ["num-traits", "nalgebra_v021", "simba_v01", "approx_v03"]
 nalgebra-v029 = ["num-traits", "nalgebra_v029", "simba_v06", "approx_v05"]
 
 [dependencies]
+paste = "1"
+
 num-traits = { version = "0.2", optional = true }
 
 approx_v03 = { package = "approx", version = "0.3", optional = true }
@@ -27,6 +36,9 @@ approx_v05 = { package = "approx", version = "0.5", optional = true }
 nalgebra_v029 = { package = "nalgebra", version = "0.29", optional = true }
 simba_v06 = { package = "simba", version = "0.6", optional = true }
 
+[build-dependencies]
+cc = "1"
+
 [dev-dependencies]
 criterion = { version = "0.3", features = ["html_reports"] }
 rand = "0.8"
@@ -34,3 +46,7 @@ rand = "0.8"
 [profile.test]
 # run tests at high optimization to exercise typical codegen
 opt-level = 3
+
+[profile.release]
+lto = "fat"
+codegen-units = 1
diff --git a/benches/math.rs b/benches/math.rs
new file mode 100644
index 0000000..9dbc7a1
--- /dev/null
+++ b/benches/math.rs
@@ -0,0 +1,38 @@
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use fast_fp::{ff32, FF32};
+use rand::{distributions::Standard, thread_rng, Rng};
+
+fn min(c: &mut Criterion) {
+    let mut group = c.benchmark_group("min");
+    for count in [2, 8, 32, 1024] {
+        group.throughput(Throughput::Elements(count as u64));
+
+        let f32_vals = thread_rng()
+            .sample_iter(Standard)
+            .take(count)
+            .collect::<Vec<f32>>();
+
+        // use the same values for both benchmarks
+        let ff32_vals = f32_vals
+            .clone()
+            .into_iter()
+            .map(ff32)
+            .collect::<Vec<FF32>>();
+
+        group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| {
+            b.iter(|| vals.iter().copied().fold(f32::MAX, |acc, val| acc.min(val)));
+        });
+
+        group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| {
+            b.iter(|| {
+                vals.iter()
+                    .copied()
+                    .fold(FF32::MAX, |acc, val| acc.min(val))
+            });
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, min);
+criterion_main!(benches);
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..7ac7c0f
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,20 @@
+fn main() {
+    let mut builder = cc::Build::new();
+
+    if !builder.get_compiler().is_like_clang() {
+        // if the default/configured cc is not clang, try to call clang manually
+        builder.compiler("clang");
+    }
+
+    builder
+        .file("src/math/math.c")
+        .flag("-O3")
+        .flag("-flto=thin")
+        .flag("-ffinite-math-only")
+        .flag("-fassociative-math")
+        .flag("-freciprocal-math")
+        .flag("-fno-signed-zeros")
+        .flag("-fno-trapping-math")
+        .flag("-ffp-contract=fast")
+        .compile("math")
+}
diff --git a/src/lib.rs b/src/lib.rs
index 8e8be61..e886e9a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,6 +2,8 @@
 #![feature(core_intrinsics)] // intrinsics for the fast math
 #![feature(asm)] // asm used to emulate freeze
 #![feature(doc_cfg)]
+#![feature(link_llvm_intrinsics)]
+
 use core::{
     cmp, fmt,
     intrinsics::{fadd_fast, fdiv_fast, fmul_fast, frem_fast, fsub_fast},
@@ -40,6 +42,7 @@ macro_rules! forward_freeze_self {
     };
 }
 
+mod math;
 mod nalgebra;
 mod num_traits;
 
@@ -347,7 +350,6 @@ macro_rules! impls {
             // TODO migrate these to native implementations to freeze less and fast-math more
             forward_freeze_self! {
                 $fast_ty, $base_ty
-                pub fn abs(self) -> Self;
                 pub fn acos(self) -> Self;
                 pub fn acosh(self) -> Self;
                 pub fn asin(self) -> Self;
@@ -358,7 +360,6 @@ macro_rules! impls {
                 pub fn cbrt(self) -> Self;
                 pub fn ceil(self) -> Self;
                 pub fn clamp(self, min: Self, max: Self) -> Self;
-                pub fn copysign(self, sign: Self) -> Self;
                 pub fn cos(self) -> Self;
                 pub fn cosh(self) -> Self;
                 pub fn div_euclid(self, rhs: Self) -> Self;
@@ -367,20 +368,17 @@ macro_rules! impls {
                 pub fn exp_m1(self) -> Self;
                 pub fn floor(self) -> Self;
                 pub fn fract(self) -> Self;
-                pub fn hypot(self, other: Self) -> Self;
                 pub fn ln(self) -> Self;
                 pub fn ln_1p(self) -> Self;
                 pub fn log(self, base: Self) -> Self;
                 pub fn log10(self) -> Self;
                 pub fn log2(self) -> Self;
-                pub fn max(self, other: Self) -> Self;
-                pub fn min(self, other: Self) -> Self;
+                //pub fn max(self, other: Self) -> Self;
+                //pub fn min(self, other: Self) -> Self;
                 pub fn mul_add(self, a: Self, b: Self) -> Self;
                 pub fn powf(self, n: Self) -> Self;
-                pub fn recip(self) -> Self;
                 pub fn rem_euclid(self, rhs: Self) -> Self;
                 pub fn round(self) -> Self;
-                pub fn signum(self) -> Self;
                 pub fn sin(self) -> Self;
                 pub fn sinh(self) -> Self;
                 pub fn sqrt(self) -> Self;
diff --git a/src/math/math.c b/src/math/math.c
new file mode 100644
index 0000000..8eec462
--- /dev/null
+++ b/src/math/math.c
@@ -0,0 +1,76 @@
+#include <stdbool.h>
+#include <math.h>
+
+#define IMPL_OPERATIONS(C_TYPE, RUST_TYPE)       \
+  /* TODO figure out why these don't inline */   \
+  __attribute__((always_inline))                 \
+  C_TYPE add_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \
+    return a + b;                                \
+  }                                              \
+                                                 \
+  __attribute__((always_inline))                 \
+  C_TYPE sub_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \
+    return a - b;                                \
+  }                                              \
+                                                 \
+  __attribute__((always_inline))                 \
+  C_TYPE mul_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \
+    return a * b;                                \
+  }                                              \
+                                                 \
+  __attribute__((always_inline))                 \
+  C_TYPE div_ ## RUST_TYPE(C_TYPE a, C_TYPE b) { \
+    return a / b;                                \
+  }                                              \
+                                                 \
+  __attribute__((always_inline))                 \
+  bool eq_ ## RUST_TYPE(C_TYPE a, C_TYPE b) {    \
+    return a == b;                               \
+  }                                              \
+                                                 \
+  __attribute__((always_inline))                 \
+  bool lt_ ## RUST_TYPE(C_TYPE a, C_TYPE b) {    \
+    return a < b;                                \
+  }                                              \
+                                                 \
+  __attribute__((always_inline))                 \
+  bool le_ ## RUST_TYPE(C_TYPE a, C_TYPE b) {    \
+    return a <= b;                               \
+  }                                              \
+                                                 \
+  __attribute__((always_inline))                 \
+  bool gt_ ## RUST_TYPE(C_TYPE a, C_TYPE b) {    \
+    return a > b;                                \
+  }                                              \
+                                                 \
+  __attribute__((always_inline))                 \
+  bool ge_ ## RUST_TYPE(C_TYPE a, C_TYPE b) {    \
+    return a >= b;                               \
+  }                                              \
+
+#define IMPL_UNARY_FUNCTION(C_TYPE, RUST_TYPE, FN_NAME, FN_IMPL) \
+  __attribute__((always_inline))                                 \
+  C_TYPE FN_NAME ## _ ## RUST_TYPE(C_TYPE a) {                   \
+    return FN_IMPL(a);                                           \
+  }                                                              \
+
+#define IMPL_BINARY_FUNCTION(C_TYPE, RUST_TYPE, FN_NAME, FN_IMPL) \
+  __attribute__((always_inline))                                  \
+  C_TYPE FN_NAME ## _ ## RUST_TYPE(C_TYPE a, C_TYPE b) {          \
+    return FN_IMPL(a, b);                                         \
+  }                                                               \
+
+IMPL_OPERATIONS(float, f32)
+IMPL_OPERATIONS(double, f64)
+
+IMPL_UNARY_FUNCTION(float, f32, sqrt, sqrtf)
+IMPL_UNARY_FUNCTION(double, f64, sqrt, sqrt)
+
+IMPL_BINARY_FUNCTION(float, f32, rem, fmodf)
+IMPL_BINARY_FUNCTION(double, f64, rem, fmod)
+
+IMPL_BINARY_FUNCTION(float, f32, max, fmaxf)
+IMPL_BINARY_FUNCTION(double, f64, max, fmax)
+
+IMPL_BINARY_FUNCTION(float, f32, min, fminf)
+IMPL_BINARY_FUNCTION(double, f64, min, fmin)
diff --git a/src/math/mod.rs b/src/math/mod.rs
new file mode 100644
index 0000000..93313cb
--- /dev/null
+++ b/src/math/mod.rs
@@ -0,0 +1,105 @@
+use crate::{poison::MaybePoison, FF32, FF64};
+use paste::paste;
+
+impl FF32 {
+    const SIGN_BIT: u32 = 0x8000_0000;
+    const UNSIGNED_MASK: u32 = 0x7fff_ffff;
+}
+
+impl FF64 {
+    const SIGN_BIT: u64 = 0x8000_0000_0000_0000;
+    const UNSIGNED_MASK: u64 = 0x7fff_ffff_ffff_ffff;
+}
+
+macro_rules! impl_generic_math {
+    ($fast_ty:ident, $base_ty:ident, $base_int:ident) => {
+        impl $fast_ty {
+            #[inline]
+            fn to_bits(self) -> MaybePoison<$base_int> {
+                // Safety:
+                //
+                // - `to_bits` should be valid for any input bits
+                // - poison propagation is controlled with MaybePoison
+                MaybePoison::new(unsafe { <$base_ty>::to_bits(*self.0.maybe_poison().as_ptr()) })
+            }
+
+            #[inline]
+            fn from_bits(bits: MaybePoison<$base_int>) -> Self {
+                // Safety:
+                //
+                // - `from_bits` should be valid for any input bits
+                // - poison propagation is controlled with MaybePoison
+                Self(MaybePoison::new(unsafe {
+                    <$base_ty>::from_bits(*bits.maybe_poison().as_ptr())
+                }))
+            }
+
+            #[inline]
+            pub fn abs(self) -> Self {
+                let bits = self.to_bits();
+                <$fast_ty>::from_bits(MaybePoison::new(unsafe {
+                    *bits.maybe_poison().as_ptr() & Self::UNSIGNED_MASK
+                }))
+            }
+
+            #[inline]
+            pub fn copysign(self, other: Self) -> Self {
+                let this = self.to_bits();
+                let that = other.to_bits();
+
+                // Safety:
+                //
+                // - & of poison is safe because & does not produce UB for any input values
+                // - poison propagation is handled by wrapping in maybe poison
+                <$fast_ty>::from_bits(MaybePoison::new(unsafe {
+                    (*this.maybe_poison().as_ptr() & Self::UNSIGNED_MASK)
+                        | (*that.maybe_poison().as_ptr() & Self::SIGN_BIT)
+                }))
+            }
+
+            #[inline]
+            pub fn hypot(self, other: Self) -> Self {
+                (self * self + other * other).sqrt()
+            }
+
+            #[inline]
+            pub fn signum(self) -> Self {
+                Self::ONE.copysign(self)
+            }
+
+            #[inline]
+            pub fn recip(self) -> Self {
+                Self::ONE / self
+            }
+        }
+    };
+}
+
+macro_rules! impl_extern_math {
+    ($fast_ty:ident, $base_ty:ident) => {
+        paste! {
+            extern "C" {
+                fn [<min_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
+                fn [<max_ $base_ty>](a: $fast_ty, b: $fast_ty) -> $fast_ty;
+            }
+
+            impl $fast_ty {
+                #[inline]
+                pub fn max(self, other: Self) -> Self {
+                    unsafe { [<max_ $base_ty>](self, other) }
+                }
+
+                #[inline]
+                pub fn min(self, other: Self) -> Self {
+                    unsafe { [<min_ $base_ty>](self, other) }
+                }
+            }
+        }
+    };
+}
+
+impl_generic_math! { FF32, f32, u32 }
+impl_generic_math! { FF64, f64, u64 }
+
+impl_extern_math! { FF32, f32 }
+impl_extern_math! { FF64, f64 }