Proof of concept

2021-11-04 15:35:47 -07:00
parent 4b04b01fda
commit 87b0aeab0c
7 changed files with 393 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "fast_fp"
+version = "0.1.0"
+authors = ["Renar Narubin <renar@standard.ai>"]
+edition = "2018"
+readme = "README.md"
+license = "MIT OR Apache-2.0"
+
+[[bench]]
+name = "operations"
+harness = false
+
+[dependencies]
+num-traits = { version = "0.2", optional = true }
+
+[dev-dependencies]
+criterion = "0.3"
+rand = "0.8"
--- a/README.md
+++ b/README.md
@@ -0,0 +1,40 @@
+# Fast Floating-Point Math
+
+`fast_fp` provides a set of primitive types that support [fast-math]
+optimizations for many operations. These optimizations allow the compiler to
+potentially generate faster code by relaxing some of the requirements of [IEEE
+754] floating-point arithmetic.
+
+This may result in different outputs than operations on the standard float
+primitives like `f32`, particularly where fine-grained precision is important.
+`fast-math` may allow reordering operations in such a way that some precision
+is lost in the overall computation. Note that there are also cases where
+fast-math optimizations can _improve_ precision, such as contracting separate
+multiplication and addition into a fused multiply-add operation.
+
+## Limitations
+
+In order to enable these optimizations safely, certain requirements must be
+observed:
+
+- Operations **MUST NOT** involve infinite or NaN values. If the arguments to an
+	operation are, or the results of an operation _would_ be, `+inf`, `-inf`,
+	or `NaN`, then the operation's result value is unspecified. This crate goes
+	to lengths to ensure that such an operation is not Undefined Behavior in the
+	strict sense, but the output is free to be any representable value of the
+	output type, and may not be a fixed value at all.
+- Use of this crate's primitives may not be faster than the standard primitives
+	in all cases. That may be because the generated code is slower in practice,
+	or because of certain measures taken by this crate to prevent UB (in
+	particular for comparison heavy code). Users should carefully measure and
+	benchmark their code to understand whether they actually benefit from use of
+	these types.
+- The safety of this crate is only assessed against rustc's LLVM code
+	generation. This crate should not be used with alternative code generators
+	such as cranelift or GCC.
+- Signed-ness of zeros may be treated as insignificant and not preserved.
+
+[TODO]: # (is there a way to detect the code generator at build time?)
+
+[fast-math]: https://llvm.org/docs/LangRef.html#fast-math-flags
+[IEEE 754]: https://en.wikipedia.org/wiki/IEEE_754
--- a/benches/operations.rs
+++ b/benches/operations.rs
@@ -0,0 +1,34 @@
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use fast_fp::{ff32, FF32};
+use rand::{distributions::Standard, thread_rng, Rng};
+
+/// Benchmark summing a slice of floats, comparing plain `f32` against `FF32` on the same data.
+fn sum(c: &mut Criterion) {
+    // Input lengths to measure, from a couple of values up to 32768.
+    const LENGTHS: [usize; 7] = [2, 4, 8, 16, 64, 1024, 1 << 15];
+
+    let mut group = c.benchmark_group("sum");
+    for len in LENGTHS {
+        group.throughput(Throughput::Elements(len as u64));
+
+        let plain: Vec<f32> = thread_rng().sample_iter(Standard).take(len).collect();
+
+        // wrap the very same samples so that both benchmarks add up identical data
+        let fast: Vec<FF32> = plain.iter().copied().map(ff32).collect();
+
+        group.bench_with_input(BenchmarkId::new("std::f32", len), &plain, |b, vals| {
+            b.iter(|| vals.iter().copied().fold(0.0, |total, x| total + x));
+        });
+
+        group.bench_with_input(BenchmarkId::new("FF32", len), &fast, |b, vals| {
+            b.iter(|| vals.iter().copied().fold(ff32(0.0), |total, x| total + x));
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, sum);
+criterion_main!(benches);
--- a/1
+++ b/1
@@ -0,0 +1 @@
+nightly-2021-11-03
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -0,0 +1,224 @@
+#![doc = include_str!("../README.md")]
+#![feature(core_intrinsics)] // intrinsics for the fast math
+#![feature(asm)] // asm used to emulate freeze
+use core::{cmp, fmt, intrinsics::fadd_fast, ops};
+
+mod poison;
+use poison::MaybePoison;
+
+/// The error returned by the checked constructors of [`FF32`] and [`FF64`]
+#[derive(Clone, Debug, PartialEq)]
+pub struct InvalidValueError {
+    _priv: (),
+}
+
+impl fmt::Display for InvalidValueError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.write_str("value may not be infinite or NaN")
+    }
+}
+
+impl std::error::Error for InvalidValueError {}
+
+// The big challenge with fast-math in general is avoiding UB, and to a lesser extent unspecified
+// values. LLVM's fast operations document "poison" behavior when given invalid inputs; poison
+// values have a relatively consistent behavior (stuff like transitivity), defined cases for UB,
+// and importantly can be limited in scope by freezing to a fixed value.
+//
+// This library handles poison by limiting its reach to only the pure arithmetic operations on the
+// wrapper float types. Any arbitrary FF32 is considered possibly invalid (containing +-inf or NaN)
+// because it's not feasible to track validity (without running all operations in parallel with
+// unfast-math and thus negating any possible improvement). Float add/sub/mul/div/rem are permitted
+// on the possibly poison values (as documented by LLVM), producing transitively poison results,
+// then wrapped in FF32. Any other operations require the value to be not-poison in order to be
+// not-UB: anything like comparison/printing/conversion/casting/etc is done on frozen copies of
+// the data. Originating values that were valid will pass through the arithmetic and freezing
+// exactly as they are; invalid values will become poison through the arithmetic and then be frozen
+// to some unspecified value. The user may encounter garbage in such a case, but not in a way that
+// triggers UB.
+//
+// Prior art and references
+//
+// https://github.com/rust-lang/rust/issues/21690
+// Task for general purpose fast-math in rust lang. Discussions about the right approach
+// and generalizability, including whether it should be type-based or annotation based. fast_fp
+// uses types wrapping intrinsics because it's the only option available in user space, and gets
+// good optimizations useful in practice
+//
+// https://docs.rs/fast-floats/0.2.0/fast_floats/index.html
+// Another crate that wraps fast intrinsics in types. They didn't address poison propagation,
+// leaving constructors unsafe
+//
+// https://llvm.org/docs/LangRef.html#fast-math-flags
+// LLVM's documentation on fast-math
+//
+// https://llvm.org/docs/LangRef.html#poisonvalues
+// LLVM's documentation on poison
+//
+// https://github.com/rust-lang/unsafe-code-guidelines/issues/71
+// notes on the validity of primitive bit patterns
+
+/// A wrapper over `f32` which enables fast-math optimizations.
+// TODO how best to document unspecified values, including witnessing possibly varying values
+#[derive(Clone, Copy)]
+#[repr(transparent)]
+pub struct FF32(MaybePoison<f32>);
+
+impl FF32 {
+    /// Wrap the given float in an `FF32` without validating it.
+    ///
+    /// The caller is responsible for the value: it **MUST NOT** be infinite or NaN, and no
+    /// operation it takes part in may produce an infinite or NaN result. The output of any such
+    /// operation is unspecified.
+    #[inline(always)]
+    pub const fn new(f: f32) -> Self {
+        Self(MaybePoison::new(f))
+    }
+
+    /// Wrap the given float in an `FF32`, rejecting infinite and NaN inputs with an error.
+    ///
+    /// Passing this check is **not sufficient** to rule out unspecified outputs: valid inputs
+    /// can still combine into an invalid result (for example `ff32(1.0) / ff32(0.0)` is
+    /// unspecified). It is nevertheless useful as limited best-effort validation.
+    #[inline(always)]
+    pub fn new_checked(f: f32) -> Result<Self, InvalidValueError> {
+        // `is_finite` is false for NaN as well as for both infinities
+        if !f.is_finite() {
+            return Err(InvalidValueError { _priv: () });
+        }
+
+        Ok(Self::new(f))
+    }
+
+    /// Pin the wrapped value to a concrete float and return it as a plain `f32`.
+    #[inline(always)]
+    fn freeze_f32(self) -> f32 {
+        let frozen = self.0.freeze();
+
+        // Safety:
+        // every bit pattern is valid in float
+        unsafe { frozen.assume_init() }
+    }
+}
+
+/// Create a new `FF32` instance from the given float value, without validating it.
+///
+/// This is syntax sugar for constructing the `FF32` type, and is equivalent to [`FF32::new`].
+///
+/// The given value **MUST NOT** be infinite or NaN, and any operations involving this value must
+/// not produce infinite or NaN results. The output of any such operation is unspecified.
+#[inline(always)]
+pub fn ff32(f: f32) -> FF32 {
+    // TODO maybe a feature flag to make this checked -> panic?
+    FF32::new(f) // no validation happens here; see `FF32::new_checked` for a checked variant
+}
+
+impl ops::Add<FF32> for FF32 {
+    type Output = Self;
+
+    /// Add two values using the fast-math addition intrinsic.
+    ///
+    /// The result is unspecified if either operand, or the sum itself, is infinite or NaN.
+    #[inline(always)]
+    fn add(self, other: FF32) -> Self {
+        // Safety:
+        //
+        // - reading the operands through their pointers is safe because every bit pattern is
+        // valid in float primitives
+        // - possibly poison operands are acceptable because LLVM's fast add documents not
+        // producing UB on any inputs; it may yield poison on inf/nan (or if the sum is inf/nan),
+        // but the result goes straight back into a MaybePoison, which controls propagation
+        let sum = unsafe {
+            let lhs = self.0.maybe_poison();
+            let rhs = other.0.maybe_poison();
+            fadd_fast(*lhs.as_ptr(), *rhs.as_ptr())
+        };
+
+        FF32::new(sum)
+    }
+}
+
+// Branching on poison values is UB, so any operation that makes a bool is protected by freezing
+// the operands. This includes [Partial]Eq and [Partial]Ord.
+//
+// Note however that only value copies are frozen; the original values may still be poison, and
+// could even yield different concrete values on a subsequent freeze. This means that potentially
+// the values are not Eq/Ord consistent. Logical consistency is left as a responsibility of
+// the user, to maintain non inf/nan values, while the lib only ensures safety.
+
+impl PartialEq<FF32> for FF32 {
+    /// Compare frozen copies of both operands for equality.
+    #[inline]
+    fn eq(&self, other: &FF32) -> bool {
+        // producing a bool from a possibly poison value would be UB, hence the freezes
+        self.freeze_f32() == other.freeze_f32()
+    }
+}
+
+// NaN is outside the supported domain, so equality is declared total. Keeping the values valid
+// (and therefore the relation consistent) is the user's responsibility.
+impl Eq for FF32 {}
+
+impl PartialOrd<FF32> for FF32 {
+    /// Always `Some`: the ordering is delegated to the total [`Ord`] implementation.
+    #[inline(always)]
+    fn partial_cmp(&self, other: &FF32) -> Option<cmp::Ordering> {
+        Some(Ord::cmp(self, other))
+    }
+
+    // Each operator below is overridden to compare frozen copies of the operands directly with
+    // the matching float comparison.
+
+    #[inline(always)]
+    fn lt(&self, other: &FF32) -> bool {
+        let (lhs, rhs) = (self.freeze_f32(), other.freeze_f32());
+        lhs < rhs
+    }
+
+    #[inline(always)]
+    fn le(&self, other: &FF32) -> bool {
+        let (lhs, rhs) = (self.freeze_f32(), other.freeze_f32());
+        lhs <= rhs
+    }
+
+    #[inline(always)]
+    fn gt(&self, other: &FF32) -> bool {
+        let (lhs, rhs) = (self.freeze_f32(), other.freeze_f32());
+        lhs > rhs
+    }
+
+    #[inline(always)]
+    fn ge(&self, other: &FF32) -> bool {
+        let (lhs, rhs) = (self.freeze_f32(), other.freeze_f32());
+        lhs >= rhs
+    }
+}
+
+impl Ord for FF32 {
+    /// Order two values by comparing frozen copies of them.
+    #[inline(always)]
+    fn cmp(&self, other: &FF32) -> cmp::Ordering {
+        let (lhs, rhs) = (self.freeze_f32(), other.freeze_f32());
+
+        // NaNs are not supported (and would break everything else anyway), so a pair that the
+        // float comparison cannot order is simply reported as equal to provide a full Ord
+        lhs.partial_cmp(&rhs).unwrap_or(cmp::Ordering::Equal)
+    }
+
+    /// Restrict the value to the range `min..=max`, working on frozen copies of all three.
+    #[inline]
+    fn clamp(self, min: FF32, max: FF32) -> FF32 {
+        let value = self.freeze_f32();
+        let lower = min.freeze_f32();
+        let upper = max.freeze_f32();
+
+        FF32::new(value.clamp(lower, upper))
+    }
+}
+
+impl From<FF32> for f32 {
+    /// Convert to a plain `f32`, freezing the value first.
+    ///
+    /// If the wrapped value is the result of an invalid operation, the returned float is
+    /// unspecified.
+    // Always inlined, like the other thin wrappers in this file, so that the conversion does not
+    // turn into an opaque call when used from another crate.
+    #[inline(always)]
+    fn from(from: FF32) -> Self {
+        // f32 is no longer in our API control, so we must stop poison propagation by freezing
+        from.freeze_f32()
+    }
+}
+
+impl From<f32> for FF32 {
+    /// Wrap a plain `f32`; equivalent to [`ff32`], so no validation is performed.
+    // Always inlined, like the other thin wrappers in this file, so that the conversion does not
+    // turn into an opaque call when used from another crate.
+    #[inline(always)]
+    fn from(from: f32) -> Self {
+        ff32(from)
+    }
+}
+
+// TODO FF64, macro everything, more ops, libm?
--- a/src/poison.rs
+++ b/src/poison.rs
@@ -0,0 +1,74 @@
+use core::mem::MaybeUninit;
+
+/// A wrapper used to model LLVM's [poison
+/// values](https://llvm.org/docs/LangRef.html#poisonvalues).
+#[derive(Copy)] // `Clone` is written by hand below rather than derived
+#[repr(transparent)]
+pub(crate) struct MaybePoison<T>(MaybeUninit<T>); // the wrapped value may be poison; see `freeze`
+
+impl<T: Copy> Clone for MaybePoison<T> {
+    #[inline(always)]
+    fn clone(&self) -> Self {
+        *self // plain copy, available because of the `T: Copy` bound
+    }
+}
+
+impl<T> MaybePoison<T> {
+    #[inline(always)]
+    pub(crate) const fn new(t: T) -> Self { // wraps an ordinary value; nothing is poison yet
+        MaybePoison(MaybeUninit::new(t))
+    }
+
+    /// Get the (possibly poison) value from this instance.
+    ///
+    /// The compiler may relax poison values to undefined values. That means, among other
+    /// consequences, that calls to this function from copies of the same value could manifest
+    /// different return values. Poison values are also transitive: an instruction that depends on
+    /// a poison value produces a poison value itself.
+    ///
+    /// Propagation of poison values can be stopped using [`freeze`](MaybePoison::freeze).
+    ///
+    /// # Safety
+    ///
+    /// It is UB to use a poison value as an operand to an instruction where _any_ of the operand's
+    /// values trigger UB. This includes, for example, use as the divisor in integer division, or
+    /// as the condition of a branch.
+    ///
+    /// See more examples and explanations in the [LLVM
+    /// documentation](https://llvm.org/docs/LangRef.html#poisonvalues).
+    #[inline(always)]
+    pub(crate) unsafe fn maybe_poison(self) -> MaybeUninit<T> {
+        self.0
+    }
+
+    /// Freeze the poisoned value into a concrete (but arbitrary) value.
+    ///
+    /// Note that the value may not be a valid representation of T, so the returned
+    /// `MaybeUninit` is still unsafe to read unless T is valid with any representation.
+    #[inline(always)]
+    pub(crate) fn freeze(self) -> MaybeUninit<T> {
+        // As of this writing, rust does not have any intrinsic to call LLVM's freeze instruction.
+        // Instead, we do the next best thing by tricking the compiler into de-optimizing poison
+        // values by introducing inline assembly. This is the same technique used by
+        // `core::hint::black_box` and (the unmerged) https://github.com/rust-lang/rust/pull/58363.
+        // We cannot use black_box directly, however, as it is documented as only a best-effort
+        // hint, and could in theory be changed in the future.
+
+        // Safety:
+        //
+        // - The poison value will no longer be poisoned, its safety restrictions no longer apply
+        // - The asm macro emits no actual assembly, there's nothing to be unsafe
+        unsafe {
+            let inner = self.maybe_poison();
+            // No instructions are emitted: the template is only an assembly comment, which exists
+            // because the asm macro requires the format string to reference every operand. Passing
+            // a pointer to the possibly poison value as an input means the compiler must assume
+            // the block may read or write it, so a concrete (though arbitrary) value has to be
+            // materialized before the assembly.
+            // NOTE(review): the pointer is a `*const` taken from an immutable binding; confirm the
+            // "may write through it" assumption holds, or use `let mut` + `as_mut_ptr` - TODO
+            asm!("/* {0} */", in(reg) inner.as_ptr(), options(nostack, preserves_flags));
+            inner
+        }
+    }
+}