diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..96ef6c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..f17d09e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "fast_fp" +version = "0.1.0" +authors = ["Renar Narubin "] +edition = "2018" +readme = "README.md" +license = "MIT OR Apache-2.0" + +[[bench]] +name = "operations" +harness = false + +[dependencies] +num-traits = { version = "0.2", optional = true } + +[dev-dependencies] +criterion = "0.3" +rand = "0.8" diff --git a/README.md b/README.md new file mode 100644 index 0000000..8fd7ad9 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +# Fast Floating-Point Math + +`fast_fp` provides a set of primitive types that support [fast-math] +optimizations for many operations. These optimizations allow the compiler to +potentially generate faster code by relaxing some of the requirements of [IEEE +754] floating-point arithmetic. + +This may result in different outputs than operations on the standard float +primitives like `f32`, particularly where fine-grained precision is important. +`fast-math` may allow reordering operations in such a way that some precision +is lost in the overall computation. Note that there are also cases where +fast-math optimizations can _improve_ precision, such as contracting separate +multiplication and addition into a fused multiply-add operation. + +## Limitations + +In order to enable these optimizations safely, certain requirements must be +observed: + +- Operations **MUST NOT** involve infinite or NaN values. If the arguments to an + operation are, or the results of an operation _would_ be, `+inf`, `-inf`, + or `NaN`, then the operation's result value is unspecified. 
This crate goes + to lengths to ensure that such an operation is not Undefined Behavior in the + strict sense, but the output is free to be any representable value of the + output type, and may not be a fixed value at all. +- Use of this crate's primitives may not be faster than the standard primitives + in all cases. That may be because the generated code is slower in practice, + or because of certain measures taken by this crate to prevent UB (in + particular for comparison heavy code). Users should carefully measure and + benchmark their code to understand whether they actually benefit from use of + these types. +- The safety of this crate is only assessed against rustc's LLVM code + generation. This crate should not be used with alternative code generators + such as cranelift or GCC +- Signed-ness of zeros may be treated as insignificant and not preserved + +[TODO]: # (is there a way to detect the code generator at build time?) + +[fast-math]: https://llvm.org/docs/LangRef.html#fast-math-flags +[IEEE 754]: https://en.wikipedia.org/wiki/IEEE_754 diff --git a/benches/operations.rs b/benches/operations.rs new file mode 100644 index 0000000..e4cabe2 --- /dev/null +++ b/benches/operations.rs @@ -0,0 +1,34 @@ +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use fast_fp::{ff32, FF32}; +use rand::{distributions::Standard, thread_rng, Rng}; + +fn sum(c: &mut Criterion) { + let mut group = c.benchmark_group("sum"); + for count in [2, 4, 8, 16, 64, 1024, 1 << 15] { + group.throughput(Throughput::Elements(count as u64)); + + let f32_vals = thread_rng() + .sample_iter(Standard) + .take(count) + .collect::>(); + + // use the same values for both benchmarks + let ff32_vals = f32_vals + .clone() + .into_iter() + .map(ff32) + .collect::>(); + + group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| { + b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val)); + }); + + 
group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| { + b.iter(|| vals.iter().copied().fold(ff32(0.0), |acc, val| acc + val)); + }); + } + group.finish(); +} + +criterion_group!(benches, sum); +criterion_main!(benches); diff --git a/rust-toolchain b/rust-toolchain new file mode 100644 index 0000000..eb726a2 --- /dev/null +++ b/rust-toolchain @@ -0,0 +1 @@ +nightly-2021-11-03 diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..585fd12 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,224 @@ +#![doc = include_str!("../README.md")] +#![feature(core_intrinsics)] // intrinsics for the fast math +#![feature(asm)] // asm used to emulate freeze +use core::{cmp, fmt, intrinsics::fadd_fast, ops}; + +mod poison; +use poison::MaybePoison; + +/// The error returned by the checked constructors of [`FF32`] and [`FF64`] +#[derive(Clone, Debug, PartialEq)] +pub struct InvalidValueError { + _priv: (), +} + +impl fmt::Display for InvalidValueError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("value may not be infinite or NaN") + } +} + +impl std::error::Error for InvalidValueError {} + +// The big challenge with fast-math in general is avoiding UB, and to a lesser extent unspecified +// values. LLVM's fast operations document "poison" behavior when given invalid inputs; poison +// values have a relatively consistent behavior (stuff like transitivity), defined cases for UB, +// and importantly can be limited in scope by freezing to a fixed value. +// +// This library handles poison by limiting its reach to only the pure arithmetic operations on the +// wrapper float types. Any arbitrary FF32 is considered possibly invalid (containing +-inf or NaN) +// because it's not feasible to track validity (without running all operations in parallel with +// unfast-math and thus negating any possible improvement). 
Float add/sub/mul/div/rem are permitted +// on the possibly poison values (as documented by LLVM), producing transitively poison results, +// then wrapped in FF32. Any other operations require the value to be not-poison in order to be +// not-UB: anything like comparison/printing/conversion/casting/etc is done on frozen copies of +// the data. Originating values that were valid will pass through the arithmetic and freezing +// exactly as they are; invalid values will become poison through the arithmetic and then be frozen +// to some unspecified value. The user may encounter garbage in such a case, but not in a way that +// triggers UB. +// +// Prior art and references +// +// https://github.com/rust-lang/rust/issues/21690 +// Task for general purpose fast-math in rust lang. Discussions about the right approach +// and generalizability, including whether it should be type-based or annotation based. fast_fp +// uses types wrapping intrinsics because it's the only option available in user space, and gets +// good optimizations useful in practice +// +// https://docs.rs/fast-floats/0.2.0/fast_floats/index.html +// Another crate that wraps fast intrinsics in types. They didn't address poison propagation, +// leaving constructors unsafe +// +// https://llvm.org/docs/LangRef.html#fast-math-flags +// LLVM's documentation on fast-math +// +// https://llvm.org/docs/LangRef.html#poisonvalues +// LLVM's documentation on poison +// +// https://github.com/rust-lang/unsafe-code-guidelines/issues/71 +// notes on the validity of primitive bit patterns + +/// A wrapper over `f32` which enables fast-math optimizations. +// TODO how best to document unspecified values, including witnessing possibly varying values +#[derive(Clone, Copy)] +#[repr(transparent)] +pub struct FF32(MaybePoison); + +impl FF32 { + /// Create a new `FF32` instance from the given float value. 
+ /// + /// The given value **MUST NOT** be infinite or NaN, and any operations involving this value must + /// not produce infinite or NaN results. The output of any such operation is unspecified. + #[inline(always)] + pub const fn new(f: f32) -> Self { + FF32(MaybePoison::new(f)) + } + + /// Create a new `FF32` instance from the given float value, returning an error if the value is + /// infinite or NaN. + /// + /// Note that this check is **not sufficient** to avoid all unspecified outputs, because an + /// operation could otherwise produce an invalid value with valid inputs (for example + /// `ff32(1.0) / ff32(0.0)` is unspecified). Nevertheless, this check can be useful for + /// limited best-effort validation. + #[inline(always)] + pub fn new_checked(f: f32) -> Result { + // finite also checks for NaN + if f.is_finite() { + Ok(FF32::new(f)) + } else { + Err(InvalidValueError { _priv: () }) + } + } + + #[inline(always)] + fn freeze_f32(self) -> f32 { + let inner = self.0.freeze(); + + // Safety: + // every bit pattern is valid in float + unsafe { inner.assume_init() } + } +} + +/// Create a new `FF32` instance from the given float value. +/// +/// This is syntax sugar for constructing the `FF32` type, and equivalent to `FF32::new(f)` +/// +/// The given value **MUST NOT** be infinite or NaN, and any operations involving this value must +/// not produce infinite or NaN results. The output of any such operation is unspecified. +#[inline(always)] +pub fn ff32(f: f32) -> FF32 { + // TODO maybe a feature flag to make this checked -> panic? 
+ FF32::new(f) +} + +impl ops::Add for FF32 { + type Output = Self; + + #[inline(always)] + fn add(self, other: FF32) -> Self { + // Safety: + // + // - dereferencing the pointers is safe because every bit pattern is valid in float + // primitives + // - encountering poison operands is safe because LLVM's fast add documents not producing + // UB on any inputs; it may produce poison on inf/nan (or if the sum is inf/nan), but these + // are then wrapped in the MaybePoison to control propagation + ff32(unsafe { + fadd_fast( + *self.0.maybe_poison().as_ptr(), + *other.0.maybe_poison().as_ptr(), + ) + }) + } +} + +// Branching on poison values is UB, so any operation that makes a bool is protected by freezing +// the operands. This includes [Partial]Eq and [Partial]Ord. +// +// Note however that only value copies are frozen; the original values may still be poison, and +// could even yield different concrete values on a subsequent freeze. This means that potentially +// the values are not Eq/Ord consistent. Logical consistency is left as a responsibility of +// the user, to maintain non inf/nan values, while the lib only ensures safety. 
+ +impl PartialEq for FF32 { + #[inline] + fn eq(&self, other: &FF32) -> bool { + let this = self.freeze_f32(); + let that = other.freeze_f32(); + + this == that + } +} + +impl Eq for FF32 {} + +impl PartialOrd for FF32 { + #[inline(always)] + fn partial_cmp(&self, other: &FF32) -> Option { + Some(self.cmp(other)) + } + + #[inline(always)] + fn lt(&self, other: &FF32) -> bool { + self.freeze_f32() < other.freeze_f32() + } + + #[inline(always)] + fn le(&self, other: &FF32) -> bool { + self.freeze_f32() <= other.freeze_f32() + } + + #[inline(always)] + fn gt(&self, other: &FF32) -> bool { + self.freeze_f32() > other.freeze_f32() + } + + #[inline(always)] + fn ge(&self, other: &FF32) -> bool { + self.freeze_f32() >= other.freeze_f32() + } +} + +impl Ord for FF32 { + #[inline(always)] + fn cmp(&self, other: &FF32) -> cmp::Ordering { + let this = self.freeze_f32(); + let that = other.freeze_f32(); + + // Note NaNs are not supported (and would break everything else anyway) so we ignore them + // and implement full Ord + if this < that { + cmp::Ordering::Less + } else if this > that { + cmp::Ordering::Greater + } else { + cmp::Ordering::Equal + } + } + + #[inline] + fn clamp(self, min: FF32, max: FF32) -> FF32 { + ff32(f32::clamp( + self.freeze_f32(), + min.freeze_f32(), + max.freeze_f32(), + )) + } +} + +impl From for f32 { + fn from(from: FF32) -> Self { + // f32 is no longer in our API control, so we must stop poison propagation by freezing + from.freeze_f32() + } +} + +impl From for FF32 { + fn from(from: f32) -> Self { + ff32(from) + } +} + +// TODO FF64, macro everything, more ops, libm? 
// src/poison.rs -- new file in this diff (mode 100644, index 0000000..d0f7e9a)
use core::mem::MaybeUninit;

/// A wrapper used to model LLVM's [poison
/// values](https://llvm.org/docs/LangRef.html#poisonvalues)
#[derive(Copy)]
#[repr(transparent)]
pub(crate) struct MaybePoison<T>(MaybeUninit<T>);

// Written by hand as a plain bitwise copy; `*self` relies on the `Copy` impl derived above, hence
// the `T: Copy` bound.
impl<T: Copy> Clone for MaybePoison<T> {
    #[inline(always)]
    fn clone(&self) -> Self {
        *self
    }
}

impl<T> MaybePoison<T> {
    /// Wrap a value that may later be replaced by, or combined into, a poison value.
    #[inline(always)]
    pub(crate) const fn new(t: T) -> Self {
        MaybePoison(MaybeUninit::new(t))
    }

    /// Get the (possibly poison) value from this instance.
    ///
    /// The compiler may relax poison values to undefined values. That means, among other
    /// consequences, that calls to this function from copies of the same value could manifest
    /// different return values. Poison values are also transitive: an instruction that depends on
    /// a poison value, produces a poison value itself.
    ///
    /// Propagation of poison values can be stopped using [`freeze`](MaybePoison::freeze)
    ///
    /// # Safety
    ///
    /// It is UB to use a poison value as an operand to an instruction where _any_ of the operand's
    /// values trigger UB. This includes, for example, use as the divisor in integer division, or
    /// as the condition of a branch.
    ///
    /// See more examples and explanations in the [LLVM
    /// documentation](https://llvm.org/docs/LangRef.html#poisonvalues)
    #[inline(always)]
    pub(crate) unsafe fn maybe_poison(self) -> MaybeUninit<T> {
        self.0
    }

    /// Freeze the poisoned value into a concrete (but arbitrary) value.
    ///
    /// Note that the value may not be a valid representation of T, so the return type is still
    /// unsafe to dereference unless T is valid with any representation.
    #[inline(always)]
    pub(crate) fn freeze(self) -> MaybeUninit<T> {
        // As of this writing, rust does not have any intrinsic to call LLVM's freeze instruction.
        // Instead, we do the next best thing by tricking the compiler into de-optimizing poison
        // values by introducing inline assembly. This is the same technique used by
        // `core::hint::black_box` and (the unmerged) https://github.com/rust-lang/rust/pull/58363.
        // We cannot use black_box directly, however, as it is documented as only a best-effort
        // hint, and could in theory be changed in the future.

        // SAFETY:
        //
        // - The poison value will no longer be poisoned, its safety restrictions no longer apply
        // - The asm macro emits no actual assembly, there's nothing to be unsafe
        unsafe {
            let inner = self.maybe_poison();
            // There is no actual assembly, it's just a trick to restrict the compiler from
            // optimizing around poison values. However the asm macro requires the format
            // string to capture all inputs, so put the captured pointer in an assembly comment.
            // The possibly poison value is labelled as input to the assembly block by providing a
            // pointer to the value; the compiler then must assume that anything could be done with
            // that pointer (e.g. reading and writing the value) so the compiler must materialize
            // a concrete (though arbitrary) value before the assembly
            asm!("/* {0} */", in(reg) inner.as_ptr(), options(nostack, preserves_flags));
            inner
        }
    }
}