Proof of concept
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
/target
|
||||
Cargo.lock
|
||||
18
Cargo.toml
Normal file
18
Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "fast_fp"
|
||||
version = "0.1.0"
|
||||
authors = ["Renar Narubin <renar@standard.ai>"]
|
||||
edition = "2018"
|
||||
readme = "README.md"
|
||||
license = "MIT OR Apache-2.0"
|
||||
|
||||
[[bench]]
|
||||
name = "operations"
|
||||
harness = false
|
||||
|
||||
[dependencies]
|
||||
num-traits = { version = "0.2", optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = "0.3"
|
||||
rand = "0.8"
|
||||
40
README.md
Normal file
40
README.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# Fast Floating-Point Math
|
||||
|
||||
`fast_fp` provides a set of primitive types that support [fast-math]
|
||||
optimizations for many operations. These optimizations allow the compiler to
|
||||
potentially generate faster code by relaxing some of the requirements of [IEEE
|
||||
754] floating-point arithmetic.
|
||||
|
||||
This may result in different outputs than operations on the standard float
|
||||
primitives like `f32`, particularly where fine-grained precision is important.
|
||||
`fast-math` may allow reordering operations in such a way that some precision
|
||||
is lost in the overall computation. Note that there are also cases where
|
||||
fast-math optimizations can _improve_ precision, such as contracting separate
|
||||
multiplication and addition into a fused multiply-add operation.
|
||||
|
||||
## Limitations
|
||||
|
||||
In order to enable these optimizations safely, certain requirements must be
|
||||
observed:
|
||||
|
||||
- Operations **MUST NOT** involve infinite or NaN values. If the arguments to an
|
||||
operation are, or the results of an operation _would_ be, `+inf`, `-inf`,
|
||||
or `NaN`, then the operation's result value is unspecified. This crate goes
|
||||
to lengths to ensure that such an operation is not Undefined Behavior in the
|
||||
strict sense, but the output is free to be any representable value of the
|
||||
output type, and may not be a fixed value at all.
|
||||
- Use of this crate's primitives may not be faster than the standard primitives
|
||||
in all cases. That may be because the generated code is slower in practice,
|
||||
or because of certain measures taken by this crate to prevent UB (in
|
||||
particular for comparison heavy code). Users should carefully measure and
|
||||
benchmark their code to understand whether they actually benefit from use of
|
||||
these types.
|
||||
- The safety of this crate is only assessed against rustc's LLVM code
|
||||
generation. This crate should not be used with alternative code generators
|
||||
such as cranelift or GCC.
|
||||
- The sign of zero may be treated as insignificant and is not guaranteed to be preserved.
|
||||
|
||||
[TODO]: # (is there a way to detect the code generator at build time?)
|
||||
|
||||
[fast-math]: https://llvm.org/docs/LangRef.html#fast-math-flags
|
||||
[IEEE 754]: https://en.wikipedia.org/wiki/IEEE_754
|
||||
34
benches/operations.rs
Normal file
34
benches/operations.rs
Normal file
@@ -0,0 +1,34 @@
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use fast_fp::{ff32, FF32};
|
||||
use rand::{distributions::Standard, thread_rng, Rng};
|
||||
|
||||
fn sum(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("sum");
|
||||
for count in [2, 4, 8, 16, 64, 1024, 1 << 15] {
|
||||
group.throughput(Throughput::Elements(count as u64));
|
||||
|
||||
let f32_vals = thread_rng()
|
||||
.sample_iter(Standard)
|
||||
.take(count)
|
||||
.collect::<Vec<f32>>();
|
||||
|
||||
// use the same values for both benchmarks
|
||||
let ff32_vals = f32_vals
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(ff32)
|
||||
.collect::<Vec<FF32>>();
|
||||
|
||||
group.bench_with_input(BenchmarkId::new("std::f32", count), &f32_vals, |b, vals| {
|
||||
b.iter(|| vals.iter().copied().fold(0.0, |acc, val| acc + val));
|
||||
});
|
||||
|
||||
group.bench_with_input(BenchmarkId::new("FF32", count), &ff32_vals, |b, vals| {
|
||||
b.iter(|| vals.iter().copied().fold(ff32(0.0), |acc, val| acc + val));
|
||||
});
|
||||
}
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, sum);
|
||||
criterion_main!(benches);
|
||||
1
rust-toolchain
Normal file
1
rust-toolchain
Normal file
@@ -0,0 +1 @@
|
||||
nightly-2021-11-03
|
||||
224
src/lib.rs
Normal file
224
src/lib.rs
Normal file
@@ -0,0 +1,224 @@
|
||||
#![doc = include_str!("../README.md")]
|
||||
#![feature(core_intrinsics)] // intrinsics for the fast math
|
||||
#![feature(asm)] // asm used to emulate freeze
|
||||
use core::{cmp, fmt, intrinsics::fadd_fast, ops};
|
||||
|
||||
mod poison;
|
||||
use poison::MaybePoison;
|
||||
|
||||
/// The error returned by the checked constructors of [`FF32`] and [`FF64`]
///
/// The type carries no payload. Its only field is a private unit value, which keeps the struct
/// from being constructed outside of this crate.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct InvalidValueError {
    _priv: (),
}
|
||||
|
||||
impl fmt::Display for InvalidValueError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.write_str("value may not be infinite or NaN")
|
||||
}
|
||||
}
|
||||
|
||||
// Empty impl: no `Error` methods are overridden, the defaults are used as-is.
impl std::error::Error for InvalidValueError {}
|
||||
|
||||
// The big challenge with fast-math in general is avoiding UB, and to a lesser extent unspecified
|
||||
// values. LLVM's fast operations document "poison" behavior when given invalid inputs; poison
|
||||
// values have a relatively consistent behavior (stuff like transitivity), defined cases for UB,
|
||||
// and importantly can be limited in scope by freezing to a fixed value.
|
||||
//
|
||||
// This library handles poison by limiting its reach to only the pure arithmetic operations on the
|
||||
// wrapper float types. Any arbitrary FF32 is considered possibly invalid (containing +-inf or NaN)
|
||||
// because it's not feasible to track validity (without running all operations in parallel with
|
||||
// unfast-math and thus negating any possible improvement). Float add/sub/mul/div/rem are permitted
|
||||
// on the possibly poison values (as documented by LLVM), producing transitively poison results,
|
||||
// then wrapped in FF32. Any other operations require the value to be not-poison in order to be
|
||||
// not-UB: anything like comparison/printing/conversion/casting/etc is done on frozen copies of
|
||||
// the data. Originating values that were valid will pass through the arithmetic and freezing
|
||||
// exactly as they are; invalid values will become poison through the arithmetic and then be frozen
|
||||
// to some unspecified value. The user may encounter garbage in such a case, but not in a way that
|
||||
// triggers UB.
|
||||
//
|
||||
// Prior art and references
|
||||
//
|
||||
// https://github.com/rust-lang/rust/issues/21690
|
||||
// Task for general purpose fast-math in rust lang. Discussions about the right approach
|
||||
// and generalizability, including whether it should be type-based or annotation based. fast_fp
|
||||
// uses types wrapping intrinsics because it's the only option available in user space, and gets
|
||||
// good optimizations useful in practice
|
||||
//
|
||||
// https://docs.rs/fast-floats/0.2.0/fast_floats/index.html
|
||||
// Another crate that wraps fast intrinsics in types. They didn't address poison propagation,
|
||||
// leaving constructors unsafe
|
||||
//
|
||||
// https://llvm.org/docs/LangRef.html#fast-math-flags
|
||||
// LLVM's documentation on fast-math
|
||||
//
|
||||
// https://llvm.org/docs/LangRef.html#poisonvalues
|
||||
// LLVM's documentation on poison
|
||||
//
|
||||
// https://github.com/rust-lang/unsafe-code-guidelines/issues/71
|
||||
// notes on the validity of primitive bit patterns
|
||||
|
||||
/// A wrapper over `f32` which enables fast-math optimizations.
|
||||
// TODO how best to document unspecified values, including witnessing possibly varying values
|
||||
#[derive(Clone, Copy)]
|
||||
#[repr(transparent)]
|
||||
pub struct FF32(MaybePoison<f32>);
|
||||
|
||||
impl FF32 {
|
||||
/// Create a new `FF32` instance from the given float value.
|
||||
///
|
||||
/// The given value **MUST NOT** be infinite or NaN, and any operations involving this value must
|
||||
/// not produce infinite or NaN results. The output of any such operation is unspecified.
|
||||
#[inline(always)]
|
||||
pub const fn new(f: f32) -> Self {
|
||||
FF32(MaybePoison::new(f))
|
||||
}
|
||||
|
||||
/// Create a new `FF32` instance from the given float value, returning an error if the value is
|
||||
/// infinite or NaN.
|
||||
///
|
||||
/// Note that this check is **not sufficient** to avoid all unspecified outputs, because an
|
||||
/// operation could otherwise produce an invalid value with valid inputs (for example
|
||||
/// `ff32(1.0) / ff32(0.0)` is unspecified). Nevertheless, this check can be useful for
|
||||
/// limited best-effort validation.
|
||||
#[inline(always)]
|
||||
pub fn new_checked(f: f32) -> Result<Self, InvalidValueError> {
|
||||
// finite also checks for NaN
|
||||
if f.is_finite() {
|
||||
Ok(FF32::new(f))
|
||||
} else {
|
||||
Err(InvalidValueError { _priv: () })
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn freeze_f32(self) -> f32 {
|
||||
let inner = self.0.freeze();
|
||||
|
||||
// Safety:
|
||||
// every bit pattern is valid in float
|
||||
unsafe { inner.assume_init() }
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `FF32` instance from the given float value.
|
||||
///
|
||||
/// This is syntax sugar for constructing the `FF32` type, and equivalent to `FF32::new(f)`
|
||||
///
|
||||
/// The given value **MUST NOT** be infinite or NaN, and any operations involving this value must
|
||||
/// not produce infinite or NaN results. The output of any such operation is unspecified.
|
||||
#[inline(always)]
|
||||
pub fn ff32(f: f32) -> FF32 {
|
||||
// TODO maybe a feature flag to make this checked -> panic?
|
||||
FF32::new(f)
|
||||
}
|
||||
|
||||
impl ops::Add<FF32> for FF32 {
|
||||
type Output = Self;
|
||||
|
||||
#[inline(always)]
|
||||
fn add(self, other: FF32) -> Self {
|
||||
// Safety:
|
||||
//
|
||||
// - dereferencing the pointers is safe because every bit pattern is valid in float
|
||||
// primitives
|
||||
// - encountering poison operands is safe because LLVM's fast add documents not producing
|
||||
// UB on any inputs; it may produce poison on inf/nan (or if the sum is inf/nan), but these
|
||||
// are then wrapped in the MaybePoison to control propagation
|
||||
ff32(unsafe {
|
||||
fadd_fast(
|
||||
*self.0.maybe_poison().as_ptr(),
|
||||
*other.0.maybe_poison().as_ptr(),
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Branching on poison values is UB, so any operation that makes a bool is protected by freezing
|
||||
// the operands. This includes [Partial]Eq and [Partial]Ord.
|
||||
//
|
||||
// Note however that only value copies are frozen; the original values may still be poison, and
|
||||
// could even yield different concrete values on a subsequent freeze. This means that potentially
|
||||
// the values are not Eq/Ord consistent. Logical consistency is left as a responsibility of
|
||||
// the user, to maintain non inf/nan values, while the lib only ensures safety.
|
||||
|
||||
impl PartialEq<FF32> for FF32 {
    /// Compares frozen copies of both operands, so the comparison never branches on poison.
    #[inline]
    fn eq(&self, other: &FF32) -> bool {
        let (this, that) = (self.freeze_f32(), other.freeze_f32());
        this == that
    }
}
|
||||
|
||||
// Marker impl only; it adds no methods. Equality itself comes from the `PartialEq` impl.
impl Eq for FF32 {}
|
||||
|
||||
impl PartialOrd<FF32> for FF32 {
|
||||
#[inline(always)]
|
||||
fn partial_cmp(&self, other: &FF32) -> Option<cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn lt(&self, other: &FF32) -> bool {
|
||||
self.freeze_f32() < other.freeze_f32()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn le(&self, other: &FF32) -> bool {
|
||||
self.freeze_f32() <= other.freeze_f32()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn gt(&self, other: &FF32) -> bool {
|
||||
self.freeze_f32() > other.freeze_f32()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn ge(&self, other: &FF32) -> bool {
|
||||
self.freeze_f32() >= other.freeze_f32()
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for FF32 {
|
||||
#[inline(always)]
|
||||
fn cmp(&self, other: &FF32) -> cmp::Ordering {
|
||||
let this = self.freeze_f32();
|
||||
let that = other.freeze_f32();
|
||||
|
||||
// Note NaNs are not supported (and would break everything else anyway) so we ignore them
|
||||
// and implement full Ord
|
||||
if this < that {
|
||||
cmp::Ordering::Less
|
||||
} else if this > that {
|
||||
cmp::Ordering::Greater
|
||||
} else {
|
||||
cmp::Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn clamp(self, min: FF32, max: FF32) -> FF32 {
|
||||
ff32(f32::clamp(
|
||||
self.freeze_f32(),
|
||||
min.freeze_f32(),
|
||||
max.freeze_f32(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FF32> for f32 {
    /// Converts to a plain `f32` by freezing the value.
    ///
    /// Marked `#[inline(always)]` to match the crate's other small wrapper methods.
    #[inline(always)]
    fn from(from: FF32) -> Self {
        // f32 is no longer in our API control, so we must stop poison propagation by freezing
        from.freeze_f32()
    }
}
|
||||
|
||||
impl From<f32> for FF32 {
|
||||
fn from(from: f32) -> Self {
|
||||
ff32(from)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO FF64, macro everything, more ops, libm?
|
||||
74
src/poison.rs
Normal file
74
src/poison.rs
Normal file
@@ -0,0 +1,74 @@
|
||||
use core::mem::MaybeUninit;
|
||||
|
||||
/// A wrapper used to model LLVM's [poison
|
||||
/// values](https://llvm.org/docs/LangRef.html#poisonvalues)
|
||||
#[derive(Copy)]
|
||||
#[repr(transparent)]
|
||||
pub(crate) struct MaybePoison<T>(MaybeUninit<T>);
|
||||
|
||||
impl<T: Copy> Clone for MaybePoison<T> {
|
||||
#[inline(always)]
|
||||
fn clone(&self) -> Self {
|
||||
*self
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> MaybePoison<T> {
|
||||
#[inline(always)]
|
||||
pub(crate) const fn new(t: T) -> Self {
|
||||
MaybePoison(MaybeUninit::new(t))
|
||||
}
|
||||
|
||||
/// Get the (possibly poison) value from this instance.
|
||||
///
|
||||
/// The compiler may relax poison values to undefined values. That means, among other
|
||||
/// consequences, that calls to this function from copies of the same value could manifest
|
||||
/// different return values. Poison values are also transitive: an instruction that depends on
|
||||
/// a poison value, produces a poison value itself.
|
||||
///
|
||||
/// Propogation of poison values can be stopped using [`freeze`](MaybePoison::freeze)
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// It is UB to use a poison value as an operand to an instruction where _any_ of the operand's
|
||||
/// values trigger UB. This includes, for example, use as the divisor in integer division, or
|
||||
/// as the condition of a branch.
|
||||
///
|
||||
/// See more examples and explanations in the [LLVM
|
||||
/// documentation](https://llvm.org/docs/LangRef.html#poisonvalues)
|
||||
#[inline(always)]
|
||||
pub(crate) unsafe fn maybe_poison(self) -> MaybeUninit<T> {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Freeze the poisoned value into a concrete (but arbitrary) value.
|
||||
///
|
||||
/// Note that the value may not be a valid representation of T, so the return type is still
|
||||
/// unsafe to dereference unless T is valid with any representation.
|
||||
#[inline(always)]
|
||||
pub(crate) fn freeze(self) -> MaybeUninit<T> {
|
||||
// As of this writing, rust does not have any intrinsic to call LLVM's freeze instruction.
|
||||
// Instead, we do the next best thing by tricking the compiler into de-optimizing poison
|
||||
// values by introducing inline assembly. This is the same technique used by
|
||||
// `core::hint::black_box` and (the unmerged) https://github.com/rust-lang/rust/pull/58363.
|
||||
// We cannot use black_box directly, however, as it is documented as only a best-effort
|
||||
// hint, and could in theory be changed in the future.
|
||||
|
||||
// Safety:
|
||||
//
|
||||
// - The poison value will no longer be poisoned, its safety restrictions no longer apply
|
||||
// - The asm macro emits no actual assembly, there's nothing to be unsafe
|
||||
unsafe {
|
||||
let inner = self.maybe_poison();
|
||||
// There is no actual assembly, it's just a trick to restrict the compiler from
|
||||
// optimizing around poison values. However the asm macro requires the format
|
||||
// string to capture all inputs, so put the captured pointer in an assembly comment.
|
||||
// The possibly poison value is labelled as input to the assembly block by providing a
|
||||
// pointer to the value; the compiler then must assume that anything could be done with
|
||||
// that pointer (e.g. reading and writing the value) so the compiler must materialize
|
||||
// a concrete (though arbitrary) value before the assembly
|
||||
asm!("/* {0} */", in(reg) inner.as_ptr(), options(nostack, preserves_flags));
|
||||
inner
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user