rp: hook up softfloat rom intrinsics

rp-hal has done this very well already, so we'll just copy their entire impl again. only div.rs needed some massaging because our sio access works a little differently, everything else worked as is.
2023-04-19 01:57:37 +02:00
parent a673b9aa29
commit fdd6e08ed6
11 changed files with 1108 additions and 3 deletions
--- a/embassy-rp/src/float/add_sub.rs
+++ b/embassy-rp/src/float/add_sub.rs
@@ -0,0 +1,92 @@
 // Credit: taken from `rp-hal` (also licensed Apache+MIT)
 // https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/add_sub.rs
 use super::{Float, Int};
 use crate::rom_data;
 /// Addition provided by a ROM soft-float routine.
 trait ROMAdd {
     /// Adds `rhs` to `self`; implementors forward to the matching ROM function.
     fn rom_add(self, rhs: Self) -> Self;
 }
 impl ROMAdd for f32 {
     /// Single-precision addition, forwarded to the ROM `fadd` routine.
     fn rom_add(self, rhs: f32) -> f32 {
         rom_data::float_funcs::fadd(self, rhs)
     }
 }
 impl ROMAdd for f64 {
     /// Double-precision addition, forwarded to the ROM `dadd` routine.
     fn rom_add(self, rhs: f64) -> f64 {
         rom_data::double_funcs::dadd(self, rhs)
     }
 }
 /// Adds `a` and `b`, resolving every non-finite operand in software and only
 /// handing the ROM routine a pair of finite values.
 ///
 /// * `inf + inf` with matching signs yields that infinity.
 /// * Opposite-signed infinities, or any NaN among two non-finite operands,
 ///   yield `F::NAN`.
 /// * A single non-finite operand is returned unchanged.
 fn add<F: Float + ROMAdd>(a: F, b: F) -> F {
     match (a.is_not_finite(), b.is_not_finite()) {
         // Both finite: let the ROM do the arithmetic.
         (false, false) => a.rom_add(b),
         // X + [-]inf/NaN = [-]inf/NaN
         (false, true) => b,
         // [-]inf/NaN + X = [-]inf/NaN
         (true, false) => a,
         (true, true) => {
             // The exponent is all ones for both, so sign + significand bits
             // distinguish +inf (zero), -inf (sign bit only) and NaN (the rest).
             let kind_mask = F::SIGNIFICAND_MASK | F::SIGN_MASK;
             let kind_a = a.repr() & kind_mask;
             let kind_b = b.repr() & kind_mask;
             let both_pos_inf = kind_a == F::Int::ZERO && kind_b == F::Int::ZERO;
             let both_neg_inf = kind_a == F::SIGN_MASK && kind_b == F::SIGN_MASK;
             if both_pos_inf || both_neg_inf {
                 // inf + inf = inf, -inf + (-inf) = -inf
                 a
             } else {
                 // Sign mismatch, or either is NaN already
                 F::NAN
             }
         }
     }
 }
 // Soft-float add/subtract entry points handed to the `intrinsics!` macro
 // (defined elsewhere in the crate).  Every body funnels into `add`, so the
 // inf/NaN handling lives in exactly one place.
 // NOTE(review): the attribute lines are consumed by the macro; their exact
 // meaning and required order are not visible from this file -- confirm before
 // reordering or editing them.
 intrinsics! {
     // f32 addition.
     #[alias = __addsf3vfp]
     #[aeabi = __aeabi_fadd]
     extern "C" fn __addsf3(a: f32, b: f32) -> f32 {
         add(a, b)
     }
     // f64 addition.
     #[bootrom_v2]
     #[alias = __adddf3vfp]
     #[aeabi = __aeabi_dadd]
     extern "C" fn __adddf3(a: f64, b: f64) -> f64 {
         add(a, b)
     }
     // The ROM just implements subtraction the same way, so just do it here
     // and save the work of implementing more complicated NaN/inf handling.
     // f32 subtraction, expressed as a + (-b).
     #[alias = __subsf3vfp]
     #[aeabi = __aeabi_fsub]
     extern "C" fn __subsf3(a: f32, b: f32) -> f32 {
         add(a, -b)
     }
     // f64 subtraction, expressed as a + (-b).
     #[bootrom_v2]
     #[alias = __subdf3vfp]
     #[aeabi = __aeabi_dsub]
     extern "C" fn __subdf3(a: f64, b: f64) -> f64 {
         add(a, -b)
     }
     // Reverse subtraction: computes b - a (operands swapped).
     extern "aapcs" fn __aeabi_frsub(a: f32, b: f32) -> f32 {
         add(b, -a)
     }
     // Reverse subtraction for f64: computes b - a.
     #[bootrom_v2]
     extern "aapcs" fn __aeabi_drsub(a: f64, b: f64) -> f64 {
         add(b, -a)
     }
 }
--- a/embassy-rp/src/float/cmp.rs
+++ b/embassy-rp/src/float/cmp.rs
@@ -0,0 +1,201 @@
 // Credit: taken from `rp-hal` (also licensed Apache+MIT)
 // https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/cmp.rs
 use super::Float;
 use crate::rom_data;
 /// Comparison provided by a ROM soft-float routine.
 trait ROMCmp {
     /// Compares `self` with `rhs` and returns the ROM routine's integer result.
     fn rom_cmp(self, rhs: Self) -> i32;
 }
 impl ROMCmp for f32 {
     /// Single-precision comparison, forwarded to the ROM `fcmp` routine.
     fn rom_cmp(self, rhs: f32) -> i32 {
         rom_data::float_funcs::fcmp(self, rhs)
     }
 }
 impl ROMCmp for f64 {
     /// Double-precision comparison, forwarded to the ROM `dcmp` routine.
     fn rom_cmp(self, rhs: f64) -> i32 {
         rom_data::double_funcs::dcmp(self, rhs)
     }
 }
 /// Comparison helper for the "less than / equal" family of intrinsics.
 ///
 /// Returns `1` when either operand is NaN, which makes every `<`, `<=` and
 /// `==` test built on top of it fail; otherwise returns the ROM comparison
 /// result unchanged.
 fn le_abi<F: Float + ROMCmp>(a: F, b: F) -> i32 {
     let unordered = a.is_nan() || b.is_nan();
     if !unordered {
         return a.rom_cmp(b);
     }
     1
 }
 /// Comparison helper for the "greater than / equal" family of intrinsics.
 ///
 /// Returns `-1` when either operand is NaN, which makes every `>` and `>=`
 /// test built on top of it fail; otherwise returns the ROM comparison result
 /// unchanged.
 fn ge_abi<F: Float + ROMCmp>(a: F, b: F) -> i32 {
     let unordered = a.is_nan() || b.is_nan();
     if !unordered {
         return a.rom_cmp(b);
     }
     -1
 }
 // Soft-float comparison entry points handed to the `intrinsics!` macro
 // (defined elsewhere in the crate).  All of them are built on `le_abi` /
 // `ge_abi`, which differ only in the value returned for NaN operands.
 // NOTE(review): the attribute lines are consumed by the macro; their exact
 // meaning and required order are not visible from this file -- confirm before
 // reordering or editing them.
 intrinsics! {
     // --- Three-way comparisons: the `le_abi` / `ge_abi` result is returned as is ---
     #[slower_than_default]
     #[bootrom_v2]
     #[alias = __eqsf2, __ltsf2, __nesf2]
     extern "C" fn __lesf2(a: f32, b: f32) -> i32 {
         le_abi(a, b)
     }
     #[slower_than_default]
     #[bootrom_v2]
     #[alias = __eqdf2, __ltdf2, __nedf2]
     extern "C" fn __ledf2(a: f64, b: f64) -> i32 {
         le_abi(a, b)
     }
     #[slower_than_default]
     #[bootrom_v2]
     #[alias = __gtsf2]
     extern "C" fn __gesf2(a: f32, b: f32) -> i32 {
         ge_abi(a, b)
     }
     #[slower_than_default]
     #[bootrom_v2]
     #[alias = __gtdf2]
     extern "C" fn __gedf2(a: f64, b: f64) -> i32 {
         ge_abi(a, b)
     }
     // --- AEABI boolean comparisons (f32): 1 when the relation holds, else 0 ---
     #[slower_than_default]
     #[bootrom_v2]
     extern "aapcs" fn __aeabi_fcmple(a: f32, b: f32) -> i32 {
         (le_abi(a, b) <= 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "aapcs" fn __aeabi_fcmpge(a: f32, b: f32) -> i32 {
         (ge_abi(a, b) >= 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "aapcs" fn __aeabi_fcmpeq(a: f32, b: f32) -> i32 {
         (le_abi(a, b) == 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "aapcs" fn __aeabi_fcmplt(a: f32, b: f32) -> i32 {
         (le_abi(a, b) < 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "aapcs" fn __aeabi_fcmpgt(a: f32, b: f32) -> i32 {
         (ge_abi(a, b) > 0) as i32
     }
     // --- AEABI boolean comparisons (f64): 1 when the relation holds, else 0 ---
     #[slower_than_default]
     #[bootrom_v2]
     extern "aapcs" fn __aeabi_dcmple(a: f64, b: f64) -> i32 {
         (le_abi(a, b) <= 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "aapcs" fn __aeabi_dcmpge(a: f64, b: f64) -> i32 {
         (ge_abi(a, b) >= 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "aapcs" fn __aeabi_dcmpeq(a: f64, b: f64) -> i32 {
         (le_abi(a, b) == 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "aapcs" fn __aeabi_dcmplt(a: f64, b: f64) -> i32 {
         (le_abi(a, b) < 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "aapcs" fn __aeabi_dcmpgt(a: f64, b: f64) -> i32 {
         (ge_abi(a, b) > 0) as i32
     }
     // --- `*vfp`-named boolean comparisons: 1 when the relation holds, else 0 ---
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __gesf2vfp(a: f32, b: f32) -> i32 {
         (ge_abi(a, b) >= 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __gedf2vfp(a: f64, b: f64) -> i32 {
         (ge_abi(a, b) >= 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __gtsf2vfp(a: f32, b: f32) -> i32 {
         (ge_abi(a, b) > 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __gtdf2vfp(a: f64, b: f64) -> i32 {
         (ge_abi(a, b) > 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __ltsf2vfp(a: f32, b: f32) -> i32 {
         (le_abi(a, b) < 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __ltdf2vfp(a: f64, b: f64) -> i32 {
         (le_abi(a, b) < 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __lesf2vfp(a: f32, b: f32) -> i32 {
         (le_abi(a, b) <= 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __ledf2vfp(a: f64, b: f64) -> i32 {
         (le_abi(a, b) <= 0) as i32
     }
     // "Not equal" is true for NaN operands because `le_abi` returns 1 for them.
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __nesf2vfp(a: f32, b: f32) -> i32 {
         (le_abi(a, b) != 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __nedf2vfp(a: f64, b: f64) -> i32 {
         (le_abi(a, b) != 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __eqsf2vfp(a: f32, b: f32) -> i32 {
         (le_abi(a, b) == 0) as i32
     }
     #[slower_than_default]
     #[bootrom_v2]
     extern "C" fn __eqdf2vfp(a: f64, b: f64) -> i32 {
         (le_abi(a, b) == 0) as i32
     }
 }
--- a/embassy-rp/src/float/conv.rs
+++ b/embassy-rp/src/float/conv.rs
@@ -0,0 +1,157 @@
 // Credit: taken from `rp-hal` (also licensed Apache+MIT)
 // https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/conv.rs
 use super::Float;
 use crate::rom_data;
 // Some of these are also not connected in the Pico SDK.  This is probably
 // because the ROM version actually does a fixed point conversion, just with
 // the fractional width set to zero.
 // Integer <-> float and f32 <-> f64 conversion entry points handed to the
 // `intrinsics!` macro (defined elsewhere in the crate).  Most bodies are a
 // direct call into the ROM; only the two float-width conversions at the end
 // handle inf/NaN in software first.
 // NOTE(review): the attribute lines are consumed by the macro; their exact
 // meaning and required order are not visible from this file -- confirm before
 // reordering or editing them.
 intrinsics! {
     // Not connected in the Pico SDK
     #[slower_than_default]
     #[aeabi = __aeabi_i2f]
     extern "C" fn __floatsisf(i: i32) -> f32 {
         rom_data::float_funcs::int_to_float(i)
     }
     // Not connected in the Pico SDK
     #[slower_than_default]
     #[aeabi = __aeabi_i2d]
     extern "C" fn __floatsidf(i: i32) -> f64 {
         rom_data::double_funcs::int_to_double(i)
     }
     // Questionable gain
     // NOTE(review): every other 64-bit conversion in this block carries
     // `#[bootrom_v2]`; confirm whether `int64_to_float` needs it as well.
     #[aeabi = __aeabi_l2f]
     extern "C" fn __floatdisf(i: i64) -> f32 {
         rom_data::float_funcs::int64_to_float(i)
     }
     #[bootrom_v2]
     #[aeabi = __aeabi_l2d]
     extern "C" fn __floatdidf(i: i64) -> f64 {
         rom_data::double_funcs::int64_to_double(i)
     }
     // Not connected in the Pico SDK
     #[slower_than_default]
     #[aeabi = __aeabi_ui2f]
     extern "C" fn __floatunsisf(i: u32) -> f32 {
         rom_data::float_funcs::uint_to_float(i)
     }
     // Questionable gain
     #[bootrom_v2]
     #[aeabi = __aeabi_ui2d]
     extern "C" fn __floatunsidf(i: u32) -> f64 {
         rom_data::double_funcs::uint_to_double(i)
     }
     // Questionable gain
     #[bootrom_v2]
     #[aeabi = __aeabi_ul2f]
     extern "C" fn __floatundisf(i: u64) -> f32 {
         rom_data::float_funcs::uint64_to_float(i)
     }
     #[bootrom_v2]
     #[aeabi = __aeabi_ul2d]
     extern "C" fn __floatundidf(i: u64) -> f64 {
         rom_data::double_funcs::uint64_to_double(i)
     }
     // The Pico SDK does some optimization here (e.g. fast paths for zero and
     // one), but we can just directly connect it.
     #[aeabi = __aeabi_f2iz]
     extern "C" fn __fixsfsi(f: f32) -> i32 {
         rom_data::float_funcs::float_to_int(f)
     }
     #[bootrom_v2]
     #[aeabi = __aeabi_f2lz]
     extern "C" fn __fixsfdi(f: f32) -> i64 {
         rom_data::float_funcs::float_to_int64(f)
     }
     // Not connected in the Pico SDK
     #[slower_than_default]
     #[bootrom_v2]
     #[aeabi = __aeabi_d2iz]
     extern "C" fn __fixdfsi(f: f64) -> i32 {
         rom_data::double_funcs::double_to_int(f)
     }
     // Like with the 32 bit version, there's optimization that we just
     // skip.
     #[bootrom_v2]
     #[aeabi = __aeabi_d2lz]
     extern "C" fn __fixdfdi(f: f64) -> i64 {
         rom_data::double_funcs::double_to_int64(f)
     }
     #[slower_than_default]
     #[aeabi = __aeabi_f2uiz]
     extern "C" fn __fixunssfsi(f: f32) -> u32 {
         rom_data::float_funcs::float_to_uint(f)
     }
     #[slower_than_default]
     #[bootrom_v2]
     #[aeabi = __aeabi_f2ulz]
     extern "C" fn __fixunssfdi(f: f32) -> u64 {
         rom_data::float_funcs::float_to_uint64(f)
     }
     #[slower_than_default]
     #[bootrom_v2]
     #[aeabi = __aeabi_d2uiz]
     extern "C" fn __fixunsdfsi(f: f64) -> u32 {
         rom_data::double_funcs::double_to_uint(f)
     }
     #[slower_than_default]
     #[bootrom_v2]
     #[aeabi = __aeabi_d2ulz]
     extern "C" fn __fixunsdfdi(f: f64) -> u64 {
         rom_data::double_funcs::double_to_uint64(f)
     }
     // f32 -> f64.  inf/NaN are rebuilt by hand: all-ones f64 exponent, the
     // f32 significand bits copied into the low bits (non-zero stays non-zero,
     // so NaN stays NaN), and the sign bit moved from bit 31 to bit 63.
     #[bootrom_v2]
     #[alias = __extendsfdf2vfp]
     #[aeabi = __aeabi_f2d]
     extern "C" fn  __extendsfdf2(f: f32) -> f64 {
         if f.is_not_finite() {
             return f64::from_repr(
                 // Not finite
                 f64::EXPONENT_MASK |
                 // Preserve NaN or inf
                 ((f.repr() & f32::SIGNIFICAND_MASK) as u64) |
                 // Preserve sign
                 ((f.repr() & f32::SIGN_MASK) as u64) << (f64::BITS-f32::BITS)
             );
         }
         rom_data::float_funcs::float_to_double(f)
     }
     // f64 -> f32.  inf/NaN are rebuilt by hand: all-ones f32 exponent plus the
     // sign bit moved from bit 63 to bit 31; a NaN input additionally sets the
     // lowest significand bit so the result is a NaN rather than an infinity.
     #[bootrom_v2]
     #[alias = __truncdfsf2vfp]
     #[aeabi = __aeabi_d2f]
     extern "C" fn __truncdfsf2(f: f64) -> f32 {
         if f.is_not_finite() {
             let mut repr: u32 =
                 // Not finite
                 f32::EXPONENT_MASK |
                 // Preserve sign
                 ((f.repr() & f64::SIGN_MASK) >> (f64::BITS-f32::BITS)) as u32;
             // Set NaN
             if  (f.repr() & f64::SIGNIFICAND_MASK) != 0 {
                 repr |= 1;
             }
             return f32::from_repr(repr);
         }
         rom_data::double_funcs::double_to_float(f)
     }
 }
--- a/embassy-rp/src/float/div.rs
+++ b/embassy-rp/src/float/div.rs
@@ -0,0 +1,141 @@
 // Credit: taken from `rp-hal` (also licensed Apache+MIT)
 // https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/div.rs
 use super::Float;
 use crate::rom_data;
 // Make sure this stays as a separate call, because when it's inlined the
 // compiler will move the save of the registers used to contain the divider
 // state into the function prologue.  That save and restore (push/pop) takes
 // longer than the actual division, so doing it in the common case where
 // they are not required wastes a lot of time.
 /// Runs `f` with the SIO hardware divider state preserved around it.
 ///
 /// Waits for the divider to report ready, snapshots dividend, divisor,
 /// remainder and quotient, calls `f`, then writes the four values back in the
 /// same order and returns whatever `f` returned.
 #[inline(never)]
 #[cold]
 fn save_divider_and_call<F, R>(f: F) -> R
 where
     F: FnOnce() -> R,
 {
     let sio = rp_pac::SIO;
     // NOTE(review): the `unsafe` block is presumably required because the
     // `rp_pac` register reads/writes are unsafe functions -- confirm.
     unsafe {
         // Since we can't save the signed-ness of the calculation, we have to make
         // sure that there's at least an 8 cycle delay before we read the result.
         // The Pico SDK ensures this by using a 6 cycle push and two 1 cycle reads.
         // Since we can't be sure the Rust implementation will optimize to the same,
         // just use an explicit wait.
         while !sio.div().csr().read().ready() {}
         // Read the quotient last, since that's what clears the dirty flag
         let dividend = sio.div().udividend().read();
         let divisor = sio.div().udivisor().read();
         let remainder = sio.div().remainder().read();
         let quotient = sio.div().quotient().read();
         // If we get interrupted here (before a write sets the DIRTY flag) it's fine, since
         // we have the full state, so the interruptor doesn't have to restore it.  Once the
         // write happens and the DIRTY flag is set, the interruptor becomes responsible for
         // restoring our state.
         let result = f();
         // If we are interrupted here, then the interruptor will start an incorrect calculation
         // using a wrong divisor, but we'll restore the divisor and result ourselves correctly.
         // This sets DIRTY, so any interruptor will save the state.
         sio.div().udividend().write_value(dividend);
         // If we are interrupted here, the interruptor may start the calculation using
         // incorrectly signed inputs, but we'll restore the result ourselves.
         // This sets DIRTY, so any interruptor will save the state.
         sio.div().udivisor().write_value(divisor);
         // If we are interrupted here, the interruptor will have restored everything but the
         // quotient may be wrongly signed.  If the calculation started by the above writes is
         // still ongoing it is stopped, so it won't replace the result we're restoring.
         // DIRTY and READY set, but only DIRTY matters to make the interruptor save the state.
         sio.div().remainder().write_value(remainder);
         // State fully restored after the quotient write.  This sets both DIRTY and READY, so
         // whatever we may have interrupted can read the result.
         sio.div().quotient().write_value(quotient);
         result
     }
 }
 /// Runs `f`, saving and restoring the hardware divider state only when the
 /// divider's DIRTY flag says somebody else is in the middle of using it.
 fn save_divider<F, R>(f: F) -> R
 where
     F: FnOnce() -> R,
 {
     // NOTE(review): `unsafe` is presumably needed for the `rp_pac` register read.
     let divider_in_use = unsafe { rp_pac::SIO.div().csr().read().dirty() };
     if divider_in_use {
         // Somebody is waiting for a result: take the slow save/restore path.
         save_divider_and_call(f)
     } else {
         // Not dirty, so nothing is waiting for the calculation.  So we can just
         // issue it directly without a save/restore.
         f()
     }
 }
 /// Division provided by a ROM soft-float routine.
 trait ROMDiv {
     /// Divides `self` by `divisor`; implementors forward to the matching ROM function.
     fn rom_div(self, divisor: Self) -> Self;
 }
 impl ROMDiv for f32 {
     /// Single-precision division via the ROM `fdiv` routine.
     fn rom_div(self, divisor: f32) -> f32 {
         // ROM implementation uses the hardware divider, so we have to save it
         let rom_call = || rom_data::float_funcs::fdiv(self, divisor);
         save_divider(rom_call)
     }
 }
 impl ROMDiv for f64 {
     /// Double-precision division via the ROM `ddiv` routine.
     fn rom_div(self, divisor: f64) -> f64 {
         // ROM implementation uses the hardware divider, so we have to save it
         let rom_call = || rom_data::double_funcs::ddiv(self, divisor);
         save_divider(rom_call)
     }
 }
 /// Divides `a` by `b`, resolving inf/NaN operands and `0 / 0` in software
 /// before handing finite work to the ROM routine.
 ///
 /// * non-finite / non-finite yields `F::NAN`.
 /// * inf / X (X finite, including +/-0) yields an infinity whose sign is the
 ///   XOR of both operand signs, as IEEE 754 requires; NaN / X stays NaN.
 /// * X / NaN yields that NaN, and `0 / 0` yields `F::NAN`.
 /// * Everything else is computed by `rom_div`.
 fn div<F: Float + ROMDiv>(a: F, b: F) -> F {
     if a.is_not_finite() {
         if b.is_not_finite() {
             // inf/NaN / inf/NaN = NaN
             return F::NAN;
         }
         // `b` is finite here.  Division of an infinity by zero is *not* an
         // invalid operation: inf / +-0 = +-inf, so zero needs no special case
         // and only the divisor's sign matters.
         return if b.is_sign_negative() {
             // [+/-]inf/NaN / (-X) = [-/+]inf/NaN
             a.negate()
         } else {
             // [-]inf/NaN / X = [-]inf/NaN
             a
         };
     }
     if b.is_nan() {
         // X / NaN = NaN
         return b;
     }
     // ROM handles X / 0 = [-]inf and X / [-]inf = [-]0, so we only
     // need to catch 0 / 0
     if b.is_zero() && a.is_zero() {
         return F::NAN;
     }
     a.rom_div(b)
 }
 // Soft-float division entry points handed to the `intrinsics!` macro
 // (defined elsewhere in the crate); both forward to the generic `div`.
 // NOTE(review): the attribute lines are consumed by the macro; their exact
 // meaning and required order are not visible from this file -- confirm before
 // reordering or editing them.
 intrinsics! {
     // f32 division.
     #[alias = __divsf3vfp]
     #[aeabi = __aeabi_fdiv]
     extern "C" fn __divsf3(a: f32, b: f32) -> f32 {
         div(a, b)
     }
     // f64 division.
     #[bootrom_v2]
     #[alias = __divdf3vfp]
     #[aeabi = __aeabi_ddiv]
     extern "C" fn __divdf3(a: f64, b: f64) -> f64 {
         div(a, b)
     }
 }
--- a/embassy-rp/src/float/functions.rs
+++ b/embassy-rp/src/float/functions.rs
@@ -0,0 +1,239 @@
 // Credit: taken from `rp-hal` (also licensed Apache+MIT)
 // https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/functions.rs
 use crate::float::{Float, Int};
 use crate::rom_data;
 /// Scientific functions provided by ROM soft-float routines, plus the range
 /// reduction the trigonometric routines need.
 trait ROMFunctions {
     /// Square root.
     fn sqrt(self) -> Self;
     /// Natural logarithm.
     fn ln(self) -> Self;
     /// Exponential function.
     fn exp(self) -> Self;
     /// Sine.
     fn sin(self) -> Self;
     /// Cosine.
     fn cos(self) -> Self;
     /// Tangent.
     fn tan(self) -> Self;
     /// Two-argument arctangent of `self` and `other`.
     fn atan2(self, other: Self) -> Self;
     /// Maps `self` into the range accepted by the ROM trigonometric routines.
     fn to_trig_range(self) -> Self;
 }
 /// Single-precision scientific functions, each forwarded to the ROM routine
 /// of the same name in `rom_data::float_funcs`.
 impl ROMFunctions for f32 {
     fn sqrt(self) -> f32 {
         rom_data::float_funcs::fsqrt(self)
     }
     fn ln(self) -> f32 {
         rom_data::float_funcs::fln(self)
     }
     fn exp(self) -> f32 {
         rom_data::float_funcs::fexp(self)
     }
     fn sin(self) -> f32 {
         rom_data::float_funcs::fsin(self)
     }
     fn cos(self) -> f32 {
         rom_data::float_funcs::fcos(self)
     }
     fn tan(self) -> f32 {
         rom_data::float_funcs::ftan(self)
     }
     fn atan2(self, other: f32) -> f32 {
         rom_data::float_funcs::fatan2(self, other)
     }
     /// Leaves values with |x| < 128 untouched and reduces anything larger
     /// modulo 2*pi.
     fn to_trig_range(self) -> f32 {
         // -128 < X < 128, logic from the Pico SDK
         // A biased exponent of 134 (127 + 7) is the first one with |x| >= 2^7.
         let biased_exponent = (self.repr() & Self::EXPONENT_MASK) >> Self::SIGNIFICAND_BITS;
         if biased_exponent >= 134 {
             return self % (core::f32::consts::PI * 2.0);
         }
         self
     }
 }
 /// Double-precision scientific functions, each forwarded to the ROM routine
 /// of the same name in `rom_data::double_funcs`.
 impl ROMFunctions for f64 {
     fn sqrt(self) -> f64 {
         rom_data::double_funcs::dsqrt(self)
     }
     fn ln(self) -> f64 {
         rom_data::double_funcs::dln(self)
     }
     fn exp(self) -> f64 {
         rom_data::double_funcs::dexp(self)
     }
     fn sin(self) -> f64 {
         rom_data::double_funcs::dsin(self)
     }
     fn cos(self) -> f64 {
         rom_data::double_funcs::dcos(self)
     }
     fn tan(self) -> f64 {
         rom_data::double_funcs::dtan(self)
     }
     fn atan2(self, other: f64) -> f64 {
         rom_data::double_funcs::datan2(self, other)
     }
     /// Leaves values with |x| < 1024 untouched and reduces anything larger
     /// modulo 2*pi.
     fn to_trig_range(self) -> f64 {
         // -1024 < X < 1024, logic from the Pico SDK
         // A biased exponent of 1033 (1023 + 10) is the first one with |x| >= 2^10.
         let biased_exponent = (self.repr() & Self::EXPONENT_MASK) >> Self::SIGNIFICAND_BITS;
         if biased_exponent >= 1033 {
             return self % (core::f64::consts::PI * 2.0);
         }
         self
     }
 }
 /// Returns `true` for every input outside the domain of `sqrt`/`ln`: any value
 /// with the sign bit set other than `-0`, and any NaN.
 fn is_negative_nonzero_or_nan<F: Float>(f: F) -> bool {
     let bits = f.repr();
     let sign_clear = (bits & F::SIGN_MASK) == F::Int::ZERO;
     if sign_clear {
         // NaN
         return (bits & (F::EXPONENT_MASK | F::SIGNIFICAND_MASK)) > F::EXPONENT_MASK;
     }
     // Negative, so anything other than exactly zero
     (bits & (!F::SIGN_MASK)) != F::Int::ZERO
 }
 /// Square root: `F::NAN` for negative non-zero or NaN input, otherwise the
 /// ROM result.
 fn sqrt<F: Float + ROMFunctions>(f: F) -> F {
     if !is_negative_nonzero_or_nan(f) {
         return f.sqrt();
     }
     F::NAN
 }
 /// Natural logarithm: `F::NAN` for negative non-zero or NaN input, otherwise
 /// the ROM result.
 fn ln<F: Float + ROMFunctions>(f: F) -> F {
     if !is_negative_nonzero_or_nan(f) {
         return f.ln();
     }
     F::NAN
 }
 /// Exponential: NaN input yields `F::NAN`, everything else goes to the ROM.
 fn exp<F: Float + ROMFunctions>(f: F) -> F {
     if f.is_nan() {
         return F::NAN;
     }
     f.exp()
 }
 /// Sine: inf/NaN input yields `F::NAN`; finite input is range-reduced and
 /// passed to the ROM.
 fn sin<F: Float + ROMFunctions>(f: F) -> F {
     if f.is_not_finite() {
         return F::NAN;
     }
     let reduced = f.to_trig_range();
     reduced.sin()
 }
 /// Cosine: inf/NaN input yields `F::NAN`; finite input is range-reduced and
 /// passed to the ROM.
 fn cos<F: Float + ROMFunctions>(f: F) -> F {
     if f.is_not_finite() {
         return F::NAN;
     }
     let reduced = f.to_trig_range();
     reduced.cos()
 }
 /// Tangent: inf/NaN input yields `F::NAN`; finite input is range-reduced and
 /// passed to the ROM.
 fn tan<F: Float + ROMFunctions>(f: F) -> F {
     if f.is_not_finite() {
         return F::NAN;
     }
     let reduced = f.to_trig_range();
     reduced.tan()
 }
 /// Two-argument arctangent: `F::NAN` if either operand is NaN, otherwise the
 /// ROM result for the operands exactly as given.
 ///
 /// Unlike `sin`/`cos`/`tan`, the operands are coordinates rather than angles,
 /// so they must not be reduced modulo 2*pi: doing so changes their ratio and
 /// therefore the resulting angle for any large first operand.
 fn atan2<F: Float + ROMFunctions>(x: F, y: F) -> F {
     if x.is_nan() || y.is_nan() {
         F::NAN
     } else {
         x.atan2(y)
     }
 }
 // Name collisions
 // The exported symbols (`sqrt`, `exp`, `sin`, ...) share their names with the
 // generic helpers of the parent module, so they live in their own module and
 // reach the helpers through `super::`.
 // NOTE(review): the attribute lines are consumed by the `intrinsics!` macro
 // (defined elsewhere in the crate); their exact meaning and required order
 // are not visible from this file -- confirm before reordering them.
 mod intrinsics {
     intrinsics! {
         // Square root (f32 / f64).
         extern "C" fn sqrtf(f: f32) -> f32 {
             super::sqrt(f)
         }
         #[bootrom_v2]
         extern "C" fn sqrt(f: f64) -> f64 {
             super::sqrt(f)
         }
         // Natural logarithm (f32 / f64).
         extern "C" fn logf(f: f32) -> f32 {
             super::ln(f)
         }
         #[bootrom_v2]
         extern "C" fn log(f: f64) -> f64 {
             super::ln(f)
         }
         // Exponential (f32 / f64).
         extern "C" fn expf(f: f32) -> f32 {
             super::exp(f)
         }
         #[bootrom_v2]
         extern "C" fn exp(f: f64) -> f64 {
             super::exp(f)
         }
         // Sine (f32 / f64).
         #[slower_than_default]
         extern "C" fn sinf(f: f32) -> f32 {
             super::sin(f)
         }
         #[slower_than_default]
         #[bootrom_v2]
         extern "C" fn sin(f: f64) -> f64 {
             super::sin(f)
         }
         // Cosine (f32 / f64).
         #[slower_than_default]
         extern "C" fn cosf(f: f32) -> f32 {
             super::cos(f)
         }
         #[slower_than_default]
         #[bootrom_v2]
         extern "C" fn cos(f: f64) -> f64 {
             super::cos(f)
         }
         // Tangent (f32 / f64).
         #[slower_than_default]
         extern "C" fn tanf(f: f32) -> f32 {
             super::tan(f)
         }
         #[slower_than_default]
         #[bootrom_v2]
         extern "C" fn tan(f: f64) -> f64 {
             super::tan(f)
         }
         // Questionable gain
         #[bootrom_v2]
         extern "C" fn atan2f(a: f32, b: f32) -> f32 {
             super::atan2(a, b)
         }
         // Questionable gain
         #[bootrom_v2]
         extern "C" fn atan2(a: f64, b: f64) -> f64 {
             super::atan2(a, b)
         }
     }
 }
--- a/embassy-rp/src/float/mod.rs
+++ b/embassy-rp/src/float/mod.rs
@@ -0,0 +1,149 @@
 // Credit: taken from `rp-hal` (also licensed Apache+MIT)
 // https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/mod.rs
 use core::ops;
 // Borrowed and simplified from compiler-builtins so we can use bit ops
 // on floating point without macro soup.
 /// Unsigned integer with the same width as a float type, used for bit-level
 /// inspection of float values.
 pub(crate) trait Int:
     // Plain value semantics and comparisons
     Copy
     + core::fmt::Debug
     + PartialEq
     + PartialOrd
     // Arithmetic
     + ops::Add<Output = Self>
     + ops::Sub<Output = Self>
     + ops::Div<Output = Self>
     + ops::AddAssign
     + ops::SubAssign
     // Bitwise logic
     + ops::BitAnd<Output = Self>
     + ops::BitOr<Output = Self>
     + ops::BitXor<Output = Self>
     + ops::Not<Output = Self>
     + ops::BitAndAssign
     + ops::BitOrAssign
     + ops::BitXorAssign
     // Shifts
     + ops::Shl<u32, Output = Self>
     + ops::Shr<u32, Output = Self>
     + ops::ShlAssign<i32>
     + ops::ShrAssign<u32>
 {
     /// The value zero.
     const ZERO: Self;
 }
 macro_rules! int_impl {
    ($ty:ty) => {
        impl Int for $ty {
            const ZERO: Self = 0;
        }
    };
 }
 int_impl!(u32);
 int_impl!(u64);
 /// Bit-level view of a floating point type: layout constants, raw
 /// representation access and cheap classification helpers.
 pub(crate) trait Float:
     Copy
     + core::fmt::Debug
     + PartialEq
     + PartialOrd
     + ops::AddAssign
     + ops::MulAssign
     + ops::Add<Output = Self>
     + ops::Sub<Output = Self>
     + ops::Div<Output = Self>
     + ops::Rem<Output = Self>
 {
     /// A uint of the same width as the float
     type Int: Int;
     /// NaN representation for the float
     const NAN: Self;
     /// The bitwidth of the float type
     const BITS: u32;
     /// The bitwidth of the significand
     const SIGNIFICAND_BITS: u32;
     /// A mask for the sign bit
     const SIGN_MASK: Self::Int;
     /// A mask for the significand
     const SIGNIFICAND_MASK: Self::Int;
     /// A mask for the exponent
     const EXPONENT_MASK: Self::Int;
     /// Returns `self` transmuted to `Self::Int`
     fn repr(self) -> Self::Int;
     /// Returns a `Self::Int` transmuted back to `Self`
     fn from_repr(a: Self::Int) -> Self;
     /// Return a sign swapped `self`
     fn negate(self) -> Self;
     /// Returns true if `self` is either NaN or infinity
     fn is_not_finite(self) -> bool {
         // An all-ones exponent field marks both infinities and NaNs.
         let exponent = self.repr() & Self::EXPONENT_MASK;
         exponent == Self::EXPONENT_MASK
     }
     /// Returns true if `self` is infinity
     fn is_infinity(self) -> bool {
         // All-ones exponent with an empty significand; the sign is ignored.
         let magnitude = self.repr() & (Self::EXPONENT_MASK | Self::SIGNIFICAND_MASK);
         magnitude == Self::EXPONENT_MASK
     }
     /// Returns true if `self` is NaN
     fn is_nan(self) -> bool {
         // All-ones exponent with a non-empty significand compares greater
         // than the bare exponent mask.
         let magnitude = self.repr() & (Self::EXPONENT_MASK | Self::SIGNIFICAND_MASK);
         magnitude > Self::EXPONENT_MASK
     }
     /// Returns true if `self` is negative
     fn is_sign_negative(self) -> bool {
         let sign = self.repr() & Self::SIGN_MASK;
         sign != Self::Int::ZERO
     }
     /// Returns true if `self` is zero (either sign)
     fn is_zero(self) -> bool {
         let magnitude = self.repr() & (Self::SIGNIFICAND_MASK | Self::EXPONENT_MASK);
         magnitude == Self::Int::ZERO
     }
 }
 /// Implements `Float` for a float type given its same-width unsigned integer,
 /// its total bit width and the bit width of its significand.
 macro_rules! float_impl {
     ($float:ident, $uint:ident, $width:expr, $mantissa:expr) => {
         impl Float for $float {
             type Int = $uint;
             const NAN: Self = <$float>::NAN;
             const BITS: u32 = $width;
             const SIGNIFICAND_BITS: u32 = $mantissa;
             // Top bit only.
             const SIGN_MASK: Self::Int = 1 << (Self::BITS - 1);
             // Low `SIGNIFICAND_BITS` bits.
             const SIGNIFICAND_MASK: Self::Int = (1 << Self::SIGNIFICAND_BITS) - 1;
             // Whatever sits between the sign bit and the significand.
             const EXPONENT_MASK: Self::Int = !(Self::SIGN_MASK | Self::SIGNIFICAND_MASK);
             fn repr(self) -> Self::Int {
                 <$float>::to_bits(self)
             }
             fn from_repr(bits: Self::Int) -> Self {
                 <$float>::from_bits(bits)
             }
             fn negate(self) -> Self {
                 -self
             }
         }
     };
 }
 float_impl!(f32, u32, 32, 23);
 float_impl!(f64, u64, 64, 52);
 mod add_sub;
 mod cmp;
 mod conv;
 mod div;
 mod functions;
 mod mul;
--- a/embassy-rp/src/float/mul.rs
+++ b/embassy-rp/src/float/mul.rs
@@ -0,0 +1,70 @@
 // Credit: taken from `rp-hal` (also licensed Apache+MIT)
 // https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/mul.rs
 use super::Float;
 use crate::rom_data;
 /// Multiplication provided by a ROM soft-float routine.
 trait ROMMul {
     /// Multiplies `self` by `rhs`; implementors forward to the matching ROM function.
     fn rom_mul(self, rhs: Self) -> Self;
 }
 impl ROMMul for f32 {
     /// Single-precision multiplication, forwarded to the ROM `fmul` routine.
     fn rom_mul(self, rhs: f32) -> f32 {
         rom_data::float_funcs::fmul(self, rhs)
     }
 }
 impl ROMMul for f64 {
     /// Double-precision multiplication, forwarded to the ROM `dmul` routine.
     fn rom_mul(self, rhs: f64) -> f64 {
         rom_data::double_funcs::dmul(self, rhs)
     }
 }
 /// Multiplies `a` by `b`, resolving inf/NaN operands in software and handing
 /// only finite pairs to the ROM routine.
 ///
 /// * inf/NaN times zero, and anything times NaN, yields NaN.
 /// * An infinity times a non-zero value yields an infinity whose sign is the
 ///   XOR of both operand signs.
 /// * Finite operands are multiplied by `rom_mul`.
 fn mul<F: Float + ROMMul>(a: F, b: F) -> F {
     if a.is_not_finite() {
         if b.is_zero() || b.is_nan() {
             // [-]inf/NaN * 0 = NaN, and inf * NaN must not collapse to inf
             return F::NAN;
         }
         return if b.is_sign_negative() {
             // [+/-]inf/NaN * (-X) = [-/+]inf/NaN
             a.negate()
         } else {
             // [-]inf/NaN * X = [-]inf/NaN
             a
         };
     }
     if b.is_not_finite() {
         if a.is_zero() {
             // 0 * [-]inf/NaN = NaN
             return F::NAN;
         }
         // `a` is finite and non-zero here, so it only contributes its sign.
         // The sign that decides whether `b` gets flipped is therefore `a`'s,
         // not `b`'s own.
         return if a.is_sign_negative() {
             // (-X) * [+/-]inf/NaN = [-/+]inf/NaN
             b.negate()
         } else {
             // X * [-]inf/NaN = [-]inf/NaN
             b
         };
     }
     a.rom_mul(b)
 }
 // Soft-float multiplication entry points handed to the `intrinsics!` macro
 // (defined elsewhere in the crate); both forward to the generic `mul`.
 // NOTE(review): the attribute lines are consumed by the macro; their exact
 // meaning and required order are not visible from this file -- confirm before
 // reordering or editing them.
 intrinsics! {
     // f32 multiplication.
     #[alias = __mulsf3vfp]
     #[aeabi = __aeabi_fmul]
     extern "C" fn __mulsf3(a: f32, b: f32) -> f32 {
         mul(a, b)
     }
     // f64 multiplication.
     #[bootrom_v2]
     #[alias = __muldf3vfp]
     #[aeabi = __aeabi_dmul]
     extern "C" fn __muldf3(a: f64, b: f64) -> f64 {
         mul(a, b)
     }
 }
--- a/embassy-rp/src/lib.rs
+++ b/embassy-rp/src/lib.rs
@@ -12,6 +12,7 @@ mod intrinsics;
 pub mod adc;
 pub mod dma;
 mod float;
 pub mod gpio;
 pub mod i2c;
 pub mod interrupt;
--- a/tests/rp/.cargo/config.toml
+++ b/tests/rp/.cargo/config.toml
@@ -1,6 +1,8 @@
 [unstable]
-build-std = ["core"]
+# enabling these breaks the float tests during linking, with intrinsics
-build-std-features = ["panic_immediate_abort"]
+# duplicated between embassy-rp and compiler_builtins
 #build-std = ["core"]
 #build-std-features = ["panic_immediate_abort"]
 [target.'cfg(all(target_arch = "arm", target_os = "none"))']
 #runner = "teleprobe client run --target rpi-pico --elf"
--- a/tests/rp/Cargo.toml
+++ b/tests/rp/Cargo.toml
@@ -8,7 +8,7 @@ license = "MIT OR Apache-2.0"
 embassy-sync = { version = "0.2.0", path = "../../embassy-sync", features = ["defmt"] }
 embassy-executor = { version = "0.1.0", path = "../../embassy-executor", features = ["arch-cortex-m", "executor-thread", "defmt", "integrated-timers"] }
 embassy-time = { version = "0.1.0", path = "../../embassy-time", features = ["defmt"] }
-embassy-rp = { version = "0.1.0", path = "../../embassy-rp", features = ["nightly", "defmt", "unstable-pac", "unstable-traits", "time-driver", "critical-section-impl"]  }
+embassy-rp = { version = "0.1.0", path = "../../embassy-rp", features = ["nightly", "defmt", "unstable-pac", "unstable-traits", "time-driver", "critical-section-impl", "intrinsics", "rom-v2-intrinsics"]  }
 embassy-futures = { version = "0.1.0", path = "../../embassy-futures" }
 defmt = "0.3.0"
--- a/tests/rp/src/bin/float.rs
+++ b/tests/rp/src/bin/float.rs
@@ -0,0 +1,53 @@
 #![no_std]
 #![no_main]
 #![feature(type_alias_impl_trait)]
 use defmt::*;
 use embassy_executor::Spawner;
 use embassy_rp::pac;
 use embassy_time::{Duration, Timer};
 use {defmt_rtt as _, panic_probe as _};
 // On-target test: performs a batch of f32 and f64 arithmetic and then checks,
 // through bus performance counter 0, that the ROM was accessed often enough
 // for those operations to have been served by the ROM routines.
 #[embassy_executor::main]
 async fn main(_spawner: Spawner) {
     embassy_rp::init(Default::default());
     info!("Hello World!");
     const PI_F: f32 = 3.1415926535f32;
     const PI_D: f64 = 3.14159265358979323846f64;
     // Select the `ROM` event for performance counter 0 before doing any
     // float work, so the counter covers the whole loop below.
     unsafe {
         pac::BUSCTRL
             .perfsel(0)
             .write(|r| r.set_perfsel(pac::busctrl::vals::Perfsel::ROM));
     }
     for i in 0..=360 {
         // Six f32 operations per iteration: int -> float conversion, multiply,
         // divide, subtract, add and remainder.
         let rad_f = (i as f32) * PI_F / 180.0;
         info!(
             "{}° float: {=f32} / {=f32} / {=f32} / {=f32}",
             i,
             rad_f,
             rad_f - PI_F,
             rad_f + PI_F,
             rad_f % PI_F
         );
         // The same six operations again in f64.
         let rad_d = (i as f64) * PI_D / 180.0;
         info!(
             "{}° double: {=f64} / {=f64} / {=f64} / {=f64}",
             i,
             rad_d,
             rad_d - PI_D,
             rad_d + PI_D,
             rad_d % PI_D
         );
         Timer::after(Duration::from_millis(10)).await;
     }
     let rom_accesses = unsafe { pac::BUSCTRL.perfctr(0).read().perfctr() };
     // every float operation used here uses at least 10 cycles
     // Lower bound: 360 iterations x 12 operations x 10 accesses each (the loop
     // actually runs 361 times, so this is deliberately conservative).
     defmt::assert!(rom_accesses >= 360 * 12 * 10);
     info!("Test OK");
     cortex_m::asm::bkpt();
 }