Merge branch 'embassy-rs:master' into embassy-boot-stable

2023-04-20 10:29:16 +02:00
parent 7d64de153f 9b51c8f4d4
commit 43c20dbe65
15 changed files with 1214 additions and 128 deletions
--- a/.github/workflows/doc.yml
+++ b/.github/workflows/doc.yml
@@ -61,6 +61,7 @@ jobs:
          mkdir crates
          builder ./embassy-boot/boot crates/embassy-boot/git.zup
          builder ./embassy-boot/nrf crates/embassy-boot-nrf/git.zup
+          builder ./embassy-boot/rp crates/embassy-boot-rp/git.zup
          builder ./embassy-boot/stm32 crates/embassy-boot-stm32/git.zup
          builder ./embassy-cortex-m crates/embassy-cortex-m/git.zup
          builder ./embassy-embedded-hal crates/embassy-embedded-hal/git.zup
@@ -84,5 +85,3 @@ jobs:
          echo "${{secrets.KUBECONFIG}}" > ~/.kube/config
          POD=$(kubectl -n embassy get po -l app=docserver -o jsonpath={.items[0].metadata.name})
          kubectl cp crates $POD:/data
-
-          
--- a/embassy-boot/boot/README.md
+++ b/embassy-boot/boot/README.md
@@ -13,6 +13,7 @@ By design, the bootloader does not provide any network capabilities. Networking
 The bootloader supports different hardware in separate crates:

 * `embassy-boot-nrf` - for the nRF microcontrollers.
+* `embassy-boot-rp` - for the RP2040 microcontrollers.
 * `embassy-boot-stm32` - for the STM32 microcontrollers.

 ## Minimum supported Rust version (MSRV)
--- a/embassy-rp/src/float/add_sub.rs
+++ b/embassy-rp/src/float/add_sub.rs
@@ -0,0 +1,92 @@
+// Credit: taken from `rp-hal` (also licensed Apache+MIT)
+// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/add_sub.rs
+
+use super::{Float, Int};
+use crate::rom_data;
+
+trait ROMAdd {
+    fn rom_add(self, b: Self) -> Self;
+}
+
+impl ROMAdd for f32 {
+    fn rom_add(self, b: Self) -> Self {
+        rom_data::float_funcs::fadd(self, b)
+    }
+}
+
+impl ROMAdd for f64 {
+    fn rom_add(self, b: Self) -> Self {
+        rom_data::double_funcs::dadd(self, b)
+    }
+}
+
+fn add<F: Float + ROMAdd>(a: F, b: F) -> F {
+    if a.is_not_finite() {
+        if b.is_not_finite() {
+            let class_a = a.repr() & (F::SIGNIFICAND_MASK | F::SIGN_MASK);
+            let class_b = b.repr() & (F::SIGNIFICAND_MASK | F::SIGN_MASK);
+
+            if class_a == F::Int::ZERO && class_b == F::Int::ZERO {
+                // inf + inf = inf
+                return a;
+            }
+            if class_a == F::SIGN_MASK && class_b == F::SIGN_MASK {
+                // -inf + (-inf) = -inf
+                return a;
+            }
+
+            // Sign mismatch, or either is NaN already
+            return F::NAN;
+        }
+
+        // [-]inf/NaN + X = [-]inf/NaN
+        return a;
+    }
+
+    if b.is_not_finite() {
+        // X + [-]inf/NaN = [-]inf/NaN
+        return b;
+    }
+
+    a.rom_add(b)
+}
+
+intrinsics! {
+    #[alias = __addsf3vfp]
+    #[aeabi = __aeabi_fadd]
+    extern "C" fn __addsf3(a: f32, b: f32) -> f32 {
+        add(a, b)
+    }
+
+    #[bootrom_v2]
+    #[alias = __adddf3vfp]
+    #[aeabi = __aeabi_dadd]
+    extern "C" fn __adddf3(a: f64, b: f64) -> f64 {
+        add(a, b)
+    }
+
+    // The ROM just implements subtraction the same way, so just do it here
+    // and save the work of implementing more complicated NaN/inf handling.
+
+    #[alias = __subsf3vfp]
+    #[aeabi = __aeabi_fsub]
+    extern "C" fn __subsf3(a: f32, b: f32) -> f32 {
+        add(a, -b)
+    }
+
+    #[bootrom_v2]
+    #[alias = __subdf3vfp]
+    #[aeabi = __aeabi_dsub]
+    extern "C" fn __subdf3(a: f64, b: f64) -> f64 {
+        add(a, -b)
+    }
+
+    extern "aapcs" fn __aeabi_frsub(a: f32, b: f32) -> f32 {
+        add(b, -a)
+    }
+
+    #[bootrom_v2]
+    extern "aapcs" fn __aeabi_drsub(a: f64, b: f64) -> f64 {
+        add(b, -a)
+    }
+}
--- a/embassy-rp/src/float/cmp.rs
+++ b/embassy-rp/src/float/cmp.rs
@@ -0,0 +1,201 @@
+// Credit: taken from `rp-hal` (also licensed Apache+MIT)
+// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/cmp.rs
+
+use super::Float;
+use crate::rom_data;
+
+trait ROMCmp {
+    fn rom_cmp(self, b: Self) -> i32;
+}
+
+impl ROMCmp for f32 {
+    fn rom_cmp(self, b: Self) -> i32 {
+        rom_data::float_funcs::fcmp(self, b)
+    }
+}
+
+impl ROMCmp for f64 {
+    fn rom_cmp(self, b: Self) -> i32 {
+        rom_data::double_funcs::dcmp(self, b)
+    }
+}
+
+fn le_abi<F: Float + ROMCmp>(a: F, b: F) -> i32 {
+    if a.is_nan() || b.is_nan() {
+        1
+    } else {
+        a.rom_cmp(b)
+    }
+}
+
+fn ge_abi<F: Float + ROMCmp>(a: F, b: F) -> i32 {
+    if a.is_nan() || b.is_nan() {
+        -1
+    } else {
+        a.rom_cmp(b)
+    }
+}
+
+intrinsics! {
+    #[slower_than_default]
+    #[bootrom_v2]
+    #[alias = __eqsf2, __ltsf2, __nesf2]
+    extern "C" fn __lesf2(a: f32, b: f32) -> i32 {
+        le_abi(a, b)
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    #[alias = __eqdf2, __ltdf2, __nedf2]
+    extern "C" fn __ledf2(a: f64, b: f64) -> i32 {
+        le_abi(a, b)
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    #[alias = __gtsf2]
+    extern "C" fn __gesf2(a: f32, b: f32) -> i32 {
+        ge_abi(a, b)
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    #[alias = __gtdf2]
+    extern "C" fn __gedf2(a: f64, b: f64) -> i32 {
+        ge_abi(a, b)
+    }
+
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "aapcs" fn __aeabi_fcmple(a: f32, b: f32) -> i32 {
+        (le_abi(a, b) <= 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "aapcs" fn __aeabi_fcmpge(a: f32, b: f32) -> i32 {
+        (ge_abi(a, b) >= 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "aapcs" fn __aeabi_fcmpeq(a: f32, b: f32) -> i32 {
+        (le_abi(a, b) == 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "aapcs" fn __aeabi_fcmplt(a: f32, b: f32) -> i32 {
+        (le_abi(a, b) < 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "aapcs" fn __aeabi_fcmpgt(a: f32, b: f32) -> i32 {
+        (ge_abi(a, b) > 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "aapcs" fn __aeabi_dcmple(a: f64, b: f64) -> i32 {
+        (le_abi(a, b) <= 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "aapcs" fn __aeabi_dcmpge(a: f64, b: f64) -> i32 {
+        (ge_abi(a, b) >= 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "aapcs" fn __aeabi_dcmpeq(a: f64, b: f64) -> i32 {
+        (le_abi(a, b) == 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "aapcs" fn __aeabi_dcmplt(a: f64, b: f64) -> i32 {
+        (le_abi(a, b) < 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "aapcs" fn __aeabi_dcmpgt(a: f64, b: f64) -> i32 {
+        (ge_abi(a, b) > 0) as i32
+    }
+
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __gesf2vfp(a: f32, b: f32) -> i32 {
+        (ge_abi(a, b) >= 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __gedf2vfp(a: f64, b: f64) -> i32 {
+        (ge_abi(a, b) >= 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __gtsf2vfp(a: f32, b: f32) -> i32 {
+        (ge_abi(a, b) > 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __gtdf2vfp(a: f64, b: f64) -> i32 {
+        (ge_abi(a, b) > 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __ltsf2vfp(a: f32, b: f32) -> i32 {
+        (le_abi(a, b) < 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __ltdf2vfp(a: f64, b: f64) -> i32 {
+        (le_abi(a, b) < 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __lesf2vfp(a: f32, b: f32) -> i32 {
+        (le_abi(a, b) <= 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __ledf2vfp(a: f64, b: f64) -> i32 {
+        (le_abi(a, b) <= 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __nesf2vfp(a: f32, b: f32) -> i32 {
+        (le_abi(a, b) != 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __nedf2vfp(a: f64, b: f64) -> i32 {
+        (le_abi(a, b) != 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __eqsf2vfp(a: f32, b: f32) -> i32 {
+        (le_abi(a, b) == 0) as i32
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    extern "C" fn __eqdf2vfp(a: f64, b: f64) -> i32 {
+        (le_abi(a, b) == 0) as i32
+    }
+}
--- a/embassy-rp/src/float/conv.rs
+++ b/embassy-rp/src/float/conv.rs
@@ -0,0 +1,157 @@
+// Credit: taken from `rp-hal` (also licensed Apache+MIT)
+// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/conv.rs
+
+use super::Float;
+use crate::rom_data;
+
+// Some of these are also not connected in the Pico SDK.  This is probably
+// because the ROM version actually does a fixed point conversion, just with
+// the fractional width set to zero.
+
+intrinsics! {
+    // Not connected in the Pico SDK
+    #[slower_than_default]
+    #[aeabi = __aeabi_i2f]
+    extern "C" fn __floatsisf(i: i32) -> f32 {
+        rom_data::float_funcs::int_to_float(i)
+    }
+
+    // Not connected in the Pico SDK
+    #[slower_than_default]
+    #[aeabi = __aeabi_i2d]
+    extern "C" fn __floatsidf(i: i32) -> f64 {
+        rom_data::double_funcs::int_to_double(i)
+    }
+
+    // Questionable gain
+    #[aeabi = __aeabi_l2f]
+    extern "C" fn __floatdisf(i: i64) -> f32 {
+        rom_data::float_funcs::int64_to_float(i)
+    }
+
+    #[bootrom_v2]
+    #[aeabi = __aeabi_l2d]
+    extern "C" fn __floatdidf(i: i64) -> f64 {
+        rom_data::double_funcs::int64_to_double(i)
+    }
+
+    // Not connected in the Pico SDK
+    #[slower_than_default]
+    #[aeabi = __aeabi_ui2f]
+    extern "C" fn __floatunsisf(i: u32) -> f32 {
+        rom_data::float_funcs::uint_to_float(i)
+    }
+
+    // Questionable gain
+    #[bootrom_v2]
+    #[aeabi = __aeabi_ui2d]
+    extern "C" fn __floatunsidf(i: u32) -> f64 {
+        rom_data::double_funcs::uint_to_double(i)
+    }
+
+    // Questionable gain
+    #[bootrom_v2]
+    #[aeabi = __aeabi_ul2f]
+    extern "C" fn __floatundisf(i: u64) -> f32 {
+        rom_data::float_funcs::uint64_to_float(i)
+    }
+
+    #[bootrom_v2]
+    #[aeabi = __aeabi_ul2d]
+    extern "C" fn __floatundidf(i: u64) -> f64 {
+        rom_data::double_funcs::uint64_to_double(i)
+    }
+
+
+    // The Pico SDK does some optimization here (e.g. fast paths for zero and
+    // one), but we can just directly connect it.
+    #[aeabi = __aeabi_f2iz]
+    extern "C" fn __fixsfsi(f: f32) -> i32 {
+        rom_data::float_funcs::float_to_int(f)
+    }
+
+    #[bootrom_v2]
+    #[aeabi = __aeabi_f2lz]
+    extern "C" fn __fixsfdi(f: f32) -> i64 {
+        rom_data::float_funcs::float_to_int64(f)
+    }
+
+    // Not connected in the Pico SDK
+    #[slower_than_default]
+    #[bootrom_v2]
+    #[aeabi = __aeabi_d2iz]
+    extern "C" fn __fixdfsi(f: f64) -> i32 {
+        rom_data::double_funcs::double_to_int(f)
+    }
+
+    // Like with the 32 bit version, there's optimization that we just
+    // skip.
+    #[bootrom_v2]
+    #[aeabi = __aeabi_d2lz]
+    extern "C" fn __fixdfdi(f: f64) -> i64 {
+        rom_data::double_funcs::double_to_int64(f)
+    }
+
+    #[slower_than_default]
+    #[aeabi = __aeabi_f2uiz]
+    extern "C" fn __fixunssfsi(f: f32) -> u32 {
+        rom_data::float_funcs::float_to_uint(f)
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    #[aeabi = __aeabi_f2ulz]
+    extern "C" fn __fixunssfdi(f: f32) -> u64 {
+        rom_data::float_funcs::float_to_uint64(f)
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    #[aeabi = __aeabi_d2uiz]
+    extern "C" fn __fixunsdfsi(f: f64) -> u32 {
+        rom_data::double_funcs::double_to_uint(f)
+    }
+
+    #[slower_than_default]
+    #[bootrom_v2]
+    #[aeabi = __aeabi_d2ulz]
+    extern "C" fn __fixunsdfdi(f: f64) -> u64 {
+        rom_data::double_funcs::double_to_uint64(f)
+    }
+
+    #[bootrom_v2]
+    #[alias = __extendsfdf2vfp]
+    #[aeabi = __aeabi_f2d]
+    extern "C" fn  __extendsfdf2(f: f32) -> f64 {
+        if f.is_not_finite() {
+            return f64::from_repr(
+                // Not finite
+                f64::EXPONENT_MASK |
+                // Preserve NaN or inf
+                ((f.repr() & f32::SIGNIFICAND_MASK) as u64) |
+                // Preserve sign
+                ((f.repr() & f32::SIGN_MASK) as u64) << (f64::BITS-f32::BITS)
+            );
+        }
+        rom_data::float_funcs::float_to_double(f)
+    }
+
+    #[bootrom_v2]
+    #[alias = __truncdfsf2vfp]
+    #[aeabi = __aeabi_d2f]
+    extern "C" fn __truncdfsf2(f: f64) -> f32 {
+        if f.is_not_finite() {
+            let mut repr: u32 =
+                // Not finite
+                f32::EXPONENT_MASK |
+                // Preserve sign
+                ((f.repr() & f64::SIGN_MASK) >> (f64::BITS-f32::BITS)) as u32;
+            // Set NaN
+            if  (f.repr() & f64::SIGNIFICAND_MASK) != 0 {
+                repr |= 1;
+            }
+            return f32::from_repr(repr);
+        }
+        rom_data::double_funcs::double_to_float(f)
+    }
+}
--- a/embassy-rp/src/float/div.rs
+++ b/embassy-rp/src/float/div.rs
@@ -0,0 +1,141 @@
+// Credit: taken from `rp-hal` (also licensed Apache+MIT)
+// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/div.rs
+
+use super::Float;
+use crate::rom_data;
+
+// Make sure this stays as a separate call, because when it's inlined the
+// compiler will move the save of the registers used to contain the divider
+// state into the function prologue.  That save and restore (push/pop) takes
+// longer than the actual division, so doing it in the common case where
+// they are not required wastes a lot of time.
+#[inline(never)]
+#[cold]
+fn save_divider_and_call<F, R>(f: F) -> R
+where
+    F: FnOnce() -> R,
+{
+    let sio = rp_pac::SIO;
+
+    unsafe {
+        // Since we can't save the signed-ness of the calculation, we have to make
+        // sure that there's at least an 8 cycle delay before we read the result.
+        // The Pico SDK ensures this by using a 6 cycle push and two 1 cycle reads.
+        // Since we can't be sure the Rust implementation will optimize to the same,
+        // just use an explicit wait.
+        while !sio.div().csr().read().ready() {}
+
+        // Read the quotient last, since that's what clears the dirty flag
+        let dividend = sio.div().udividend().read();
+        let divisor = sio.div().udivisor().read();
+        let remainder = sio.div().remainder().read();
+        let quotient = sio.div().quotient().read();
+
+        // If we get interrupted here (before a write sets the DIRTY flag) it's fine, since
+        // we have the full state, so the interruptor doesn't have to restore it.  Once the
+        // write happens and the DIRTY flag is set, the interruptor becomes responsible for
+        // restoring our state.
+        let result = f();
+
+        // If we are interrupted here, then the interruptor will start an incorrect calculation
+        // using a wrong divisor, but we'll restore the divisor and result ourselves correctly.
+        // This sets DIRTY, so any interruptor will save the state.
+        sio.div().udividend().write_value(dividend);
+        // If we are interrupted here, the interruptor may start the calculation using
+        // incorrectly signed inputs, but we'll restore the result ourselves.
+        // This sets DIRTY, so any interruptor will save the state.
+        sio.div().udivisor().write_value(divisor);
+        // If we are interrupted here, the interruptor will have restored everything but the
+        // quotient may be wrongly signed.  If the calculation started by the above writes is
+        // still ongoing it is stopped, so it won't replace the result we're restoring.
+        // DIRTY and READY set, but only DIRTY matters to make the interruptor save the state.
+        sio.div().remainder().write_value(remainder);
+        // State fully restored after the quotient write.  This sets both DIRTY and READY, so
+        // whatever we may have interrupted can read the result.
+        sio.div().quotient().write_value(quotient);
+
+        result
+    }
+}
+
+fn save_divider<F, R>(f: F) -> R
+where
+    F: FnOnce() -> R,
+{
+    let sio = rp_pac::SIO;
+    if unsafe { !sio.div().csr().read().dirty() } {
+        // Not dirty, so nothing is waiting for the calculation.  So we can just
+        // issue it directly without a save/restore.
+        f()
+    } else {
+        save_divider_and_call(f)
+    }
+}
+
+trait ROMDiv {
+    fn rom_div(self, b: Self) -> Self;
+}
+
+impl ROMDiv for f32 {
+    fn rom_div(self, b: Self) -> Self {
+        // ROM implementation uses the hardware divider, so we have to save it
+        save_divider(|| rom_data::float_funcs::fdiv(self, b))
+    }
+}
+
+impl ROMDiv for f64 {
+    fn rom_div(self, b: Self) -> Self {
+        // ROM implementation uses the hardware divider, so we have to save it
+        save_divider(|| rom_data::double_funcs::ddiv(self, b))
+    }
+}
+
+/// Divides `a` by `b`, settling inf/NaN operands and `0 / 0` before the ROM
+/// routine: `inf / inf`, `0 / 0` and NaN operands give NaN, while `inf / X`
+/// (X finite, zero included) stays infinite with the combined sign.
+fn div<F: Float + ROMDiv>(a: F, b: F) -> F {
+    if a.is_not_finite() {
+        if b.is_not_finite() {
+            // inf/NaN / inf/NaN = NaN
+            return F::NAN;
+        }
+
+        // inf / [-]0 is [-]inf, not NaN, so zero divisors need no special
+        // case here: the sign test below also covers -0.
+        return if b.is_sign_negative() {
+            // [+/-]inf/NaN / (-X) = [-/+]inf/NaN
+            a.negate()
+        } else {
+            // [-]inf/NaN / X = [-]inf/NaN
+            a
+        };
+    }
+
+    if b.is_nan() {
+        // X / NaN = NaN
+        return b;
+    }
+
+    // ROM handles X / 0 = [-]inf and X / [-]inf = [-]0, so we only
+    // need to catch 0 / 0
+    if b.is_zero() && a.is_zero() {
+        return F::NAN;
+    }
+
+    a.rom_div(b)
+}
+
+intrinsics! {
+    #[alias = __divsf3vfp]
+    #[aeabi = __aeabi_fdiv]
+    extern "C" fn __divsf3(a: f32, b: f32) -> f32 {
+        div(a, b)
+    }
+
+    #[bootrom_v2]
+    #[alias = __divdf3vfp]
+    #[aeabi = __aeabi_ddiv]
+    extern "C" fn __divdf3(a: f64, b: f64) -> f64 {
+        div(a, b)
+    }
+}
--- a/embassy-rp/src/float/functions.rs
+++ b/embassy-rp/src/float/functions.rs
@@ -0,0 +1,239 @@
+// Credit: taken from `rp-hal` (also licensed Apache+MIT)
+// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/functions.rs
+
+use crate::float::{Float, Int};
+use crate::rom_data;
+
+trait ROMFunctions {
+    fn sqrt(self) -> Self;
+    fn ln(self) -> Self;
+    fn exp(self) -> Self;
+    fn sin(self) -> Self;
+    fn cos(self) -> Self;
+    fn tan(self) -> Self;
+    fn atan2(self, y: Self) -> Self;
+
+    fn to_trig_range(self) -> Self;
+}
+
+impl ROMFunctions for f32 {
+    fn sqrt(self) -> Self {
+        rom_data::float_funcs::fsqrt(self)
+    }
+
+    fn ln(self) -> Self {
+        rom_data::float_funcs::fln(self)
+    }
+
+    fn exp(self) -> Self {
+        rom_data::float_funcs::fexp(self)
+    }
+
+    fn sin(self) -> Self {
+        rom_data::float_funcs::fsin(self)
+    }
+
+    fn cos(self) -> Self {
+        rom_data::float_funcs::fcos(self)
+    }
+
+    fn tan(self) -> Self {
+        rom_data::float_funcs::ftan(self)
+    }
+
+    fn atan2(self, y: Self) -> Self {
+        rom_data::float_funcs::fatan2(self, y)
+    }
+
+    fn to_trig_range(self) -> Self {
+        // -128 < X < 128, logic from the Pico SDK
+        let exponent = (self.repr() & Self::EXPONENT_MASK) >> Self::SIGNIFICAND_BITS;
+        if exponent < 134 {
+            self
+        } else {
+            self % (core::f32::consts::PI * 2.0)
+        }
+    }
+}
+
+impl ROMFunctions for f64 {
+    fn sqrt(self) -> Self {
+        rom_data::double_funcs::dsqrt(self)
+    }
+
+    fn ln(self) -> Self {
+        rom_data::double_funcs::dln(self)
+    }
+
+    fn exp(self) -> Self {
+        rom_data::double_funcs::dexp(self)
+    }
+
+    fn sin(self) -> Self {
+        rom_data::double_funcs::dsin(self)
+    }
+
+    fn cos(self) -> Self {
+        rom_data::double_funcs::dcos(self)
+    }
+    fn tan(self) -> Self {
+        rom_data::double_funcs::dtan(self)
+    }
+
+    fn atan2(self, y: Self) -> Self {
+        rom_data::double_funcs::datan2(self, y)
+    }
+
+    fn to_trig_range(self) -> Self {
+        // -1024 < X < 1024, logic from the Pico SDK
+        let exponent = (self.repr() & Self::EXPONENT_MASK) >> Self::SIGNIFICAND_BITS;
+        if exponent < 1033 {
+            self
+        } else {
+            self % (core::f64::consts::PI * 2.0)
+        }
+    }
+}
+
+fn is_negative_nonzero_or_nan<F: Float>(f: F) -> bool {
+    let repr = f.repr();
+    if (repr & F::SIGN_MASK) != F::Int::ZERO {
+        // Negative, so anything other than exactly zero
+        return (repr & (!F::SIGN_MASK)) != F::Int::ZERO;
+    }
+    // NaN
+    (repr & (F::EXPONENT_MASK | F::SIGNIFICAND_MASK)) > F::EXPONENT_MASK
+}
+
+fn sqrt<F: Float + ROMFunctions>(f: F) -> F {
+    if is_negative_nonzero_or_nan(f) {
+        F::NAN
+    } else {
+        f.sqrt()
+    }
+}
+
+fn ln<F: Float + ROMFunctions>(f: F) -> F {
+    if is_negative_nonzero_or_nan(f) {
+        F::NAN
+    } else {
+        f.ln()
+    }
+}
+
+fn exp<F: Float + ROMFunctions>(f: F) -> F {
+    if f.is_nan() {
+        F::NAN
+    } else {
+        f.exp()
+    }
+}
+
+fn sin<F: Float + ROMFunctions>(f: F) -> F {
+    if f.is_not_finite() {
+        F::NAN
+    } else {
+        f.to_trig_range().sin()
+    }
+}
+
+fn cos<F: Float + ROMFunctions>(f: F) -> F {
+    if f.is_not_finite() {
+        F::NAN
+    } else {
+        f.to_trig_range().cos()
+    }
+}
+
+fn tan<F: Float + ROMFunctions>(f: F) -> F {
+    if f.is_not_finite() {
+        F::NAN
+    } else {
+        f.to_trig_range().tan()
+    }
+}
+
+fn atan2<F: Float + ROMFunctions>(x: F, y: F) -> F {
+    if x.is_nan() || y.is_nan() {
+        F::NAN
+    } else {
+        x.to_trig_range().atan2(y)
+    }
+}
+
+// Name collisions
+mod intrinsics {
+    intrinsics! {
+        extern "C" fn sqrtf(f: f32) -> f32 {
+            super::sqrt(f)
+        }
+
+        #[bootrom_v2]
+        extern "C" fn sqrt(f: f64) -> f64 {
+            super::sqrt(f)
+        }
+
+        extern "C" fn logf(f: f32) -> f32 {
+            super::ln(f)
+        }
+
+        #[bootrom_v2]
+        extern "C" fn log(f: f64) -> f64 {
+            super::ln(f)
+        }
+
+        extern "C" fn expf(f: f32) -> f32 {
+            super::exp(f)
+        }
+
+        #[bootrom_v2]
+        extern "C" fn exp(f: f64) -> f64 {
+            super::exp(f)
+        }
+
+        #[slower_than_default]
+        extern "C" fn sinf(f: f32) -> f32 {
+            super::sin(f)
+        }
+
+        #[slower_than_default]
+        #[bootrom_v2]
+        extern "C" fn sin(f: f64) -> f64 {
+            super::sin(f)
+        }
+
+        #[slower_than_default]
+        extern "C" fn cosf(f: f32) -> f32 {
+            super::cos(f)
+        }
+
+        #[slower_than_default]
+        #[bootrom_v2]
+        extern "C" fn cos(f: f64) -> f64 {
+            super::cos(f)
+        }
+
+        #[slower_than_default]
+        extern "C" fn tanf(f: f32) -> f32 {
+            super::tan(f)
+        }
+
+        #[slower_than_default]
+        #[bootrom_v2]
+        extern "C" fn tan(f: f64) -> f64 {
+            super::tan(f)
+        }
+
+        // Questionable gain
+        #[bootrom_v2]
+        extern "C" fn atan2f(a: f32, b: f32) -> f32 {
+            super::atan2(a, b)
+        }
+
+        // Questionable gain
+        #[bootrom_v2]
+        extern "C" fn atan2(a: f64, b: f64) -> f64 {
+            super::atan2(a, b)
+        }
+    }
+}
--- a/embassy-rp/src/float/mod.rs
+++ b/embassy-rp/src/float/mod.rs
@@ -0,0 +1,149 @@
+// Credit: taken from `rp-hal` (also licensed Apache+MIT)
+// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/mod.rs
+
+use core::ops;
+
+// Borrowed and simplified from compiler-builtins so we can use bit ops
+// on floating point without macro soup.
+pub(crate) trait Int:
+    Copy
+    + core::fmt::Debug
+    + PartialEq
+    + PartialOrd
+    + ops::AddAssign
+    + ops::SubAssign
+    + ops::BitAndAssign
+    + ops::BitOrAssign
+    + ops::BitXorAssign
+    + ops::ShlAssign<i32>
+    + ops::ShrAssign<u32>
+    + ops::Add<Output = Self>
+    + ops::Sub<Output = Self>
+    + ops::Div<Output = Self>
+    + ops::Shl<u32, Output = Self>
+    + ops::Shr<u32, Output = Self>
+    + ops::BitOr<Output = Self>
+    + ops::BitXor<Output = Self>
+    + ops::BitAnd<Output = Self>
+    + ops::Not<Output = Self>
+{
+    const ZERO: Self;
+}
+
+macro_rules! int_impl {
+    ($ty:ty) => {
+        impl Int for $ty {
+            const ZERO: Self = 0;
+        }
+    };
+}
+
+int_impl!(u32);
+int_impl!(u64);
+
+pub(crate) trait Float:
+    Copy
+    + core::fmt::Debug
+    + PartialEq
+    + PartialOrd
+    + ops::AddAssign
+    + ops::MulAssign
+    + ops::Add<Output = Self>
+    + ops::Sub<Output = Self>
+    + ops::Div<Output = Self>
+    + ops::Rem<Output = Self>
+{
+    /// A uint of the same width as the float
+    type Int: Int;
+
+    /// NaN representation for the float
+    const NAN: Self;
+
+    /// The bitwidth of the float type
+    const BITS: u32;
+
+    /// The bitwidth of the significand
+    const SIGNIFICAND_BITS: u32;
+
+    /// A mask for the sign bit
+    const SIGN_MASK: Self::Int;
+
+    /// A mask for the significand
+    const SIGNIFICAND_MASK: Self::Int;
+
+    /// A mask for the exponent
+    const EXPONENT_MASK: Self::Int;
+
+    /// Returns `self` transmuted to `Self::Int`
+    fn repr(self) -> Self::Int;
+
+    /// Returns a `Self::Int` transmuted back to `Self`
+    fn from_repr(a: Self::Int) -> Self;
+
+    /// Return a sign swapped `self`
+    fn negate(self) -> Self;
+
+    /// Returns true if `self` is either NaN or infinity
+    fn is_not_finite(self) -> bool {
+        (self.repr() & Self::EXPONENT_MASK) == Self::EXPONENT_MASK
+    }
+
+    /// Returns true if `self` is infinity
+    fn is_infinity(self) -> bool {
+        (self.repr() & (Self::EXPONENT_MASK | Self::SIGNIFICAND_MASK)) == Self::EXPONENT_MASK
+    }
+
+    /// Returns true if `self` is NaN
+    fn is_nan(self) -> bool {
+        (self.repr() & (Self::EXPONENT_MASK | Self::SIGNIFICAND_MASK)) > Self::EXPONENT_MASK
+    }
+
+    /// Returns true if `self` is negative
+    fn is_sign_negative(self) -> bool {
+        (self.repr() & Self::SIGN_MASK) != Self::Int::ZERO
+    }
+
+    /// Returns true if `self` is zero (either sign)
+    fn is_zero(self) -> bool {
+        (self.repr() & (Self::SIGNIFICAND_MASK | Self::EXPONENT_MASK)) == Self::Int::ZERO
+    }
+}
+
+macro_rules! float_impl {
+    ($ty:ident, $ity:ident, $bits:expr, $significand_bits:expr) => {
+        impl Float for $ty {
+            type Int = $ity;
+
+            const NAN: Self = <$ty>::NAN;
+
+            const BITS: u32 = $bits;
+            const SIGNIFICAND_BITS: u32 = $significand_bits;
+
+            const SIGN_MASK: Self::Int = 1 << (Self::BITS - 1);
+            const SIGNIFICAND_MASK: Self::Int = (1 << Self::SIGNIFICAND_BITS) - 1;
+            const EXPONENT_MASK: Self::Int = !(Self::SIGN_MASK | Self::SIGNIFICAND_MASK);
+
+            fn repr(self) -> Self::Int {
+                self.to_bits()
+            }
+
+            fn from_repr(a: Self::Int) -> Self {
+                Self::from_bits(a)
+            }
+
+            fn negate(self) -> Self {
+                -self
+            }
+        }
+    };
+}
+
+float_impl!(f32, u32, 32, 23);
+float_impl!(f64, u64, 64, 52);
+
+mod add_sub;
+mod cmp;
+mod conv;
+mod div;
+mod functions;
+mod mul;
--- a/embassy-rp/src/float/mul.rs
+++ b/embassy-rp/src/float/mul.rs
@@ -0,0 +1,70 @@
+// Credit: taken from `rp-hal` (also licensed Apache+MIT)
+// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/mul.rs
+
+use super::Float;
+use crate::rom_data;
+
+trait ROMMul {
+    fn rom_mul(self, b: Self) -> Self;
+}
+
+impl ROMMul for f32 {
+    fn rom_mul(self, b: Self) -> Self {
+        rom_data::float_funcs::fmul(self, b)
+    }
+}
+
+impl ROMMul for f64 {
+    fn rom_mul(self, b: Self) -> Self {
+        rom_data::double_funcs::dmul(self, b)
+    }
+}
+
+/// Multiplies `a` by `b`, settling inf/NaN operands before the ROM routine.
+///
+/// NaN operands and `0 * inf` give NaN.  Any other product with an infinite
+/// operand is that infinity, with its sign flipped when the other operand
+/// is negative.  Two finite operands go to the ROM multiply.
+fn mul<F: Float + ROMMul>(a: F, b: F) -> F {
+    if a.is_nan() || b.is_nan() {
+        // NaN * X = X * NaN = NaN
+        return F::NAN;
+    }
+
+    if a.is_not_finite() {
+        if b.is_zero() {
+            // [-]inf * 0 = NaN
+            return F::NAN;
+        }
+        // [+/-]inf * (-X) = [-/+]inf, [-]inf * X = [-]inf
+        return if b.is_sign_negative() { a.negate() } else { a };
+    }
+
+    if b.is_not_finite() {
+        if a.is_zero() {
+            // 0 * [-]inf = NaN
+            return F::NAN;
+        }
+        // (-X) * [+/-]inf = [-/+]inf, X * [-]inf = [-]inf: the sign of the
+        // finite operand `a` decides whether the infinity is flipped.
+        return if a.is_sign_negative() { b.negate() } else { b };
+    }
+
+    // Both operands are finite.
+    a.rom_mul(b)
+}
+
+intrinsics! {
+    #[alias = __mulsf3vfp]
+    #[aeabi = __aeabi_fmul]
+    extern "C" fn __mulsf3(a: f32, b: f32) -> f32 {
+        mul(a, b)
+    }
+
+    #[bootrom_v2]
+    #[alias = __muldf3vfp]
+    #[aeabi = __aeabi_dmul]
+    extern "C" fn __muldf3(a: f64, b: f64) -> f64 {
+        mul(a, b)
+    }
+}
--- a/embassy-rp/src/lib.rs
+++ b/embassy-rp/src/lib.rs
@@ -12,6 +12,7 @@ mod intrinsics;

 pub mod adc;
 pub mod dma;
+mod float;
 pub mod gpio;
 pub mod i2c;
 pub mod interrupt;
--- a/embassy-rp/src/rom_data.rs
+++ b/embassy-rp/src/rom_data.rs
@@ -56,50 +56,11 @@ macro_rules! declare_rom_function {
        fn $name:ident( $($argname:ident: $ty:ty),* ) -> $ret:ty
        $lookup:block
    ) => {
-        #[doc = r"Additional access for the `"]
-        #[doc = stringify!($name)]
-        #[doc = r"` ROM function."]
-        pub mod $name {
-            /// Retrieve a function pointer.
-            #[cfg(not(feature = "rom-func-cache"))]
-            pub fn ptr() -> extern "C" fn( $($argname: $ty),* ) -> $ret {
-                let p: *const u32 = $lookup;
-                unsafe {
-                    let func : extern "C" fn( $($argname: $ty),* ) -> $ret = core::mem::transmute(p);
-                    func
-                }
-            }
-
-            /// Retrieve a function pointer.
-            #[cfg(feature = "rom-func-cache")]
-            pub fn ptr() -> extern "C" fn( $($argname: $ty),* ) -> $ret {
-                use core::sync::atomic::{AtomicU16, Ordering};
-
-                // All pointers in the ROM fit in 16 bits, so we don't need a
-                // full width word to store the cached value.
-                static CACHED_PTR: AtomicU16 = AtomicU16::new(0);
-                // This is safe because the lookup will always resolve
-                // to the same value.  So even if an interrupt or another
-                // core starts at the same time, it just repeats some
-                // work and eventually writes back the correct value.
-                let p: *const u32 = match CACHED_PTR.load(Ordering::Relaxed) {
-                    0 => {
-                        let raw: *const u32 = $lookup;
-                        CACHED_PTR.store(raw as u16, Ordering::Relaxed);
-                        raw
-                    },
-                    val => val as *const u32,
-                };
-                unsafe {
-                    let func : extern "C" fn( $($argname: $ty),* ) -> $ret = core::mem::transmute(p);
-                    func
-                }
-            }
-        }
-
+        declare_rom_function!{
+            __internal ,
            $(#[$outer])*
-        pub extern "C" fn $name( $($argname: $ty),* ) -> $ret {
-            $name::ptr()($($argname),*)
+            fn $name( $($argname: $ty),* ) -> $ret
+            $lookup
        }
    };

@@ -107,6 +68,21 @@ macro_rules! declare_rom_function {
        $(#[$outer:meta])*
        unsafe fn $name:ident( $($argname:ident: $ty:ty),* ) -> $ret:ty
        $lookup:block
+    ) => {
+        declare_rom_function!{
+            __internal unsafe ,
+            $(#[$outer])*
+            fn $name( $($argname: $ty),* ) -> $ret
+            $lookup
+        }
+    };
+
+    (
+        __internal
+        $( $maybe_unsafe:ident )? ,
+        $(#[$outer:meta])*
+        fn $name:ident( $($argname:ident: $ty:ty),* ) -> $ret:ty
+        $lookup:block
    ) => {
        #[doc = r"Additional access for the `"]
        #[doc = stringify!($name)]
@@ -114,43 +90,58 @@ macro_rules! declare_rom_function {
        pub mod $name {
            /// Retrieve a function pointer.
            #[cfg(not(feature = "rom-func-cache"))]
-            pub fn ptr() -> unsafe extern "C" fn( $($argname: $ty),* ) -> $ret {
+            pub fn ptr() -> $( $maybe_unsafe )? extern "C" fn( $($argname: $ty),* ) -> $ret {
                let p: *const u32 = $lookup;
                unsafe {
-                    let func : unsafe extern "C" fn( $($argname: $ty),* ) -> $ret = core::mem::transmute(p);
+                    let func : $( $maybe_unsafe )? extern "C" fn( $($argname: $ty),* ) -> $ret
+                        = core::mem::transmute(p);
                    func
                }
            }

+            #[cfg(feature = "rom-func-cache")]
+            // unlike rp2040-hal we store a full word, containing the full function pointer.
+            // rp2040-hal saves two bytes by storing only the rom offset, at the cost of
+            // having to do an indirection and an atomic operation on every rom call.
+            static mut CACHE: $( $maybe_unsafe )? extern "C" fn( $($argname: $ty),* ) -> $ret
+                = trampoline;
+
+            #[cfg(feature = "rom-func-cache")]
+            $( $maybe_unsafe )? extern "C" fn trampoline( $($argname: $ty),* ) -> $ret {
+                use core::sync::atomic::{compiler_fence, Ordering};
+
+                let p: *const u32 = $lookup;
+                #[allow(unused_unsafe)]
+                unsafe {
+                    CACHE = core::mem::transmute(p);
+                    compiler_fence(Ordering::Release);
+                    CACHE($($argname),*)
+                }
+            }
+
            /// Retrieve a function pointer.
            #[cfg(feature = "rom-func-cache")]
-            pub fn ptr() -> unsafe extern "C" fn( $($argname: $ty),* ) -> $ret {
-                use core::sync::atomic::{AtomicU16, Ordering};
+            pub fn ptr() -> $( $maybe_unsafe )? extern "C" fn( $($argname: $ty),* ) -> $ret {
+                use core::sync::atomic::{compiler_fence, Ordering};

-                // All pointers in the ROM fit in 16 bits, so we don't need a
-                // full width word to store the cached value.
-                static CACHED_PTR: AtomicU16 = AtomicU16::new(0);
                // This is safe because the lookup will always resolve
                // to the same value.  So even if an interrupt or another
                // core starts at the same time, it just repeats some
                // work and eventually writes back the correct value.
-                let p: *const u32 = match CACHED_PTR.load(Ordering::Relaxed) {
-                    0 => {
-                        let raw: *const u32 = $lookup;
-                        CACHED_PTR.store(raw as u16, Ordering::Relaxed);
-                        raw
-                    },
-                    val => val as *const u32,
-                };
+                //
+                // We easily get away with using only compiler fences here
+                // because RP2040 SRAM is not cached. If it were we'd need
+                // to make sure updates propagate quickly, or just take the
+                // hit and let each core resolve every function once.
+                compiler_fence(Ordering::Acquire);
                unsafe {
-                    let func : unsafe extern "C" fn( $($argname: $ty),* ) -> $ret = core::mem::transmute(p);
-                    func
+                    CACHE
                }
            }
        }

        $(#[$outer])*
-        pub unsafe extern "C" fn $name( $($argname: $ty),* ) -> $ret {
+        pub $( $maybe_unsafe )? extern "C" fn $name( $($argname: $ty),* ) -> $ret {
            $name::ptr()($($argname),*)
        }
    };
@@ -369,6 +360,7 @@ pub fn fplib_start() -> *const u8 {
 }

 /// See Table 180 in the RP2040 datasheet for the contents of this table.
+#[cfg_attr(feature = "rom-func-cache", inline(never))]
 pub fn soft_float_table() -> *const usize {
    rom_table_lookup(DATA_TABLE, *b"SF")
 }
@@ -379,6 +371,7 @@ pub fn fplib_end() -> *const u8 {
 }

 /// This entry is only present in the V2 bootrom. See Table 182 in the RP2040 datasheet for the contents of this table.
+#[cfg_attr(feature = "rom-func-cache", inline(never))]
 pub fn soft_double_table() -> *const usize {
    if rom_version_number() < 2 {
        panic!(
--- a/embassy-stm32/src/i2c/v2.rs
+++ b/embassy-stm32/src/i2c/v2.rs
@@ -1,6 +1,5 @@
 use core::cmp;
 use core::future::poll_fn;
-use core::sync::atomic::{AtomicUsize, Ordering};
 use core::task::Poll;

 use embassy_embedded_hal::SetConfig;
@@ -35,14 +34,12 @@ impl Default for Config {

 pub struct State {
    waker: AtomicWaker,
-    chunks_transferred: AtomicUsize,
 }

 impl State {
    pub(crate) const fn new() -> Self {
        Self {
            waker: AtomicWaker::new(),
-            chunks_transferred: AtomicUsize::new(0),
        }
    }
 }
@@ -130,10 +127,7 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> {
        let isr = regs.isr().read();

        if isr.tcr() || isr.tc() {
-            let state = T::state();
-            let transferred = state.chunks_transferred.load(Ordering::Relaxed);
-            state.chunks_transferred.store(transferred + 1, Ordering::Relaxed);
-            state.waker.wake();
+            T::state().waker.wake();
        }
        // The flag can only be cleared by writting to nbytes, we won't do that here, so disable
        // the interrupt
@@ -457,12 +451,6 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> {
        TXDMA: crate::i2c::TxDma<T>,
    {
        let total_len = write.len();
-        let completed_chunks = total_len / 255;
-        let total_chunks = if completed_chunks * 255 == total_len {
-            completed_chunks
-        } else {
-            completed_chunks + 1
-        };

        let dma_transfer = unsafe {
            let regs = T::regs();
@@ -480,7 +468,6 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> {
        };

        let state = T::state();
-        state.chunks_transferred.store(0, Ordering::Relaxed);
        let mut remaining_len = total_len;

        let on_drop = OnDrop::new(|| {
@@ -495,6 +482,11 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> {
            }
        });

+        poll_fn(|cx| {
+            state.waker.register(cx.waker());
+
+            let isr = unsafe { T::regs().isr().read() };
+            if remaining_len == total_len {
                // NOTE(unsafe) self.tx_dma does not fiddle with the i2c registers
                if first_slice {
                    unsafe {
@@ -502,26 +494,23 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> {
                            address,
                            total_len.min(255),
                            Stop::Software,
-                    (total_chunks != 1) || !last_slice,
+                            (total_len > 255) || !last_slice,
                            &check_timeout,
                        )?;
                    }
                } else {
                    unsafe {
-                Self::master_continue(total_len.min(255), (total_chunks != 1) || !last_slice, &check_timeout)?;
+                        Self::master_continue(total_len.min(255), (total_len > 255) || !last_slice, &check_timeout)?;
                        T::regs().cr1().modify(|w| w.set_tcie(true));
                    }
                }
-
-        poll_fn(|cx| {
-            state.waker.register(cx.waker());
-            let chunks_transferred = state.chunks_transferred.load(Ordering::Relaxed);
-
-            if chunks_transferred == total_chunks {
+            } else if !(isr.tcr() || isr.tc()) {
+                // poll_fn was woken without an interrupt present
+                return Poll::Pending;
+            } else if remaining_len == 0 {
                return Poll::Ready(Ok(()));
-            } else if chunks_transferred != 0 {
-                remaining_len = remaining_len.saturating_sub(255);
-                let last_piece = (chunks_transferred + 1 == total_chunks) && last_slice;
+            } else {
+                let last_piece = (remaining_len <= 255) && last_slice;

                // NOTE(unsafe) self.tx_dma does not fiddle with the i2c registers
                unsafe {
@@ -531,6 +520,8 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> {
                    T::regs().cr1().modify(|w| w.set_tcie(true));
                }
            }
+
+            remaining_len = remaining_len.saturating_sub(255);
            Poll::Pending
        })
        .await?;
@@ -559,12 +550,6 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> {
        RXDMA: crate::i2c::RxDma<T>,
    {
        let total_len = buffer.len();
-        let completed_chunks = total_len / 255;
-        let total_chunks = if completed_chunks * 255 == total_len {
-            completed_chunks
-        } else {
-            completed_chunks + 1
-        };

        let dma_transfer = unsafe {
            let regs = T::regs();
@@ -580,7 +565,6 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> {
        };

        let state = T::state();
-        state.chunks_transferred.store(0, Ordering::Relaxed);
        let mut remaining_len = total_len;

        let on_drop = OnDrop::new(|| {
@@ -593,27 +577,29 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> {
            }
        });

+        poll_fn(|cx| {
+            state.waker.register(cx.waker());
+
+            let isr = unsafe { T::regs().isr().read() };
+            if remaining_len == total_len {
                // NOTE(unsafe) self.rx_dma does not fiddle with the i2c registers
                unsafe {
                    Self::master_read(
                        address,
                        total_len.min(255),
                        Stop::Software,
-                total_chunks != 1,
+                        total_len > 255,
                        restart,
                        &check_timeout,
                    )?;
                }
-
-        poll_fn(|cx| {
-            state.waker.register(cx.waker());
-            let chunks_transferred = state.chunks_transferred.load(Ordering::Relaxed);
-
-            if chunks_transferred == total_chunks {
+            } else if !(isr.tcr() || isr.tc()) {
+                // poll_fn was woken without an interrupt present
+                return Poll::Pending;
+            } else if remaining_len == 0 {
                return Poll::Ready(Ok(()));
-            } else if chunks_transferred != 0 {
-                remaining_len = remaining_len.saturating_sub(255);
-                let last_piece = chunks_transferred + 1 == total_chunks;
+            } else {
+                let last_piece = remaining_len <= 255;

                // NOTE(unsafe) self.rx_dma does not fiddle with the i2c registers
                unsafe {
@@ -623,6 +609,8 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> {
                    T::regs().cr1().modify(|w| w.set_tcie(true));
                }
            }
+
+            remaining_len = remaining_len.saturating_sub(255);
            Poll::Pending
        })
        .await?;
--- a/tests/rp/.cargo/config.toml
+++ b/tests/rp/.cargo/config.toml
@@ -1,6 +1,8 @@
 [unstable]
-build-std = ["core"]
-build-std-features = ["panic_immediate_abort"]
+# enabling these breaks the float tests during linking, with intrinsics
+# duplicated between embassy-rp and compiler_builtins
+#build-std = ["core"]
+#build-std-features = ["panic_immediate_abort"]

 [target.'cfg(all(target_arch = "arm", target_os = "none"))']
 #runner = "teleprobe client run --target rpi-pico --elf"
--- a/tests/rp/Cargo.toml
+++ b/tests/rp/Cargo.toml
@@ -8,7 +8,7 @@ license = "MIT OR Apache-2.0"
 embassy-sync = { version = "0.2.0", path = "../../embassy-sync", features = ["defmt"] }
 embassy-executor = { version = "0.1.0", path = "../../embassy-executor", features = ["arch-cortex-m", "executor-thread", "defmt", "integrated-timers"] }
 embassy-time = { version = "0.1.0", path = "../../embassy-time", features = ["defmt"] }
-embassy-rp = { version = "0.1.0", path = "../../embassy-rp", features = ["nightly", "defmt", "unstable-pac", "unstable-traits", "time-driver", "critical-section-impl"]  }
+embassy-rp = { version = "0.1.0", path = "../../embassy-rp", features = ["nightly", "defmt", "unstable-pac", "unstable-traits", "time-driver", "critical-section-impl", "intrinsics", "rom-v2-intrinsics"]  }
 embassy-futures = { version = "0.1.0", path = "../../embassy-futures" }

 defmt = "0.3.0"
--- a/tests/rp/src/bin/float.rs
+++ b/tests/rp/src/bin/float.rs
@@ -0,0 +1,53 @@
+#![no_std]
+#![no_main]
+#![feature(type_alias_impl_trait)]
+
+use defmt::*;
+use embassy_executor::Spawner;
+use embassy_rp::pac;
+use embassy_time::{Duration, Timer};
+use {defmt_rtt as _, panic_probe as _};
+
+#[embassy_executor::main]
+async fn main(_spawner: Spawner) {
+    embassy_rp::init(Default::default());
+    info!("Hello World!");
+
+    const PI_F: f32 = 3.1415926535f32;
+    const PI_D: f64 = 3.14159265358979323846f64;
+
+    unsafe {
+        pac::BUSCTRL
+            .perfsel(0)
+            .write(|r| r.set_perfsel(pac::busctrl::vals::Perfsel::ROM));
+    }
+
+    for i in 0..=360 {
+        let rad_f = (i as f32) * PI_F / 180.0;
+        info!(
+            "{}° float: {=f32} / {=f32} / {=f32} / {=f32}",
+            i,
+            rad_f,
+            rad_f - PI_F,
+            rad_f + PI_F,
+            rad_f % PI_F
+        );
+        let rad_d = (i as f64) * PI_D / 180.0;
+        info!(
+            "{}° double: {=f64} / {=f64} / {=f64} / {=f64}",
+            i,
+            rad_d,
+            rad_d - PI_D,
+            rad_d + PI_D,
+            rad_d % PI_D
+        );
+        Timer::after(Duration::from_millis(10)).await;
+    }
+
+    let rom_accesses = unsafe { pac::BUSCTRL.perfctr(0).read().perfctr() };
+    // every float operation used here uses at least 10 cycles
+    defmt::assert!(rom_accesses >= 360 * 12 * 10);
+
+    info!("Test OK");
+    cortex_m::asm::bkpt();
+}