From f5216624bb228b5b70f97f0e7ebc40ea4d7e1a27 Mon Sep 17 00:00:00 2001 From: xoviat Date: Mon, 17 Apr 2023 15:24:24 -0500 Subject: [PATCH 1/6] stm32/i2c: fix races when using dma. fixes #1341. --- embassy-stm32/src/i2c/v2.rs | 103 ++++++++++++++---------------------- 1 file changed, 41 insertions(+), 62 deletions(-) diff --git a/embassy-stm32/src/i2c/v2.rs b/embassy-stm32/src/i2c/v2.rs index 7218f770..44237b89 100644 --- a/embassy-stm32/src/i2c/v2.rs +++ b/embassy-stm32/src/i2c/v2.rs @@ -1,6 +1,5 @@ use core::cmp; use core::future::poll_fn; -use core::sync::atomic::{AtomicUsize, Ordering}; use core::task::Poll; use embassy_embedded_hal::SetConfig; @@ -35,14 +34,12 @@ impl Default for Config { pub struct State { waker: AtomicWaker, - chunks_transferred: AtomicUsize, } impl State { pub(crate) const fn new() -> Self { Self { waker: AtomicWaker::new(), - chunks_transferred: AtomicUsize::new(0), } } } @@ -130,10 +127,7 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { let isr = regs.isr().read(); if isr.tcr() || isr.tc() { - let state = T::state(); - let transferred = state.chunks_transferred.load(Ordering::Relaxed); - state.chunks_transferred.store(transferred + 1, Ordering::Relaxed); - state.waker.wake(); + T::state().waker.wake(); } // The flag can only be cleared by writting to nbytes, we won't do that here, so disable // the interrupt @@ -457,12 +451,6 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { TXDMA: crate::i2c::TxDma, { let total_len = write.len(); - let completed_chunks = total_len / 255; - let total_chunks = if completed_chunks * 255 == total_len { - completed_chunks - } else { - completed_chunks + 1 - }; let dma_transfer = unsafe { let regs = T::regs(); @@ -480,7 +468,6 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { }; let state = T::state(); - state.chunks_transferred.store(0, Ordering::Relaxed); let mut remaining_len = total_len; let on_drop = OnDrop::new(|| { @@ -495,33 +482,31 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { } }); - // NOTE(unsafe) self.tx_dma does not fiddle with the i2c registers - if first_slice { - unsafe { - Self::master_write( - address, - total_len.min(255), - Stop::Software, - (total_chunks != 1) || !last_slice, - &check_timeout, - )?; - } - } else { - unsafe { - Self::master_continue(total_len.min(255), (total_chunks != 1) || !last_slice, &check_timeout)?; - T::regs().cr1().modify(|w| w.set_tcie(true)); - } - } - poll_fn(|cx| { state.waker.register(cx.waker()); - let chunks_transferred = state.chunks_transferred.load(Ordering::Relaxed); - if chunks_transferred == total_chunks { + if remaining_len == total_len { + // NOTE(unsafe) self.tx_dma does not fiddle with the i2c registers + if first_slice { + unsafe { + Self::master_write( + address, + total_len.min(255), + Stop::Software, + (total_len > 255) || !last_slice, + &check_timeout, + )?; + } + } else { + unsafe { + Self::master_continue(total_len.min(255), (total_len > 255) || !last_slice, &check_timeout)?; + T::regs().cr1().modify(|w| w.set_tcie(true)); + } + } + } else if remaining_len == 0 { return Poll::Ready(Ok(())); - } else if chunks_transferred != 0 { - remaining_len = remaining_len.saturating_sub(255); - let last_piece = (chunks_transferred + 1 == total_chunks) && last_slice; + } else { + let last_piece = (remaining_len <= 255) && last_slice; // NOTE(unsafe) self.tx_dma does not fiddle with the i2c registers unsafe { @@ -531,6 +516,8 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { T::regs().cr1().modify(|w| w.set_tcie(true)); } } + + remaining_len = remaining_len.saturating_sub(255); Poll::Pending }) .await?; @@ -559,12 +546,6 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { RXDMA: crate::i2c::RxDma, { let total_len = buffer.len(); - let completed_chunks = total_len / 255; - let total_chunks = if completed_chunks * 255 == total_len { - completed_chunks - } else { - completed_chunks + 1 - }; let dma_transfer = unsafe { let regs = T::regs(); @@ -580,7 +561,6 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { }; let state = T::state(); - state.chunks_transferred.store(0, Ordering::Relaxed); let mut remaining_len = total_len; let on_drop = OnDrop::new(|| { @@ -593,27 +573,24 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { } }); - // NOTE(unsafe) self.rx_dma does not fiddle with the i2c registers - unsafe { - Self::master_read( - address, - total_len.min(255), - Stop::Software, - total_chunks != 1, - restart, - &check_timeout, - )?; - } - poll_fn(|cx| { state.waker.register(cx.waker()); - let chunks_transferred = state.chunks_transferred.load(Ordering::Relaxed); - - if chunks_transferred == total_chunks { + if remaining_len == total_len { + // NOTE(unsafe) self.rx_dma does not fiddle with the i2c registers + unsafe { + Self::master_read( + address, + total_len.min(255), + Stop::Software, + total_len > 255, + restart, + &check_timeout, + )?; + } + } else if remaining_len == 0 { return Poll::Ready(Ok(())); - } else if chunks_transferred != 0 { - remaining_len = remaining_len.saturating_sub(255); - let last_piece = chunks_transferred + 1 == total_chunks; + } else { + let last_piece = remaining_len <= 255; // NOTE(unsafe) self.rx_dma does not fiddle with the i2c registers unsafe { @@ -623,6 +600,8 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { T::regs().cr1().modify(|w| w.set_tcie(true)); } } + + remaining_len = remaining_len.saturating_sub(255); Poll::Pending }) .await?; From fdd6e08ed6b5445a53c8cd0752f80f203c4860ac Mon Sep 17 00:00:00 2001 From: pennae Date: Wed, 19 Apr 2023 01:57:37 +0200 Subject: [PATCH 2/6] rp: hook up softfloat rom intrinsics rp-hal has done this very well already, so we'll just copy their entire impl again. only div.rs needed some massaging because our sio access works a little differently, everything else worked as is. --- embassy-rp/src/float/add_sub.rs | 92 ++++++++++++ embassy-rp/src/float/cmp.rs | 201 +++++++++++++++++++++++++ embassy-rp/src/float/conv.rs | 157 ++++++++++++++++++++ embassy-rp/src/float/div.rs | 141 ++++++++++++++++++ embassy-rp/src/float/functions.rs | 239 ++++++++++++++++++++++++++++++ embassy-rp/src/float/mod.rs | 149 +++++++++++++++++++ embassy-rp/src/float/mul.rs | 70 +++++++++ embassy-rp/src/lib.rs | 1 + tests/rp/.cargo/config.toml | 6 +- tests/rp/Cargo.toml | 2 +- tests/rp/src/bin/float.rs | 53 +++++++ 11 files changed, 1108 insertions(+), 3 deletions(-) create mode 100644 embassy-rp/src/float/add_sub.rs create mode 100644 embassy-rp/src/float/cmp.rs create mode 100644 embassy-rp/src/float/conv.rs create mode 100644 embassy-rp/src/float/div.rs create mode 100644 embassy-rp/src/float/functions.rs create mode 100644 embassy-rp/src/float/mod.rs create mode 100644 embassy-rp/src/float/mul.rs create mode 100644 tests/rp/src/bin/float.rs diff --git a/embassy-rp/src/float/add_sub.rs b/embassy-rp/src/float/add_sub.rs new file mode 100644 index 00000000..673544cf --- /dev/null +++ b/embassy-rp/src/float/add_sub.rs @@ -0,0 +1,92 @@ +// Credit: taken from `rp-hal` (also licensed Apache+MIT) +// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/add_sub.rs + +use super::{Float, Int}; +use crate::rom_data; + +trait ROMAdd { + fn rom_add(self, b: Self) -> Self; +} + +impl ROMAdd for f32 { + fn rom_add(self, b: Self) -> Self { + rom_data::float_funcs::fadd(self, b) + } +} + +impl ROMAdd for f64 { + fn rom_add(self, b: Self) -> Self { + rom_data::double_funcs::dadd(self, b) + } +} + +fn add(a: F, b: F) -> F { + if a.is_not_finite() { + if b.is_not_finite() { + let class_a = a.repr() & (F::SIGNIFICAND_MASK | F::SIGN_MASK); + let class_b = b.repr() & (F::SIGNIFICAND_MASK | F::SIGN_MASK); + + if class_a == F::Int::ZERO && class_b == F::Int::ZERO { + // inf + inf = inf + return a; + } + if class_a == F::SIGN_MASK && class_b == F::SIGN_MASK { + // -inf + (-inf) = -inf + return a; + } + + // Sign mismatch, or either is NaN already + return F::NAN; + } + + // [-]inf/NaN + X = [-]inf/NaN + return a; + } + + if b.is_not_finite() { + // X + [-]inf/NaN = [-]inf/NaN + return b; + } + + a.rom_add(b) +} + +intrinsics! { + #[alias = __addsf3vfp] + #[aeabi = __aeabi_fadd] + extern "C" fn __addsf3(a: f32, b: f32) -> f32 { + add(a, b) + } + + #[bootrom_v2] + #[alias = __adddf3vfp] + #[aeabi = __aeabi_dadd] + extern "C" fn __adddf3(a: f64, b: f64) -> f64 { + add(a, b) + } + + // The ROM just implements subtraction the same way, so just do it here + // and save the work of implementing more complicated NaN/inf handling. + + #[alias = __subsf3vfp] + #[aeabi = __aeabi_fsub] + extern "C" fn __subsf3(a: f32, b: f32) -> f32 { + add(a, -b) + } + + #[bootrom_v2] + #[alias = __subdf3vfp] + #[aeabi = __aeabi_dsub] + extern "C" fn __subdf3(a: f64, b: f64) -> f64 { + add(a, -b) + } + + extern "aapcs" fn __aeabi_frsub(a: f32, b: f32) -> f32 { + add(b, -a) + } + + #[bootrom_v2] + extern "aapcs" fn __aeabi_drsub(a: f64, b: f64) -> f64 { + add(b, -a) + } +} diff --git a/embassy-rp/src/float/cmp.rs b/embassy-rp/src/float/cmp.rs new file mode 100644 index 00000000..e540e391 --- /dev/null +++ b/embassy-rp/src/float/cmp.rs @@ -0,0 +1,201 @@ +// Credit: taken from `rp-hal` (also licensed Apache+MIT) +// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/cmp.rs + +use super::Float; +use crate::rom_data; + +trait ROMCmp { + fn rom_cmp(self, b: Self) -> i32; +} + +impl ROMCmp for f32 { + fn rom_cmp(self, b: Self) -> i32 { + rom_data::float_funcs::fcmp(self, b) + } +} + +impl ROMCmp for f64 { + fn rom_cmp(self, b: Self) -> i32 { + rom_data::double_funcs::dcmp(self, b) + } +} + +fn le_abi(a: F, b: F) -> i32 { + if a.is_nan() || b.is_nan() { + 1 + } else { + a.rom_cmp(b) + } +} + +fn ge_abi(a: F, b: F) -> i32 { + if a.is_nan() || b.is_nan() { + -1 + } else { + a.rom_cmp(b) + } +} + +intrinsics! { + #[slower_than_default] + #[bootrom_v2] + #[alias = __eqsf2, __ltsf2, __nesf2] + extern "C" fn __lesf2(a: f32, b: f32) -> i32 { + le_abi(a, b) + } + + #[slower_than_default] + #[bootrom_v2] + #[alias = __eqdf2, __ltdf2, __nedf2] + extern "C" fn __ledf2(a: f64, b: f64) -> i32 { + le_abi(a, b) + } + + #[slower_than_default] + #[bootrom_v2] + #[alias = __gtsf2] + extern "C" fn __gesf2(a: f32, b: f32) -> i32 { + ge_abi(a, b) + } + + #[slower_than_default] + #[bootrom_v2] + #[alias = __gtdf2] + extern "C" fn __gedf2(a: f64, b: f64) -> i32 { + ge_abi(a, b) + } + + + #[slower_than_default] + #[bootrom_v2] + extern "aapcs" fn __aeabi_fcmple(a: f32, b: f32) -> i32 { + (le_abi(a, b) <= 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "aapcs" fn __aeabi_fcmpge(a: f32, b: f32) -> i32 { + (ge_abi(a, b) >= 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "aapcs" fn __aeabi_fcmpeq(a: f32, b: f32) -> i32 { + (le_abi(a, b) == 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "aapcs" fn __aeabi_fcmplt(a: f32, b: f32) -> i32 { + (le_abi(a, b) < 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "aapcs" fn __aeabi_fcmpgt(a: f32, b: f32) -> i32 { + (ge_abi(a, b) > 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "aapcs" fn __aeabi_dcmple(a: f64, b: f64) -> i32 { + (le_abi(a, b) <= 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "aapcs" fn __aeabi_dcmpge(a: f64, b: f64) -> i32 { + (ge_abi(a, b) >= 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "aapcs" fn __aeabi_dcmpeq(a: f64, b: f64) -> i32 { + (le_abi(a, b) == 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "aapcs" fn __aeabi_dcmplt(a: f64, b: f64) -> i32 { + (le_abi(a, b) < 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "aapcs" fn __aeabi_dcmpgt(a: f64, b: f64) -> i32 { + (ge_abi(a, b) > 0) as i32 + } + + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __gesf2vfp(a: f32, b: f32) -> i32 { + (ge_abi(a, b) >= 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __gedf2vfp(a: f64, b: f64) -> i32 { + (ge_abi(a, b) >= 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __gtsf2vfp(a: f32, b: f32) -> i32 { + (ge_abi(a, b) > 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __gtdf2vfp(a: f64, b: f64) -> i32 { + (ge_abi(a, b) > 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __ltsf2vfp(a: f32, b: f32) -> i32 { + (le_abi(a, b) < 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __ltdf2vfp(a: f64, b: f64) -> i32 { + (le_abi(a, b) < 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __lesf2vfp(a: f32, b: f32) -> i32 { + (le_abi(a, b) <= 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __ledf2vfp(a: f64, b: f64) -> i32 { + (le_abi(a, b) <= 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __nesf2vfp(a: f32, b: f32) -> i32 { + (le_abi(a, b) != 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __nedf2vfp(a: f64, b: f64) -> i32 { + (le_abi(a, b) != 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __eqsf2vfp(a: f32, b: f32) -> i32 { + (le_abi(a, b) == 0) as i32 + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn __eqdf2vfp(a: f64, b: f64) -> i32 { + (le_abi(a, b) == 0) as i32 + } +} diff --git a/embassy-rp/src/float/conv.rs b/embassy-rp/src/float/conv.rs new file mode 100644 index 00000000..021826e2 --- /dev/null +++ b/embassy-rp/src/float/conv.rs @@ -0,0 +1,157 @@ +// Credit: taken from `rp-hal` (also licensed Apache+MIT) +// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/conv.rs + +use super::Float; +use crate::rom_data; + +// Some of these are also not connected in the Pico SDK. This is probably +// because the ROM version actually does a fixed point conversion, just with +// the fractional width set to zero. + +intrinsics! { + // Not connected in the Pico SDK + #[slower_than_default] + #[aeabi = __aeabi_i2f] + extern "C" fn __floatsisf(i: i32) -> f32 { + rom_data::float_funcs::int_to_float(i) + } + + // Not connected in the Pico SDK + #[slower_than_default] + #[aeabi = __aeabi_i2d] + extern "C" fn __floatsidf(i: i32) -> f64 { + rom_data::double_funcs::int_to_double(i) + } + + // Questionable gain + #[aeabi = __aeabi_l2f] + extern "C" fn __floatdisf(i: i64) -> f32 { + rom_data::float_funcs::int64_to_float(i) + } + + #[bootrom_v2] + #[aeabi = __aeabi_l2d] + extern "C" fn __floatdidf(i: i64) -> f64 { + rom_data::double_funcs::int64_to_double(i) + } + + // Not connected in the Pico SDK + #[slower_than_default] + #[aeabi = __aeabi_ui2f] + extern "C" fn __floatunsisf(i: u32) -> f32 { + rom_data::float_funcs::uint_to_float(i) + } + + // Questionable gain + #[bootrom_v2] + #[aeabi = __aeabi_ui2d] + extern "C" fn __floatunsidf(i: u32) -> f64 { + rom_data::double_funcs::uint_to_double(i) + } + + // Questionable gain + #[bootrom_v2] + #[aeabi = __aeabi_ul2f] + extern "C" fn __floatundisf(i: u64) -> f32 { + rom_data::float_funcs::uint64_to_float(i) + } + + #[bootrom_v2] + #[aeabi = __aeabi_ul2d] + extern "C" fn __floatundidf(i: u64) -> f64 { + rom_data::double_funcs::uint64_to_double(i) + } + + + // The Pico SDK does some optimization here (e.x. fast paths for zero and + // one), but we can just directly connect it. + #[aeabi = __aeabi_f2iz] + extern "C" fn __fixsfsi(f: f32) -> i32 { + rom_data::float_funcs::float_to_int(f) + } + + #[bootrom_v2] + #[aeabi = __aeabi_f2lz] + extern "C" fn __fixsfdi(f: f32) -> i64 { + rom_data::float_funcs::float_to_int64(f) + } + + // Not connected in the Pico SDK + #[slower_than_default] + #[bootrom_v2] + #[aeabi = __aeabi_d2iz] + extern "C" fn __fixdfsi(f: f64) -> i32 { + rom_data::double_funcs::double_to_int(f) + } + + // Like with the 32 bit version, there's optimization that we just + // skip. + #[bootrom_v2] + #[aeabi = __aeabi_d2lz] + extern "C" fn __fixdfdi(f: f64) -> i64 { + rom_data::double_funcs::double_to_int64(f) + } + + #[slower_than_default] + #[aeabi = __aeabi_f2uiz] + extern "C" fn __fixunssfsi(f: f32) -> u32 { + rom_data::float_funcs::float_to_uint(f) + } + + #[slower_than_default] + #[bootrom_v2] + #[aeabi = __aeabi_f2ulz] + extern "C" fn __fixunssfdi(f: f32) -> u64 { + rom_data::float_funcs::float_to_uint64(f) + } + + #[slower_than_default] + #[bootrom_v2] + #[aeabi = __aeabi_d2uiz] + extern "C" fn __fixunsdfsi(f: f64) -> u32 { + rom_data::double_funcs::double_to_uint(f) + } + + #[slower_than_default] + #[bootrom_v2] + #[aeabi = __aeabi_d2ulz] + extern "C" fn __fixunsdfdi(f: f64) -> u64 { + rom_data::double_funcs::double_to_uint64(f) + } + + #[bootrom_v2] + #[alias = __extendsfdf2vfp] + #[aeabi = __aeabi_f2d] + extern "C" fn __extendsfdf2(f: f32) -> f64 { + if f.is_not_finite() { + return f64::from_repr( + // Not finite + f64::EXPONENT_MASK | + // Preserve NaN or inf + ((f.repr() & f32::SIGNIFICAND_MASK) as u64) | + // Preserve sign + ((f.repr() & f32::SIGN_MASK) as u64) << (f64::BITS-f32::BITS) + ); + } + rom_data::float_funcs::float_to_double(f) + } + + #[bootrom_v2] + #[alias = __truncdfsf2vfp] + #[aeabi = __aeabi_d2f] + extern "C" fn __truncdfsf2(f: f64) -> f32 { + if f.is_not_finite() { + let mut repr: u32 = + // Not finite + f32::EXPONENT_MASK | + // Preserve sign + ((f.repr() & f64::SIGN_MASK) >> (f64::BITS-f32::BITS)) as u32; + // Set NaN + if (f.repr() & f64::SIGNIFICAND_MASK) != 0 { + repr |= 1; + } + return f32::from_repr(repr); + } + rom_data::double_funcs::double_to_float(f) + } +} diff --git a/embassy-rp/src/float/div.rs b/embassy-rp/src/float/div.rs new file mode 100644 index 00000000..094dec44 --- /dev/null +++ b/embassy-rp/src/float/div.rs @@ -0,0 +1,141 @@ +// Credit: taken from `rp-hal` (also licensed Apache+MIT) +// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/conv.rs + +use super::Float; +use crate::rom_data; + +// Make sure this stays as a separate call, because when it's inlined the +// compiler will move the save of the registers used to contain the divider +// state into the function prologue. That save and restore (push/pop) takes +// longer than the actual division, so doing it in the common case where +// they are not required wastes a lot of time. +#[inline(never)] +#[cold] +fn save_divider_and_call(f: F) -> R +where + F: FnOnce() -> R, +{ + let sio = rp_pac::SIO; + + unsafe { + // Since we can't save the signed-ness of the calculation, we have to make + // sure that there's at least an 8 cycle delay before we read the result. + // The Pico SDK ensures this by using a 6 cycle push and two 1 cycle reads. + // Since we can't be sure the Rust implementation will optimize to the same, + // just use an explicit wait. + while !sio.div().csr().read().ready() {} + + // Read the quotient last, since that's what clears the dirty flag + let dividend = sio.div().udividend().read(); + let divisor = sio.div().udivisor().read(); + let remainder = sio.div().remainder().read(); + let quotient = sio.div().quotient().read(); + + // If we get interrupted here (before a write sets the DIRTY flag) its fine, since + // we have the full state, so the interruptor doesn't have to restore it. Once the + // write happens and the DIRTY flag is set, the interruptor becomes responsible for + // restoring our state. + let result = f(); + + // If we are interrupted here, then the interruptor will start an incorrect calculation + // using a wrong divisor, but we'll restore the divisor and result ourselves correctly. + // This sets DIRTY, so any interruptor will save the state. + sio.div().udividend().write_value(dividend); + // If we are interrupted here, the the interruptor may start the calculation using + // incorrectly signed inputs, but we'll restore the result ourselves. + // This sets DIRTY, so any interruptor will save the state. + sio.div().udivisor().write_value(divisor); + // If we are interrupted here, the interruptor will have restored everything but the + // quotient may be wrongly signed. If the calculation started by the above writes is + // still ongoing it is stopped, so it won't replace the result we're restoring. + // DIRTY and READY set, but only DIRTY matters to make the interruptor save the state. + sio.div().remainder().write_value(remainder); + // State fully restored after the quotient write. This sets both DIRTY and READY, so + // whatever we may have interrupted can read the result. + sio.div().quotient().write_value(quotient); + + result + } +} + +fn save_divider(f: F) -> R +where + F: FnOnce() -> R, +{ + let sio = rp_pac::SIO; + if unsafe { !sio.div().csr().read().dirty() } { + // Not dirty, so nothing is waiting for the calculation. So we can just + // issue it directly without a save/restore. + f() + } else { + save_divider_and_call(f) + } +} + +trait ROMDiv { + fn rom_div(self, b: Self) -> Self; +} + +impl ROMDiv for f32 { + fn rom_div(self, b: Self) -> Self { + // ROM implementation uses the hardware divider, so we have to save it + save_divider(|| rom_data::float_funcs::fdiv(self, b)) + } +} + +impl ROMDiv for f64 { + fn rom_div(self, b: Self) -> Self { + // ROM implementation uses the hardware divider, so we have to save it + save_divider(|| rom_data::double_funcs::ddiv(self, b)) + } +} + +fn div(a: F, b: F) -> F { + if a.is_not_finite() { + if b.is_not_finite() { + // inf/NaN / inf/NaN = NaN + return F::NAN; + } + + if b.is_zero() { + // inf/NaN / 0 = NaN + return F::NAN; + } + + return if b.is_sign_negative() { + // [+/-]inf/NaN / (-X) = [-/+]inf/NaN + a.negate() + } else { + // [-]inf/NaN / X = [-]inf/NaN + a + }; + } + + if b.is_nan() { + // X / NaN = NaN + return b; + } + + // ROM handles X / 0 = [-]inf and X / [-]inf = [-]0, so we only + // need to catch 0 / 0 + if b.is_zero() && a.is_zero() { + return F::NAN; + } + + a.rom_div(b) +} + +intrinsics! { + #[alias = __divsf3vfp] + #[aeabi = __aeabi_fdiv] + extern "C" fn __divsf3(a: f32, b: f32) -> f32 { + div(a, b) + } + + #[bootrom_v2] + #[alias = __divdf3vfp] + #[aeabi = __aeabi_ddiv] + extern "C" fn __divdf3(a: f64, b: f64) -> f64 { + div(a, b) + } +} diff --git a/embassy-rp/src/float/functions.rs b/embassy-rp/src/float/functions.rs new file mode 100644 index 00000000..de29ce33 --- /dev/null +++ b/embassy-rp/src/float/functions.rs @@ -0,0 +1,239 @@ +// Credit: taken from `rp-hal` (also licensed Apache+MIT) +// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/functions.rs + +use crate::float::{Float, Int}; +use crate::rom_data; + +trait ROMFunctions { + fn sqrt(self) -> Self; + fn ln(self) -> Self; + fn exp(self) -> Self; + fn sin(self) -> Self; + fn cos(self) -> Self; + fn tan(self) -> Self; + fn atan2(self, y: Self) -> Self; + + fn to_trig_range(self) -> Self; +} + +impl ROMFunctions for f32 { + fn sqrt(self) -> Self { + rom_data::float_funcs::fsqrt(self) + } + + fn ln(self) -> Self { + rom_data::float_funcs::fln(self) + } + + fn exp(self) -> Self { + rom_data::float_funcs::fexp(self) + } + + fn sin(self) -> Self { + rom_data::float_funcs::fsin(self) + } + + fn cos(self) -> Self { + rom_data::float_funcs::fcos(self) + } + + fn tan(self) -> Self { + rom_data::float_funcs::ftan(self) + } + + fn atan2(self, y: Self) -> Self { + rom_data::float_funcs::fatan2(self, y) + } + + fn to_trig_range(self) -> Self { + // -128 < X < 128, logic from the Pico SDK + let exponent = (self.repr() & Self::EXPONENT_MASK) >> Self::SIGNIFICAND_BITS; + if exponent < 134 { + self + } else { + self % (core::f32::consts::PI * 2.0) + } + } +} + +impl ROMFunctions for f64 { + fn sqrt(self) -> Self { + rom_data::double_funcs::dsqrt(self) + } + + fn ln(self) -> Self { + rom_data::double_funcs::dln(self) + } + + fn exp(self) -> Self { + rom_data::double_funcs::dexp(self) + } + + fn sin(self) -> Self { + rom_data::double_funcs::dsin(self) + } + + fn cos(self) -> Self { + rom_data::double_funcs::dcos(self) + } + fn tan(self) -> Self { + rom_data::double_funcs::dtan(self) + } + + fn atan2(self, y: Self) -> Self { + rom_data::double_funcs::datan2(self, y) + } + + fn to_trig_range(self) -> Self { + // -1024 < X < 1024, logic from the Pico SDK + let exponent = (self.repr() & Self::EXPONENT_MASK) >> Self::SIGNIFICAND_BITS; + if exponent < 1033 { + self + } else { + self % (core::f64::consts::PI * 2.0) + } + } +} + +fn is_negative_nonzero_or_nan(f: F) -> bool { + let repr = f.repr(); + if (repr & F::SIGN_MASK) != F::Int::ZERO { + // Negative, so anything other than exactly zero + return (repr & (!F::SIGN_MASK)) != F::Int::ZERO; + } + // NaN + (repr & (F::EXPONENT_MASK | F::SIGNIFICAND_MASK)) > F::EXPONENT_MASK +} + +fn sqrt(f: F) -> F { + if is_negative_nonzero_or_nan(f) { + F::NAN + } else { + f.sqrt() + } +} + +fn ln(f: F) -> F { + if is_negative_nonzero_or_nan(f) { + F::NAN + } else { + f.ln() + } +} + +fn exp(f: F) -> F { + if f.is_nan() { + F::NAN + } else { + f.exp() + } +} + +fn sin(f: F) -> F { + if f.is_not_finite() { + F::NAN + } else { + f.to_trig_range().sin() + } +} + +fn cos(f: F) -> F { + if f.is_not_finite() { + F::NAN + } else { + f.to_trig_range().cos() + } +} + +fn tan(f: F) -> F { + if f.is_not_finite() { + F::NAN + } else { + f.to_trig_range().tan() + } +} + +fn atan2(x: F, y: F) -> F { + if x.is_nan() || y.is_nan() { + F::NAN + } else { + x.to_trig_range().atan2(y) + } +} + +// Name collisions +mod intrinsics { + intrinsics! { + extern "C" fn sqrtf(f: f32) -> f32 { + super::sqrt(f) + } + + #[bootrom_v2] + extern "C" fn sqrt(f: f64) -> f64 { + super::sqrt(f) + } + + extern "C" fn logf(f: f32) -> f32 { + super::ln(f) + } + + #[bootrom_v2] + extern "C" fn log(f: f64) -> f64 { + super::ln(f) + } + + extern "C" fn expf(f: f32) -> f32 { + super::exp(f) + } + + #[bootrom_v2] + extern "C" fn exp(f: f64) -> f64 { + super::exp(f) + } + + #[slower_than_default] + extern "C" fn sinf(f: f32) -> f32 { + super::sin(f) + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn sin(f: f64) -> f64 { + super::sin(f) + } + + #[slower_than_default] + extern "C" fn cosf(f: f32) -> f32 { + super::cos(f) + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn cos(f: f64) -> f64 { + super::cos(f) + } + + #[slower_than_default] + extern "C" fn tanf(f: f32) -> f32 { + super::tan(f) + } + + #[slower_than_default] + #[bootrom_v2] + extern "C" fn tan(f: f64) -> f64 { + super::tan(f) + } + + // Questionable gain + #[bootrom_v2] + extern "C" fn atan2f(a: f32, b: f32) -> f32 { + super::atan2(a, b) + } + + // Questionable gain + #[bootrom_v2] + extern "C" fn atan2(a: f64, b: f64) -> f64 { + super::atan2(a, b) + } + } +} diff --git a/embassy-rp/src/float/mod.rs b/embassy-rp/src/float/mod.rs new file mode 100644 index 00000000..945afff9 --- /dev/null +++ b/embassy-rp/src/float/mod.rs @@ -0,0 +1,149 @@ +// Credit: taken from `rp-hal` (also licensed Apache+MIT) +// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/mod.rs + +use core::ops; + +// Borrowed and simplified from compiler-builtins so we can use bit ops +// on floating point without macro soup. +pub(crate) trait Int: + Copy + + core::fmt::Debug + + PartialEq + + PartialOrd + + ops::AddAssign + + ops::SubAssign + + ops::BitAndAssign + + ops::BitOrAssign + + ops::BitXorAssign + + ops::ShlAssign + + ops::ShrAssign + + ops::Add + + ops::Sub + + ops::Div + + ops::Shl + + ops::Shr + + ops::BitOr + + ops::BitXor + + ops::BitAnd + + ops::Not +{ + const ZERO: Self; +} + +macro_rules! int_impl { + ($ty:ty) => { + impl Int for $ty { + const ZERO: Self = 0; + } + }; +} + +int_impl!(u32); +int_impl!(u64); + +pub(crate) trait Float: + Copy + + core::fmt::Debug + + PartialEq + + PartialOrd + + ops::AddAssign + + ops::MulAssign + + ops::Add + + ops::Sub + + ops::Div + + ops::Rem +{ + /// A uint of the same with as the float + type Int: Int; + + /// NaN representation for the float + const NAN: Self; + + /// The bitwidth of the float type + const BITS: u32; + + /// The bitwidth of the significand + const SIGNIFICAND_BITS: u32; + + /// A mask for the sign bit + const SIGN_MASK: Self::Int; + + /// A mask for the significand + const SIGNIFICAND_MASK: Self::Int; + + /// A mask for the exponent + const EXPONENT_MASK: Self::Int; + + /// Returns `self` transmuted to `Self::Int` + fn repr(self) -> Self::Int; + + /// Returns a `Self::Int` transmuted back to `Self` + fn from_repr(a: Self::Int) -> Self; + + /// Return a sign swapped `self` + fn negate(self) -> Self; + + /// Returns true if `self` is either NaN or infinity + fn is_not_finite(self) -> bool { + (self.repr() & Self::EXPONENT_MASK) == Self::EXPONENT_MASK + } + + /// Returns true if `self` is infinity + fn is_infinity(self) -> bool { + (self.repr() & (Self::EXPONENT_MASK | Self::SIGNIFICAND_MASK)) == Self::EXPONENT_MASK + } + + /// Returns true if `self is NaN + fn is_nan(self) -> bool { + (self.repr() & (Self::EXPONENT_MASK | Self::SIGNIFICAND_MASK)) > Self::EXPONENT_MASK + } + + /// Returns true if `self` is negative + fn is_sign_negative(self) -> bool { + (self.repr() & Self::SIGN_MASK) != Self::Int::ZERO + } + + /// Returns true if `self` is zero (either sign) + fn is_zero(self) -> bool { + (self.repr() & (Self::SIGNIFICAND_MASK | Self::EXPONENT_MASK)) == Self::Int::ZERO + } +} + +macro_rules! float_impl { + ($ty:ident, $ity:ident, $bits:expr, $significand_bits:expr) => { + impl Float for $ty { + type Int = $ity; + + const NAN: Self = <$ty>::NAN; + + const BITS: u32 = $bits; + const SIGNIFICAND_BITS: u32 = $significand_bits; + + const SIGN_MASK: Self::Int = 1 << (Self::BITS - 1); + const SIGNIFICAND_MASK: Self::Int = (1 << Self::SIGNIFICAND_BITS) - 1; + const EXPONENT_MASK: Self::Int = !(Self::SIGN_MASK | Self::SIGNIFICAND_MASK); + + fn repr(self) -> Self::Int { + self.to_bits() + } + + fn from_repr(a: Self::Int) -> Self { + Self::from_bits(a) + } + + fn negate(self) -> Self { + -self + } + } + }; +} + +float_impl!(f32, u32, 32, 23); +float_impl!(f64, u64, 64, 52); + +mod add_sub; +mod cmp; +mod conv; +mod div; +mod functions; +mod mul; diff --git a/embassy-rp/src/float/mul.rs b/embassy-rp/src/float/mul.rs new file mode 100644 index 00000000..ceb0210e --- /dev/null +++ b/embassy-rp/src/float/mul.rs @@ -0,0 +1,70 @@ +// Credit: taken from `rp-hal` (also licensed Apache+MIT) +// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/float/mul.rs + +use super::Float; +use crate::rom_data; + +trait ROMMul { + fn rom_mul(self, b: Self) -> Self; +} + +impl ROMMul for f32 { + fn rom_mul(self, b: Self) -> Self { + rom_data::float_funcs::fmul(self, b) + } +} + +impl ROMMul for f64 { + fn rom_mul(self, b: Self) -> Self { + rom_data::double_funcs::dmul(self, b) + } +} + +fn mul(a: F, b: F) -> F { + if a.is_not_finite() { + if b.is_zero() { + // [-]inf/NaN * 0 = NaN + return F::NAN; + } + + return if b.is_sign_negative() { + // [+/-]inf/NaN * (-X) = [-/+]inf/NaN + a.negate() + } else { + // [-]inf/NaN * X = [-]inf/NaN + a + }; + } + + if b.is_not_finite() { + if a.is_zero() { + // 0 * [-]inf/NaN = NaN + return F::NAN; + } + + return if b.is_sign_negative() { + // (-X) * [+/-]inf/NaN = [-/+]inf/NaN + b.negate() + } else { + // X * [-]inf/NaN = [-]inf/NaN + b + }; + } + + a.rom_mul(b) +} + +intrinsics! { + #[alias = __mulsf3vfp] + #[aeabi = __aeabi_fmul] + extern "C" fn __mulsf3(a: f32, b: f32) -> f32 { + mul(a, b) + } + + #[bootrom_v2] + #[alias = __muldf3vfp] + #[aeabi = __aeabi_dmul] + extern "C" fn __muldf3(a: f64, b: f64) -> f64 { + mul(a, b) + } +} diff --git a/embassy-rp/src/lib.rs b/embassy-rp/src/lib.rs index 1d63f6c2..3841bb83 100644 --- a/embassy-rp/src/lib.rs +++ b/embassy-rp/src/lib.rs @@ -12,6 +12,7 @@ mod intrinsics; pub mod adc; pub mod dma; +mod float; pub mod gpio; pub mod i2c; pub mod interrupt; diff --git a/tests/rp/.cargo/config.toml b/tests/rp/.cargo/config.toml index 9611db3a..e1744c70 100644 --- a/tests/rp/.cargo/config.toml +++ b/tests/rp/.cargo/config.toml @@ -1,6 +1,8 @@ [unstable] -build-std = ["core"] -build-std-features = ["panic_immediate_abort"] +# enabling these breaks the float tests during linking, with intrinsics +# duplicated between embassy-rp and compilter_builtins +#build-std = ["core"] +#build-std-features = ["panic_immediate_abort"] [target.'cfg(all(target_arch = "arm", target_os = "none"))'] #runner = "teleprobe client run --target rpi-pico --elf" diff --git a/tests/rp/Cargo.toml b/tests/rp/Cargo.toml index 36ff735e..6778f53d 100644 --- a/tests/rp/Cargo.toml +++ b/tests/rp/Cargo.toml @@ -8,7 +8,7 @@ license = "MIT OR Apache-2.0" embassy-sync = { version = "0.2.0", path = "../../embassy-sync", features = ["defmt"] } embassy-executor = { version = "0.1.0", path = "../../embassy-executor", features = ["arch-cortex-m", "executor-thread", "defmt", "integrated-timers"] } embassy-time = { version = "0.1.0", path = "../../embassy-time", features = ["defmt"] } -embassy-rp = { version = "0.1.0", path = "../../embassy-rp", features = ["nightly", "defmt", "unstable-pac", "unstable-traits", "time-driver", "critical-section-impl"] } +embassy-rp = { version = "0.1.0", path = "../../embassy-rp", features = ["nightly", "defmt", "unstable-pac", "unstable-traits", "time-driver", "critical-section-impl", "intrinsics", "rom-v2-intrinsics"] } embassy-futures = { version = "0.1.0", path = "../../embassy-futures" } defmt = "0.3.0" diff --git a/tests/rp/src/bin/float.rs b/tests/rp/src/bin/float.rs new file mode 100644 index 00000000..6715271e --- /dev/null +++ b/tests/rp/src/bin/float.rs @@ -0,0 +1,53 @@ +#![no_std] +#![no_main] +#![feature(type_alias_impl_trait)] + +use defmt::*; +use embassy_executor::Spawner; +use embassy_rp::pac; +use embassy_time::{Duration, Timer}; +use {defmt_rtt as _, panic_probe as _}; + +#[embassy_executor::main] +async fn main(_spawner: Spawner) { + embassy_rp::init(Default::default()); + info!("Hello World!"); + + const PI_F: f32 = 3.1415926535f32; + const PI_D: f64 = 3.14159265358979323846f64; + + unsafe { + pac::BUSCTRL + .perfsel(0) + .write(|r| r.set_perfsel(pac::busctrl::vals::Perfsel::ROM)); + } + + for i in 0..=360 { + let rad_f = (i as f32) * PI_F / 180.0; + info!( + "{}° float: {=f32} / {=f32} / {=f32} / {=f32}", + i, + rad_f, + rad_f - PI_F, + rad_f + PI_F, + rad_f % PI_F + ); + let rad_d = (i as f64) * PI_D / 180.0; + info!( + "{}° double: {=f64} / {=f64} / {=f64} / {=f64}", + i, + rad_d, + rad_d - PI_D, + rad_d + PI_D, + rad_d % PI_D + ); + Timer::after(Duration::from_millis(10)).await; + } + + let rom_accesses = unsafe { pac::BUSCTRL.perfctr(0).read().perfctr() }; + // every float operation used here uses at least 10 cycles + defmt::assert!(rom_accesses >= 360 * 12 * 10); + + info!("Test OK"); + cortex_m::asm::bkpt(); +} From 64b80c2e4d390b16a384e197f1c7ed2a9a6fc946 Mon Sep 17 00:00:00 2001 From: xoviat Date: Wed, 19 Apr 2023 16:16:44 -0500 Subject: [PATCH 3/6] stm32/i2c: ignore wakes without interrupt --- embassy-stm32/src/i2c/v2.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/embassy-stm32/src/i2c/v2.rs b/embassy-stm32/src/i2c/v2.rs index 44237b89..92fc0559 100644 --- a/embassy-stm32/src/i2c/v2.rs +++ b/embassy-stm32/src/i2c/v2.rs @@ -485,6 +485,7 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { poll_fn(|cx| { state.waker.register(cx.waker()); + let isr = unsafe { T::regs().isr().read() }; if remaining_len == total_len { // NOTE(unsafe) self.tx_dma does not fiddle with the i2c registers if first_slice { @@ -503,6 +504,9 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { T::regs().cr1().modify(|w| w.set_tcie(true)); } } + } else if !(isr.tcr() || isr.tc()) { + // poll_fn was woken without an interrupt present + return Poll::Pending; } else if remaining_len == 0 { return Poll::Ready(Ok(())); } else { @@ -575,6 +579,8 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { poll_fn(|cx| { state.waker.register(cx.waker()); + + let isr = unsafe { T::regs().isr().read() }; if remaining_len == total_len { // NOTE(unsafe) self.rx_dma does not fiddle with the i2c registers unsafe { @@ -587,6 +593,9 @@ impl<'d, T: Instance, TXDMA, RXDMA> I2c<'d, T, TXDMA, RXDMA> { &check_timeout, )?; } + } else if !(isr.tcr() || isr.tc()) { + // poll_fn was woken without an interrupt present + return Poll::Pending; } else if remaining_len == 0 { return Poll::Ready(Ok(())); } else { From 837cdacd16eaf10c70758f413867c251e68460d0 Mon Sep 17 00:00:00 2001 From: pennae Date: Wed, 19 Apr 2023 21:56:18 +0200 Subject: [PATCH 4/6] rp: optimize rom-func-cache for runtime storing a full function pointer initialized to a resolver trampoline lets us avoid the runtime cost of checking whether we need to do the initialization. --- embassy-rp/src/rom_data.rs | 117 +++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 62 deletions(-) diff --git a/embassy-rp/src/rom_data.rs b/embassy-rp/src/rom_data.rs index 757a2711..805c1f09 100644 --- a/embassy-rp/src/rom_data.rs +++ b/embassy-rp/src/rom_data.rs @@ -56,50 +56,11 @@ macro_rules! declare_rom_function { fn $name:ident( $($argname:ident: $ty:ty),* ) -> $ret:ty $lookup:block ) => { - #[doc = r"Additional access for the `"] - #[doc = stringify!($name)] - #[doc = r"` ROM function."] - pub mod $name { - /// Retrieve a function pointer. - #[cfg(not(feature = "rom-func-cache"))] - pub fn ptr() -> extern "C" fn( $($argname: $ty),* ) -> $ret { - let p: *const u32 = $lookup; - unsafe { - let func : extern "C" fn( $($argname: $ty),* ) -> $ret = core::mem::transmute(p); - func - } - } - - /// Retrieve a function pointer. - #[cfg(feature = "rom-func-cache")] - pub fn ptr() -> extern "C" fn( $($argname: $ty),* ) -> $ret { - use core::sync::atomic::{AtomicU16, Ordering}; - - // All pointers in the ROM fit in 16 bits, so we don't need a - // full width word to store the cached value. - static CACHED_PTR: AtomicU16 = AtomicU16::new(0); - // This is safe because the lookup will always resolve - // to the same value. So even if an interrupt or another - // core starts at the same time, it just repeats some - // work and eventually writes back the correct value. - let p: *const u32 = match CACHED_PTR.load(Ordering::Relaxed) { - 0 => { - let raw: *const u32 = $lookup; - CACHED_PTR.store(raw as u16, Ordering::Relaxed); - raw - }, - val => val as *const u32, - }; - unsafe { - let func : extern "C" fn( $($argname: $ty),* ) -> $ret = core::mem::transmute(p); - func - } - } - } - - $(#[$outer])* - pub extern "C" fn $name( $($argname: $ty),* ) -> $ret { - $name::ptr()($($argname),*) + declare_rom_function!{ + __internal , + $(#[$outer])* + fn $name( $($argname: $ty),* ) -> $ret + $lookup } }; @@ -107,6 +68,21 @@ macro_rules! declare_rom_function { $(#[$outer:meta])* unsafe fn $name:ident( $($argname:ident: $ty:ty),* ) -> $ret:ty $lookup:block + ) => { + declare_rom_function!{ + __internal unsafe , + $(#[$outer])* + fn $name( $($argname: $ty),* ) -> $ret + $lookup + } + }; + + ( + __internal + $( $maybe_unsafe:ident )? , + $(#[$outer:meta])* + fn $name:ident( $($argname:ident: $ty:ty),* ) -> $ret:ty + $lookup:block ) => { #[doc = r"Additional access for the `"] #[doc = stringify!($name)] @@ -114,43 +90,58 @@ macro_rules! declare_rom_function { pub mod $name { /// Retrieve a function pointer. #[cfg(not(feature = "rom-func-cache"))] - pub fn ptr() -> unsafe extern "C" fn( $($argname: $ty),* ) -> $ret { + pub fn ptr() -> $( $maybe_unsafe )? extern "C" fn( $($argname: $ty),* ) -> $ret { let p: *const u32 = $lookup; unsafe { - let func : unsafe extern "C" fn( $($argname: $ty),* ) -> $ret = core::mem::transmute(p); + let func : $( $maybe_unsafe )? extern "C" fn( $($argname: $ty),* ) -> $ret + = core::mem::transmute(p); func } } + #[cfg(feature = "rom-func-cache")] + // unlike rp2040-hal we store a full word, containing the full function pointer. + // rp2040-hal saves two bytes by storing only the rom offset, at the cost of + // having to do an indirection and an atomic operation on every rom call. + static mut CACHE: $( $maybe_unsafe )? extern "C" fn( $($argname: $ty),* ) -> $ret + = trampoline; + + #[cfg(feature = "rom-func-cache")] + $( $maybe_unsafe )? extern "C" fn trampoline( $($argname: $ty),* ) -> $ret { + use core::sync::atomic::{compiler_fence, Ordering}; + + let p: *const u32 = $lookup; + #[allow(unused_unsafe)] + unsafe { + CACHE = core::mem::transmute(p); + compiler_fence(Ordering::Release); + CACHE($($argname),*) + } + } + /// Retrieve a function pointer. #[cfg(feature = "rom-func-cache")] - pub fn ptr() -> unsafe extern "C" fn( $($argname: $ty),* ) -> $ret { - use core::sync::atomic::{AtomicU16, Ordering}; + pub fn ptr() -> $( $maybe_unsafe )? extern "C" fn( $($argname: $ty),* ) -> $ret { + use core::sync::atomic::{compiler_fence, Ordering}; - // All pointers in the ROM fit in 16 bits, so we don't need a - // full width word to store the cached value. - static CACHED_PTR: AtomicU16 = AtomicU16::new(0); // This is safe because the lookup will always resolve // to the same value. So even if an interrupt or another // core starts at the same time, it just repeats some // work and eventually writes back the correct value. - let p: *const u32 = match CACHED_PTR.load(Ordering::Relaxed) { - 0 => { - let raw: *const u32 = $lookup; - CACHED_PTR.store(raw as u16, Ordering::Relaxed); - raw - }, - val => val as *const u32, - }; + // + // We easily get away with using only compiler fences here + // because RP2040 SRAM is not cached. If it were we'd need + // to make sure updates propagate quickly, or just take the + // hit and let each core resolve every function once. + compiler_fence(Ordering::Acquire); unsafe { - let func : unsafe extern "C" fn( $($argname: $ty),* ) -> $ret = core::mem::transmute(p); - func + CACHE } } } $(#[$outer])* - pub unsafe extern "C" fn $name( $($argname: $ty),* ) -> $ret { + pub $( $maybe_unsafe )? extern "C" fn $name( $($argname: $ty),* ) -> $ret { $name::ptr()($($argname),*) } }; @@ -369,6 +360,7 @@ pub fn fplib_start() -> *const u8 { } /// See Table 180 in the RP2040 datasheet for the contents of this table. +#[cfg_attr(feature = "rom-func-cache", inline(never))] pub fn soft_float_table() -> *const usize { rom_table_lookup(DATA_TABLE, *b"SF") } @@ -379,6 +371,7 @@ pub fn fplib_end() -> *const u8 { } /// This entry is only present in the V2 bootrom. See Table 182 in the RP2040 datasheet for the contents of this table. +#[cfg_attr(feature = "rom-func-cache", inline(never))] pub fn soft_double_table() -> *const usize { if rom_version_number() < 2 { panic!( From 5de6bb3adfa5a82ec764f81409bc60e42fc26d20 Mon Sep 17 00:00:00 2001 From: Lachezar Lechev Date: Thu, 20 Apr 2023 09:19:26 +0300 Subject: [PATCH 5/6] feat: add embassy-boot-rp to the doc builder Signed-off-by: Lachezar Lechev --- .github/workflows/doc.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml index cb222803..411b7589 100644 --- a/.github/workflows/doc.yml +++ b/.github/workflows/doc.yml @@ -61,6 +61,7 @@ jobs: mkdir crates builder ./embassy-boot/boot crates/embassy-boot/git.zup builder ./embassy-boot/nrf crates/embassy-boot-nrf/git.zup + builder ./embassy-boot/rp crates/embassy-boot-rp/git.zup builder ./embassy-boot/stm32 crates/embassy-boot-stm32/git.zup builder ./embassy-cortex-m crates/embassy-cortex-m/git.zup builder ./embassy-embedded-hal crates/embassy-embedded-hal/git.zup @@ -84,5 +85,3 @@ jobs: echo "${{secrets.KUBECONFIG}}" > ~/.kube/config POD=$(kubectl -n embassy get po -l app=docserver -o jsonpath={.items[0].metadata.name}) kubectl cp crates $POD:/data - - \ No newline at end of file From f67eb84ec73c74170de37a15f26cd9d82756d706 Mon Sep 17 00:00:00 2001 From: Lachezar Lechev Date: Thu, 20 Apr 2023 09:20:02 +0300 Subject: [PATCH 6/6] chore: add embassy-boot-rp to README Signed-off-by: Lachezar Lechev --- embassy-boot/boot/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/embassy-boot/boot/README.md b/embassy-boot/boot/README.md index 41440537..56f4a627 100644 --- a/embassy-boot/boot/README.md +++ b/embassy-boot/boot/README.md @@ -13,6 +13,7 @@ By design, the bootloader does not provide any network capabilities. Networking The bootloader supports different hardware in separate crates: * `embassy-boot-nrf` - for the nRF microcontrollers. +* `embassy-boot-rp` - for the RP2040 microcontrollers. * `embassy-boot-stm32` - for the STM32 microcontrollers. ## Minimum supported Rust version (MSRV)