rp: add division intrinsics

rp2040-hal adds division intrinsics using the hardware divider unit in
the SIO, as does the pico-sdk itself. using the hardware is faster than
the compiler_rt implementations, and more compact too.
This commit is contained in:
pennae 2023-04-16 19:39:32 +02:00
parent 1fdce6e52a
commit 7a682ec02a

View File

@ -274,3 +274,201 @@ macro_rules! intrinsics {
intrinsics!($($rest)*); intrinsics!($($rest)*);
}; };
} }
// Credit: taken from `rp-hal` (also licensed Apache+MIT)
// https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/sio.rs
// This takes advantage of how AAPCS defines a 64-bit return on 32-bit registers
// by packing it into r0[0:31] and r1[32:63]. So all we need to do is put
// the remainder in the high order 32 bits of a 64 bit result. We can also
// alias the division operators to these for a similar reason r0 is the
// result either way and r1 a scratch register, so the caller can't assume it
// retains the argument value.
#[cfg(target_arch = "arm")]
core::arch::global_asm!(
".macro hwdivider_head",
"ldr r2, =(0xd0000000)", // SIO_BASE
// Check the DIRTY state of the divider by shifting it into the C
// status bit.
"ldr r3, [r2, #0x078]", // DIV_CSR
"lsrs r3, #2", // DIRTY = 1, so shift 2 down
// We only need to save the state when DIRTY, otherwise we can just do the
// division directly.
"bcs 2f",
"1:",
// Do the actual division now, we're either not DIRTY, or we've saved the
// state and branched back here so it's safe now.
".endm",
".macro hwdivider_tail",
// 8 cycle delay to wait for the result. Each branch takes two cycles
// and fits into a 2-byte Thumb instruction, so this is smaller than
// 8 NOPs.
"b 3f",
"3: b 3f",
"3: b 3f",
"3: b 3f",
"3:",
// Read the quotient last, since that's what clears the dirty flag.
"ldr r1, [r2, #0x074]", // DIV_REMAINDER
"ldr r0, [r2, #0x070]", // DIV_QUOTIENT
// Either return to the caller or back to the state restore.
"bx lr",
"2:",
// Since we can't save the signed-ness of the calculation, we have to make
// sure that there's at least an 8 cycle delay before we read the result.
// The push takes 5 cycles, and we've already spent at least 7 checking
// the DIRTY state to get here.
"push {{r4-r6, lr}}",
// Read the quotient last, since that's what clears the dirty flag.
"ldr r3, [r2, #0x060]", // DIV_UDIVIDEND
"ldr r4, [r2, #0x064]", // DIV_UDIVISOR
"ldr r5, [r2, #0x074]", // DIV_REMAINDER
"ldr r6, [r2, #0x070]", // DIV_QUOTIENT
// If we get interrupted here (before a write sets the DIRTY flag) it's
// fine, since we have the full state, so the interruptor doesn't have to
// restore it. Once the write happens and the DIRTY flag is set, the
// interruptor becomes responsible for restoring our state.
"bl 1b",
// If we are interrupted here, then the interruptor will start an incorrect
// calculation using a wrong divisor, but we'll restore the divisor and
// result ourselves correctly. This sets DIRTY, so any interruptor will
// save the state.
"str r3, [r2, #0x060]", // DIV_UDIVIDEND
// If we are interrupted here, the the interruptor may start the
// calculation using incorrectly signed inputs, but we'll restore the
// result ourselves. This sets DIRTY, so any interruptor will save the
// state.
"str r4, [r2, #0x064]", // DIV_UDIVISOR
// If we are interrupted here, the interruptor will have restored
// everything but the quotient may be wrongly signed. If the calculation
// started by the above writes is still ongoing it is stopped, so it won't
// replace the result we're restoring. DIRTY and READY set, but only
// DIRTY matters to make the interruptor save the state.
"str r5, [r2, #0x074]", // DIV_REMAINDER
// State fully restored after the quotient write. This sets both DIRTY
// and READY, so whatever we may have interrupted can read the result.
"str r6, [r2, #0x070]", // DIV_QUOTIENT
"pop {{r4-r6, pc}}",
".endm",
);
macro_rules! division_function {
(
$name:ident $($intrinsic:ident)* ( $argty:ty ) {
$($begin:literal),+
}
) => {
#[cfg(all(target_arch = "arm", feature = "intrinsics"))]
core::arch::global_asm!(
// Mangle the name slightly, since this is a global symbol.
concat!(".global _rphal_", stringify!($name)),
concat!(".type _rphal_", stringify!($name), ", %function"),
".align 2",
concat!("_rphal_", stringify!($name), ":"),
$(
concat!(".global ", stringify!($intrinsic)),
concat!(".type ", stringify!($intrinsic), ", %function"),
concat!(stringify!($intrinsic), ":"),
)*
"hwdivider_head",
$($begin),+ ,
"hwdivider_tail",
);
#[cfg(all(target_arch = "arm", not(feature = "intrinsics")))]
core::arch::global_asm!(
// Mangle the name slightly, since this is a global symbol.
concat!(".global _rphal_", stringify!($name)),
concat!(".type _rphal_", stringify!($name), ", %function"),
".align 2",
concat!("_rphal_", stringify!($name), ":"),
"hwdivider_head",
$($begin),+ ,
"hwdivider_tail",
);
#[cfg(target_arch = "arm")]
extern "aapcs" {
// Connect a local name to global symbol above through FFI.
#[link_name = concat!("_rphal_", stringify!($name)) ]
fn $name(n: $argty, d: $argty) -> u64;
}
#[cfg(not(target_arch = "arm"))]
#[allow(unused_variables)]
unsafe fn $name(n: $argty, d: $argty) -> u64 { 0 }
};
}
division_function! {
unsigned_divmod __aeabi_uidivmod __aeabi_uidiv ( u32 ) {
"str r0, [r2, #0x060]", // DIV_UDIVIDEND
"str r1, [r2, #0x064]" // DIV_UDIVISOR
}
}
division_function! {
signed_divmod __aeabi_idivmod __aeabi_idiv ( i32 ) {
"str r0, [r2, #0x068]", // DIV_SDIVIDEND
"str r1, [r2, #0x06c]" // DIV_SDIVISOR
}
}
fn divider_unsigned(n: u32, d: u32) -> DivResult<u32> {
let packed = unsafe { unsigned_divmod(n, d) };
DivResult {
quotient: packed as u32,
remainder: (packed >> 32) as u32,
}
}
fn divider_signed(n: i32, d: i32) -> DivResult<i32> {
let packed = unsafe { signed_divmod(n, d) };
// Double casts to avoid sign extension
DivResult {
quotient: packed as u32 as i32,
remainder: (packed >> 32) as u32 as i32,
}
}
/// Result of divide/modulo operation
struct DivResult<T> {
/// The quotient of divide/modulo operation
pub quotient: T,
/// The remainder of divide/modulo operation
pub remainder: T,
}
intrinsics! {
extern "C" fn __udivsi3(n: u32, d: u32) -> u32 {
divider_unsigned(n, d).quotient
}
extern "C" fn __umodsi3(n: u32, d: u32) -> u32 {
divider_unsigned(n, d).remainder
}
extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 {
let quo_rem = divider_unsigned(n, d);
if let Some(rem) = rem {
*rem = quo_rem.remainder;
}
quo_rem.quotient
}
extern "C" fn __divsi3(n: i32, d: i32) -> i32 {
divider_signed(n, d).quotient
}
extern "C" fn __modsi3(n: i32, d: i32) -> i32 {
divider_signed(n, d).remainder
}
extern "C" fn __divmodsi4(n: i32, d: i32, rem: &mut i32) -> i32 {
let quo_rem = divider_signed(n, d);
*rem = quo_rem.remainder;
quo_rem.quotient
}
}