428: executor: Use critical sections instead of atomic CAS loops r=lulf a=Dirbaio

Optimize executor wakes.

CAS loops (either `fetch_update`, or manual `load + compare_exchange_weak`) generate surprisingly horrible code: https://godbolt.org/z/zhscnM1cb

This switches to using critical sections, which makes it faster. On thumbv6 (Cortex-M0) it should make it even faster, as it is currently using `atomic-polyfill`, which will make many critical sections for each `compare_exchange_weak` anyway.

```
            opt-level=3   opt-level=s
  atomics:  105 cycles    101 cycles
       CS:   76 cycles     72 cycles
CS+inline:   72 cycles     64 cycles
```

Measured on an nRF52 with the instruction cache disabled, using this code:

```rust


    poll_fn(|cx| {
        let task = unsafe { task_from_waker(cx.waker()) };

        compiler_fence(Ordering::SeqCst);
        let a = cortex_m::peripheral::DWT::get_cycle_count();
        compiler_fence(Ordering::SeqCst);

        unsafe { wake_task(task) }

        compiler_fence(Ordering::SeqCst);
        let b = cortex_m::peripheral::DWT::get_cycle_count();
        compiler_fence(Ordering::SeqCst);

        defmt::info!("cycles: {=u32}", b.wrapping_sub(a));

        Poll::Ready(())
    })
    .await;
```

Co-authored-by: Dario Nieuwenhuis <dirbaio@dirbaio.net>
This commit is contained in:
bors[bot] 2021-10-18 12:05:43 +00:00 committed by GitHub
commit 729b17bc25
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 24 additions and 34 deletions

View File

@ -20,6 +20,7 @@ use core::pin::Pin;
use core::ptr::NonNull;
use core::task::{Context, Poll};
use core::{mem, ptr};
use critical_section::CriticalSection;
use self::run_queue::{RunQueue, RunQueueItem};
use self::util::UninitCell;
@ -71,30 +72,22 @@ impl TaskHeader {
}
pub(crate) unsafe fn enqueue(&self) {
let mut current = self.state.load(Ordering::Acquire);
loop {
critical_section::with(|cs| {
let state = self.state.load(Ordering::Relaxed);
// If already scheduled, or if not started,
if (current & STATE_RUN_QUEUED != 0) || (current & STATE_SPAWNED == 0) {
if (state & STATE_RUN_QUEUED != 0) || (state & STATE_SPAWNED == 0) {
return;
}
// Mark it as scheduled
let new = current | STATE_RUN_QUEUED;
match self.state.compare_exchange_weak(
current,
new,
Ordering::AcqRel,
Ordering::Acquire,
) {
Ok(_) => break,
Err(next_current) => current = next_current,
}
}
self.state
.store(state | STATE_RUN_QUEUED, Ordering::Relaxed);
// We have just marked the task as scheduled, so enqueue it.
let executor = &*self.executor.get();
executor.enqueue(self as *const TaskHeader as *mut TaskHeader);
executor.enqueue(cs, self as *const TaskHeader as *mut TaskHeader);
})
}
}
@ -264,8 +257,9 @@ impl Executor {
/// - `task` must be a valid pointer to a spawned task.
/// - `task` must be set up to run in this executor.
/// - `task` must NOT be already enqueued (in this executor or another one).
unsafe fn enqueue(&self, task: *mut TaskHeader) {
if self.run_queue.enqueue(task) {
#[inline(always)]
unsafe fn enqueue(&self, cs: CriticalSection, task: *mut TaskHeader) {
if self.run_queue.enqueue(cs, task) {
(self.signal_fn)(self.signal_ctx)
}
}
@ -282,7 +276,10 @@ impl Executor {
pub(super) unsafe fn spawn(&'static self, task: NonNull<TaskHeader>) {
let task = task.as_ref();
task.executor.set(self);
self.enqueue(task as *const _ as _);
critical_section::with(|cs| {
self.enqueue(cs, task as *const _ as _);
})
}
/// Poll all queued tasks in this executor.

View File

@ -1,6 +1,7 @@
use atomic_polyfill::{AtomicPtr, Ordering};
use core::ptr;
use core::ptr::NonNull;
use critical_section::CriticalSection;
use super::TaskHeader;
@ -43,19 +44,11 @@ impl RunQueue {
/// # Safety
///
/// `item` must NOT be already enqueued in any queue.
pub(crate) unsafe fn enqueue(&self, task: *mut TaskHeader) -> bool {
let mut prev = self.head.load(Ordering::Acquire);
loop {
#[inline(always)]
pub(crate) unsafe fn enqueue(&self, _cs: CriticalSection, task: *mut TaskHeader) -> bool {
let prev = self.head.load(Ordering::Relaxed);
(*task).run_queue_item.next.store(prev, Ordering::Relaxed);
match self
.head
.compare_exchange_weak(prev, task, Ordering::AcqRel, Ordering::Acquire)
{
Ok(_) => break,
Err(next_prev) => prev = next_prev,
}
}
self.head.store(task, Ordering::Relaxed);
prev.is_null()
}