Skip to content

Commit 7215515

Browse files
committed
Add wait_ns API
This provides the ability to generate really small delays - it's often the case that wait_us() takes multiple microseconds to set up, so having an alternative suitable for <10us delays is useful. There have been a few local implementations - it makes sense to centralise them as they need retuning for each new ARM core. Based on the local implementation inside the Atmel 802.15.4 driver.
1 parent b08ddaa commit 7215515

File tree

2 files changed

+128
-3
lines changed

2 files changed

+128
-3
lines changed

platform/mbed_wait_api.h

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,11 +78,43 @@ void wait_ms(int ms);
7878
*
7979
* @note
8080
* This function always spins to get the exact number of microseconds.
81-
* If RTOS is present, this will affect power (by preventing deep sleep) and
82-
* multithread performance. Therefore, spinning for millisecond wait is not recommended.
81+
* This will affect power and multithread performance. Therefore, spinning for
82+
* millisecond wait is not recommended, and wait_ms() should
83+
* be used instead.
84+
*
85+
* @note You may call this function from ISR context, but large delays may
86+
* impact system stability - interrupt handlers should take less than
87+
* 50us.
8388
*/
8489
void wait_us(int us);
8590

91+
/** Waits a number of nanoseconds.
92+
*
93+
* This function spins the CPU to produce a small delay. It should normally
94+
* only be used for delays of 10us (10000ns) or less. As it is calculated
95+
* based on the expected execution time of a software loop, it may well run
96+
* slower than requested based on activity from other threads and interrupts.
97+
* If greater precision is required, this can be called from inside a critical
98+
* section.
99+
*
100+
* @param ns the number of nanoseconds to wait
101+
*
102+
* @note
103+
* wait_us() will likely give more precise time than wait_ns for large-enough
104+
* delays, as it is based on a timer, but its set-up time may be excessive
105+
* for the smallest microsecond counts, at which point wait_ns() is better.
106+
*
107+
* @note
108+
* Any delay larger than a millisecond (1000000ns) is liable to cause
109+
* overflow in the internal loop calculation. You shouldn't normally be
110+
* using this for such large delays anyway in real code, but be aware if
111+
* calibrating. Make repeated calls for longer test runs.
112+
*
113+
* @note You may call this function from ISR context.
114+
*
115+
*/
116+
void wait_ns(unsigned int ns);
117+
86118
#ifdef __cplusplus
87119
}
88120
#endif

platform/mbed_wait_api_no_rtos.c

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@
1515
* limitations under the License.
1616
*/
1717

18+
#include "cmsis.h"
19+
#include "platform/mbed_wait_api.h"
20+
1821
// This implementation of the wait functions will be compiled only
1922
// if the RTOS is not present.
2023
#ifndef MBED_CONF_RTOS_PRESENT
2124

22-
#include "platform/mbed_wait_api.h"
2325
#include "hal/us_ticker_api.h"
2426

2527
void wait(float s)
@@ -41,3 +43,94 @@ void wait_us(int us)
4143

4244
#endif // #ifndef MBED_CONF_RTOS_PRESENT
4345

46+
// This wait_ns is used by both RTOS and non-RTOS builds
47+
48+
// LOOP_SCALER is 1000 x the number of CPU cycles one delay-loop iteration
// (SUBS, NOP, NOP, BCS) takes on the selected core. wait_ns() divides
// (cycles per microsecond * nanoseconds) by it to get an iteration count.
// It is left undefined when the core is not recognised.
#ifdef __CORTEX_M
49+
#if (__CORTEX_M == 0 && !defined __CM0PLUS_REV) || __CORTEX_M == 1
50+
// Cortex-M0 and Cortex-M1 take 6 cycles per iteration - SUBS = 1, 2xNOP = 2, BCS = 3
51+
#define LOOP_SCALER 6000
52+
#elif (__CORTEX_M == 0 && defined __CM0PLUS_REV) || __CORTEX_M == 3 || __CORTEX_M == 4 || \
53+
__CORTEX_M == 23 || __CORTEX_M == 33
54+
// Cortex-M0+, M3, M4, M23 and M33 take 5 cycles per iteration - SUBS = 1, 2xNOP = 2, BCS = 2
55+
// TODO - check M33
56+
#define LOOP_SCALER 5000
57+
#elif __CORTEX_M == 7
58+
// Cortex-M7 manages to dual-issue for 2 cycles per iteration (SUB,NOP) = 1, (NOP,BCS) = 1
59+
// (The NOPs were added to stabilise this - with just the SUB and BCS, it seems that the
60+
// M7 sometimes takes 1 cycle, sometimes 2, possibly depending on alignment)
61+
#define LOOP_SCALER 2000
62+
#endif
63+
#elif defined __CORTEX_A
64+
#if __CORTEX_A == 9
65+
// Cortex-A9 is dual-issue, so let's assume same performance as Cortex-M7.
66+
// TODO - test.
67+
#define LOOP_SCALER 2000
68+
#endif
69+
#endif
70+
71+
/* We only define the function if we've identified the CPU. If we haven't,
72+
* rather than a compile-time error, leave it undefined, rather than faulting
73+
* with an immediate #error. This leaves the door open to non-ARM
74+
* builds with or people providing substitutes for other CPUs, and only if
75+
* needed.
76+
*/
77+
#ifdef LOOP_SCALER
78+
79+
// *INDENT-OFF*
80+
#ifdef __CC_ARM /* ARMC5 */
81+
/* ARMC5 embedded-assembler variant of the calibrated delay loop.
 *
 * Each pass executes SUBS, NOP, NOP, BCS on a1, then the function returns
 * with BX lr once the branch is not taken. a1 is presumably the register
 * carrying `count` (first argument under the ARM procedure call standard).
 * NOTE(review): with ARM's "carry = no borrow" rule for SUBS the loop should
 * run count + 1 times - confirm against the architecture manual.
 *
 * The exact instruction mix is what LOOP_SCALER is tuned for; changing it
 * requires retuning LOOP_SCALER for every supported core.
 *
 * @param count value loaded into the down-counter before the first pass
 */
__asm static void delay_loop(uint32_t count)
82+
{
83+
1
84+
SUBS a1, a1, #1
85+
NOP
86+
NOP
87+
BCS %BT1
88+
BX lr
89+
}
90+
#elif defined (__ICCARM__)
91+
/* IAR (ICCARM) inline-assembly variant of the calibrated delay loop.
 *
 * Same instruction sequence as the other toolchain variants: decrement
 * `count` with SUBS, two NOPs, then BCS back to the "loop" label. The
 * sequence is what LOOP_SCALER is tuned for, so it must not be altered
 * without retuning.
 *
 * @param count value of the down-counter before the first pass
 */
static void delay_loop(uint32_t count)
92+
{
93+
__asm volatile(
94+
"loop: \n"
95+
" SUBS %0, %0, #1 \n"
96+
" NOP\n"
97+
" NOP\n"
98+
" BCS.n loop\n"
99+
: "+r" (count) /* count is both read and written by the asm */
100+
:
101+
: "cc" /* SUBS modifies the condition flags */
102+
);
103+
}
104+
#else // GCC or ARMC6
105+
/* GCC / ARMC6 inline-assembly variant of the calibrated delay loop.
 *
 * Same instruction sequence as the other toolchain variants: decrement
 * `count`, two NOPs, then BCS back to the start of the loop. The sequence is
 * what LOOP_SCALER is tuned for, so it must not be altered without retuning.
 *
 * The label is written as "%=", which GCC documents as expanding to a number
 * unique to each asm statement, and "BCS %=b" branches backwards to it.
 *
 * @param count value of the down-counter before the first pass
 */
static void delay_loop(uint32_t count)
106+
{
107+
__asm__ volatile (
108+
"%=:\n\t"
109+
/* Only GCC insists on non-UAL assembly for Thumb v1 */
110+
#if !defined(__ARMCC_VERSION) && defined(__thumb__) && !defined(__thumb2__)
111+
"SUB %0, #1\n\t"
112+
#else
113+
"SUBS %0, %0, #1\n\t"
114+
#endif
115+
"NOP\n\t"
116+
"NOP\n\t"
117+
"BCS %=b\n\t"
118+
: "+l" (count) /* read-write operand; NOTE(review): 'l' presumably limits it to a low register - confirm in GCC ARM constraints */
119+
:
120+
: "cc" /* the subtract modifies the condition flags */
121+
);
122+
}
123+
#endif
124+
// *INDENT-ON*
125+
126+
void wait_ns(unsigned int ns)
127+
{
128+
uint32_t cycles_per_us = SystemCoreClock / 1000000;
129+
// Note that this very calculation, plus call overhead, will take multiple
130+
// cycles. Could well be 100ns on its own... So round down here, startup is
131+
// worth at least one loop iteration.
132+
uint32_t count = (cycles_per_us * ns) / LOOP_SCALER;
133+
134+
delay_loop(count);
135+
}
136+
#endif // LOOP_SCALER

0 commit comments

Comments
 (0)