Add wait_ns API

kjbracey · kjbracey · commit 72155158802e · 2019-02-26T17:30:51.000+02:00
This provides the ability to generate really small delays - it's often
the case that wait_us() takes multiple microseconds to set up, so
having an alternative suitable for <10us delays is useful.

There have been a few local implementations - it makes sense to
centralise them as they need retuning for each new ARM core.

Based on the local implementation inside the Atmel 802.15.4 driver.
diff --git a/platform/mbed_wait_api.h b/platform/mbed_wait_api.h
@@ -78,11 +78,43 @@ void wait_ms(int ms);
  *
  *  @note
  *    This function always spins to get the exact number of microseconds.
- *    If RTOS is present, this will affect power (by preventing deep sleep) and
- *    multithread performance. Therefore, spinning for millisecond wait is not recommended.
+ *    This will affect power and multithread performance. Therefore, spinning for
+ *    millisecond wait is not recommended, and wait_ms() should
+ *    be used instead.
+ *
+ *  @note You may call this function from ISR context, but large delays may
+ *    impact system stability - interrupt handlers should take less than
+ *    50us.
  */
 void wait_us(int us);
 
+/** Waits a number of nanoseconds.
+ *
+ * This function spins the CPU to produce a small delay. It should normally
+ * only be used for delays of 10us (10000ns) or less. As it is calculated
+ * based on the expected execution time of a software loop, it may well run
+ * slower than requested based on activity from other threads and interrupts.
+ * If greater precision is required, this can be called from inside a critical
+ * section.
+ *
+ *  @param ns the number of nanoseconds to wait
+ *
+ *  @note
+ *    wait_us() will likely give more precise time than wait_ns for large-enough
+ *    delays, as it is based on a timer, but its set-up time may be excessive
+ *    for the smallest microsecond counts, at which point wait_ns() is better.
+ *
+ *  @note
+ *    Any delay larger than a millisecond (1000000ns) is liable to cause
+ *    overflow in the internal loop calculation. You shouldn't normally be
+ *    using this for such large delays anyway in real code, but be aware if
+ *    calibrating. Make repeated calls for longer test runs.
+ *
+ *  @note You may call this function from ISR context.
+ *
+ */
+void wait_ns(unsigned int ns);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/platform/mbed_wait_api_no_rtos.c b/platform/mbed_wait_api_no_rtos.c
@@ -15,11 +15,13 @@
  * limitations under the License.
  */
 
+#include "cmsis.h"
+#include "platform/mbed_wait_api.h"
+
 // This implementation of the wait functions will be compiled only
 // if the RTOS is not present.
 #ifndef MBED_CONF_RTOS_PRESENT
 
-#include "platform/mbed_wait_api.h"
 #include "hal/us_ticker_api.h"
 
 void wait(float s)
@@ -41,3 +43,94 @@ void wait_us(int us)
 
 #endif // #ifndef MBED_CONF_RTOS_PRESENT
 
+// This wait_ns is used by both RTOS and non-RTOS builds
+
+#ifdef __CORTEX_M
+#if (__CORTEX_M == 0 && !defined __CM0PLUS_REV) || __CORTEX_M == 1
+// Cortex-M0 and Cortex-M1 take 6 cycles per iteration - SUBS = 1, 2xNOP = 2, BCS = 3
+#define LOOP_SCALER 6000
+#elif (__CORTEX_M == 0 && defined __CM0PLUS_REV) || __CORTEX_M == 3 || __CORTEX_M == 4 || \
+      __CORTEX_M == 23 || __CORTEX_M == 33
+// Cortex-M0+, M3, M4, M23 and M33 take 5 cycles per iteration - SUBS = 1, 2xNOP = 2, BCS = 2
+// TODO - check M33
+#define LOOP_SCALER 5000
+#elif __CORTEX_M == 7
+// Cortex-M7 manages to dual-issue for 2 cycles per iteration (SUB,NOP) = 1, (NOP,BCS) = 1
+// (The NOPs were added to stabilise this - with just the SUB and BCS, it seems that the
+// M7 sometimes takes 1 cycle, sometimes 2, possibly depending on alignment)
+#define LOOP_SCALER 2000
+#endif
+#elif defined __CORTEX_A
+#if __CORTEX_A == 9
+// Cortex-A9 is dual-issue, so let's assume same performance as Cortex-M7.
+// TODO - test.
+#define LOOP_SCALER 2000
+#endif
+#endif
+
+/* We only define the function if we've identified the CPU. If we haven't,
+ * we leave it undefined rather than faulting with an immediate compile-time
+ * #error. This leaves the door open to non-ARM builds, and to people
+ * providing substitutes for other CPUs - the missing definition only
+ * matters if the function is actually needed.
+ */
+#ifdef LOOP_SCALER
+
+// *INDENT-OFF*
+#ifdef __CC_ARM /* ARMC5: whole function written in embedded assembler */
+__asm static void delay_loop(uint32_t count)
+{
+1
+  SUBS a1, a1, #1
+  NOP
+  NOP
+  BCS  %BT1
+  BX   lr
+}
+#elif defined (__ICCARM__) /* IAR: same loop as inline assembler */
+static void delay_loop(uint32_t count)
+{
+  __asm volatile(
+    "loop: \n"
+    " SUBS %0, %0, #1 \n"
+    " NOP\n"
+    " NOP\n"
+    " BCS.n  loop\n"
+    : "+r" (count) /* count is both read and written by the loop */
+    :
+    : "cc" /* SUBS changes the condition flags */
+  );
+}
+#else // GCC or ARMC6
+static void delay_loop(uint32_t count)
+{
+  __asm__ volatile (
+    "%=:\n\t" /* %= expands to a number unique to this asm statement, used here as a local label */
+/* Only GCC insists on non-UAL assembly for Thumb v1 */
+#if !defined(__ARMCC_VERSION) && defined(__thumb__) && !defined(__thumb2__)
+    "SUB  %0, #1\n\t"
+#else
+    "SUBS %0, %0, #1\n\t"
+#endif
+    "NOP\n\t" /* the two NOPs are part of the per-iteration cycle count that LOOP_SCALER encodes */
+    "NOP\n\t"
+    "BCS  %=b\n\t" /* branch back while the decrement did not borrow, so the body runs count + 1 times */
+    : "+l" (count) /* read-write operand; "l" limits it to a low register in Thumb state */
+    :
+    : "cc" /* the subtract changes the condition flags */
+  );
+}
+#endif
+// *INDENT-ON*
+
+void wait_ns(unsigned int ns)
+{
+    uint32_t cycles_per_us = SystemCoreClock / 1000000; // integer division: any fractional MHz of the core clock is dropped
+    // Note that this very calculation, plus call overhead, will take multiple
+    // cycles. Could well be 100ns on its own... So round down here, startup is
+    // worth at least one loop iteration.
+    uint32_t count = (cycles_per_us * ns) / LOOP_SCALER; // LOOP_SCALER is 1000 x cycles per loop iteration; the unsigned multiply can wrap for large ns
+
+    delay_loop(count);
+}
+#endif // LOOP_SCALER