samd: neopixel: Fix neopixels after #2297

jepler · jepler · commit 1905d0746dbd · 2019-12-07T19:19:45.000-06:00
This adapts the "inline assembler" code from the UF2 bootloader, which in turn is said to be adapted from the Arduino NeoPixel library. This requires that the cache remain ON when using M0, and be turned OFF on M4 (determined by trial and error). Testing performed on a Metro M4: * Measured timings using an oscilloscope and found all values within datasheet tolerance. * Drove a string of 96 neopixels without visible glitches. * On-board neopixel worked. Testing performed on a Circuit Playground Express (M0): * Color wheel code works on built-in neopixels. * Color wheel code works on a 96-neopixel strip. As a bonus, this may have freed up a bit of flash on M0 targets (2988 -> 3068 bytes free on Trinket M0). Closes: #2297
diff --git a/ports/atmel-samd/common-hal/neopixel_write/__init__.c b/ports/atmel-samd/common-hal/neopixel_write/__init__.c
@@ -34,32 +34,70 @@
 #ifdef SAMD51
 #include "hri/hri_cmcc_d51.h"
 #include "hri/hri_nvmctrl_d51.h"
-
-// This magical macro makes sure the delay isn't optimized out and is the
-// minimal three instructions.
-#define delay_cycles(cycles) \
-{ \
-    uint32_t t; \
-    asm volatile ( \
-        "movs %[t], %[c]\n\t" \
-        "loop%=:\n\t" \
-        "subs	%[t], #1\n\t" \
-        "bne.n  loop%=" : [t] "=r"(t) : [c] "I" (cycles)); \
-    }
 #endif
 
-// Ensure this code is compiled with -Os. Any other optimization level may change the timing of it
-// and break neopixels.
-#pragma GCC push_options
-#pragma GCC optimize ("Os")
+__attribute__((naked,noinline,aligned(16))) // naked: the asm body below does its own register save/restore and return
+static void neopixel_send_buffer_core(volatile uint32_t *clraddr, uint32_t pinMask,
+                                      const uint8_t *ptr, int numBytes);
+
+static void neopixel_send_buffer_core(volatile uint32_t *clraddr, uint32_t pinMask, // Bit-bangs the bytes at ptr, MSB first, by storing pinMask to clraddr (clear) and clraddr + 4 (set).
+                                      const uint8_t *ptr, int numBytes) { // Arguments are read straight from r0..r3 (assumes the ARM procedure call standard).
+    asm volatile("        push    {r4, r5, r6, lr};" // save the scratch registers used below plus the return address
+                 "        add     r3, r2, r3;" // r3 := ptr + numBytes (end pointer)
+                 "loopLoad:" // start of the per-byte loop
+                 "        ldrb r5, [r2, #0];" // r5 := *ptr
+                 "        add  r2, #1;"       // ptr++
+                 "        movs    r4, #128;"  // r4-mask, 0x80
+                 "loopBit:" // start of the per-bit loop
+                 "        str r1, [r0, #4];"                    // set: store pinMask at clraddr + 4 (assumes the set register sits 4 bytes after the clear register)
+                 #ifdef SAMD21
+                 "        movs r6, #3; d2: sub r6, #1; bne d2;" // delay 3 (loop iterations, counted down in r6)
+                 #endif
+                 #ifdef SAMD51
+                 "        movs r6, #3; d2: subs r6, #1; bne d2;" // delay 3 (loop iterations, counted down in r6)
+                 #endif
+                 "        tst r4, r5;"                          // mask&r5
+                 "        bne skipclr;" // current bit is 1: skip the early clear so the line stays high longer
+                 "        str r1, [r0, #0];" // clr
+                 "skipclr:"
+                 #ifdef SAMD21
+                 "        movs r6, #6; d0: sub r6, #1; bne d0;" // delay 6
+                 #endif
+                 #ifdef SAMD51
+                 "        movs r6, #6; d0: subs r6, #1; bne d0;" // delay 6
+                 #endif
+                 "        str r1, [r0, #0];"   // clr (possibly again, doesn't matter)
+                 #ifdef SAMD21
+                 "        asr     r4, r4, #1;" // mask >>= 1
+                 #endif
+                 #ifdef SAMD51
+                 "        asrs     r4, r4, #1;" // mask >>= 1
+                 #endif
+                 "        beq     nextbyte;" // mask shifted down to zero: all 8 bits of this byte have been sent
+                 "        uxtb    r4, r4;" // zero-extend mask to 8 bits; NOTE(review): possibly kept only for timing - confirm
+                 #ifdef SAMD21
+                 "        movs r6, #2; d1: sub r6, #1; bne d1;" // delay 2
+                 #endif
+                 #ifdef SAMD51
+                 "        movs r6, #2; d1: subs r6, #1; bne d1;" // delay 2
+                 #endif
+                 "        b       loopBit;"
+                 "nextbyte:"
+                 "        cmp r2, r3;" // compare ptr with the end pointer; NOTE(review): checked only after a byte is sent, so numBytes == 0 still transmits one byte
+                 "        bcs neopixel_stop;" // finished once ptr >= end (unsigned compare)
+                 "        b loopLoad;"
+                 "neopixel_stop:"
+                 "        pop {r4, r5, r6, pc};" // restore registers and return by popping the saved lr into pc
+                 "");
+}
 
 uint64_t next_start_tick_ms = 0;
 uint32_t next_start_tick_us = 1000;
 
 void common_hal_neopixel_write(const digitalio_digitalinout_obj_t* digitalinout, uint8_t *pixels, uint32_t numBytes) {
     // This is adapted directly from the Adafruit NeoPixel library SAMD21G18A code:
     // https://github.com/adafruit/Adafruit_NeoPixel/blob/master/Adafruit_NeoPixel.cpp
-    uint8_t  *ptr, *end, p, bitMask;
+    // and the asm version from https://github.com/microsoft/uf2-samdx1/blob/master/inc/neopixel.h
     uint32_t  pinMask;
     PortGroup* port;
 
@@ -71,100 +109,32 @@ void common_hal_neopixel_write(const digitalio_digitalinout_obj_t* digitalinout,
     mp_hal_disable_all_interrupts();
 
 
-    #ifdef SAMD21
-    // Make sure the NVM cache is consistently timed.
-    NVMCTRL->CTRLB.bit.READMODE = NVMCTRL_CTRLB_READMODE_DETERMINISTIC_Val;
-    #endif
-
     #ifdef SAMD51
     // When this routine is positioned at certain addresses, the timing logic
     // below can be too fast by about 2.5x. This is some kind of (un)fortunate code
-    // positiong with respect to a cache line.
+    // positioning with respect to a cache line.
     // Theoretically we should turn on off the CMCC caches and the
     // NVM caches to ensure consistent timing. Testing shows the the NVMCTRL
     // cache disabling seems to make the difference. But turn both off to make sure.
     // It's difficult to test because additions to the code before the timing loop
-    // below change instruction placement. Testing was done by adding cache changes
-    // below the loop (so only the first time through is wrong).
+    // below change instruction placement. (though this should be less true now that
+    // the main code is in the cache-aligned function neopixel_send_buffer_core)
+    // Testing was done by adding cache changes below the loop (so only the
+    // first time through is wrong).
     //
     // Turn off instruction, data, and NVM caches to force consistent timing.
     // Invalidate existing cache entries.
     hri_cmcc_set_CFG_reg(CMCC, CMCC_CFG_DCDIS | CMCC_CFG_ICDIS);
     hri_cmcc_write_MAINT0_reg(CMCC, CMCC_MAINT0_INVALL);
     hri_nvmctrl_set_CTRLA_CACHEDIS0_bit(NVMCTRL);
     hri_nvmctrl_set_CTRLA_CACHEDIS1_bit(NVMCTRL);
-   #endif
+    #endif
 
     uint32_t pin = digitalinout->pin->number;
     port    =  &PORT->Group[GPIO_PORT(pin)];  // Convert GPIO # to port register
     pinMask =  (1UL << (pin % 32));  // From port_pin_set_output_level ASF code.
-    ptr     =  pixels;
-    end     =  ptr + numBytes;
-    p       = *ptr++;
-    bitMask =  0x80;
-
-    volatile uint32_t *set = &(port->OUTSET.reg),
-                      *clr = &(port->OUTCLR.reg);
-
-    for(;;) {
-        *set = pinMask;
-        // This is the time where the line is always high regardless of the bit.
-        // For the SK6812 its 0.3us +- 0.15us
-        #ifdef SAMD21
-        asm("nop; nop;");
-        #endif
-        #ifdef SAMD51
-        delay_cycles(2);
-        #endif
-        if((p & bitMask) != 0) {
-            // This is the high delay unique to a one bit.
-            // For the SK6812 its 0.3us
-            #ifdef SAMD21
-            asm("nop; nop; nop; nop; nop; nop; nop;");
-            #endif
-            #ifdef SAMD51
-            delay_cycles(3);
-            #endif
-            *clr = pinMask;
-        } else {
-            *clr = pinMask;
-            // This is the low delay unique to a zero bit.
-            // For the SK6812 its 0.3us
-            #ifdef SAMD21
-            asm("nop; nop;");
-            #endif
-            #ifdef SAMD51
-            delay_cycles(2);
-            #endif
-        }
-        if((bitMask >>= 1) != 0) {
-            // This is the delay between bits in a byte and is the 1 code low
-            // level time from the datasheet.
-            // For the SK6812 its 0.6us +- 0.15us
-            #ifdef SAMD21
-            asm("nop; nop; nop; nop; nop;");
-            #endif
-            #ifdef SAMD51
-            delay_cycles(4);
-            #endif
-        } else {
-            if(ptr >= end) break;
-            p       = *ptr++;
-            bitMask = 0x80;
-            // This is the delay between bytes. It's similar to the other branch
-            // in the if statement except its tuned to account for the time the
-            // above operations take.
-            // For the SK6812 its 0.6us +- 0.15us
-            #ifdef SAMD51
-            delay_cycles(3);
-            #endif
-        }
-    }
-
-    #ifdef SAMD21
-    // Speed up! (But inconsistent timing.)
-    NVMCTRL->CTRLB.bit.READMODE = NVMCTRL_CTRLB_READMODE_NO_MISS_PENALTY_Val;
-    #endif
+    volatile uint32_t *clr = &(port->OUTCLR.reg);
+    neopixel_send_buffer_core(clr, pinMask, pixels, numBytes);
 
     #ifdef SAMD51
     // Turn instruction, data, and NVM caches back on.
@@ -189,4 +159,3 @@ void common_hal_neopixel_write(const digitalio_digitalinout_obj_t* digitalinout,
 
 }
 
-#pragma GCC pop_options