samd: neopixel: Fix neopixels after #2297

This adapts the "inline assembler" code from the UF2 bootloader, which
in turn is said to be adapted from the arduino neopixel library.

This requires the cache remain ON when using M0, and be turned OFF on M4
(determined by trial and error)

Testing performed on a Metro M4:
 * measured timings using o'scope and found all values within
   datasheet tolerance.
 * Drove a string of 96 neopixels without visible glitches
 * on-board neopixel worked

Testing performed on a Circuit Playground Express (M0):
 * Color wheel code works on built-in neopixels
 * Color wheel code works on 96 neopixel strip

As a bonus, this may have freed up a bit of flash on M0 targets. (2988 ->
3068 bytes free on Trinket M0)

Closes: #2297
This commit is contained in:
Jeff Epler 2019-12-07 19:06:17 -06:00
parent 15886b1505
commit 1905d0746d

View File

@ -34,24 +34,62 @@
#ifdef SAMD51
#include "hri/hri_cmcc_d51.h"
#include "hri/hri_nvmctrl_d51.h"
// This magical macro makes sure the delay isn't optimized out and is the
// minimal three instructions.
#define delay_cycles(cycles) \
{ \
uint32_t t; \
asm volatile ( \
"movs %[t], %[c]\n\t" \
"loop%=:\n\t" \
"subs %[t], #1\n\t" \
"bne.n loop%=" : [t] "=r"(t) : [c] "I" (cycles)); \
}
#endif
// Ensure this code is compiled with -Os. Any other optimization level may change the timing of it
// and break neopixels.
#pragma GCC push_options
#pragma GCC optimize ("Os")
__attribute__((naked,noinline,aligned(16)))
static void neopixel_send_buffer_core(volatile uint32_t *clraddr, uint32_t pinMask,
const uint8_t *ptr, int numBytes);
static void neopixel_send_buffer_core(volatile uint32_t *clraddr, uint32_t pinMask,
const uint8_t *ptr, int numBytes) {
asm volatile(" push {r4, r5, r6, lr};"
" add r3, r2, r3;"
"loopLoad:"
" ldrb r5, [r2, #0];" // r5 := *ptr
" add r2, #1;" // ptr++
" movs r4, #128;" // r4-mask, 0x80
"loopBit:"
" str r1, [r0, #4];" // set
#ifdef SAMD21
" movs r6, #3; d2: sub r6, #1; bne d2;" // delay 3
#endif
#ifdef SAMD51
" movs r6, #3; d2: subs r6, #1; bne d2;" // delay 3
#endif
" tst r4, r5;" // mask&r5
" bne skipclr;"
" str r1, [r0, #0];" // clr
"skipclr:"
#ifdef SAMD21
" movs r6, #6; d0: sub r6, #1; bne d0;" // delay 6
#endif
#ifdef SAMD51
" movs r6, #6; d0: subs r6, #1; bne d0;" // delay 6
#endif
" str r1, [r0, #0];" // clr (possibly again, doesn't matter)
#ifdef SAMD21
" asr r4, r4, #1;" // mask >>= 1
#endif
#ifdef SAMD51
" asrs r4, r4, #1;" // mask >>= 1
#endif
" beq nextbyte;"
" uxtb r4, r4;"
#ifdef SAMD21
" movs r6, #2; d1: sub r6, #1; bne d1;" // delay 2
#endif
#ifdef SAMD51
" movs r6, #2; d1: subs r6, #1; bne d1;" // delay 2
#endif
" b loopBit;"
"nextbyte:"
" cmp r2, r3;"
" bcs neopixel_stop;"
" b loopLoad;"
"neopixel_stop:"
" pop {r4, r5, r6, pc};"
"");
}
uint64_t next_start_tick_ms = 0;
uint32_t next_start_tick_us = 1000;
@ -59,7 +97,7 @@ uint32_t next_start_tick_us = 1000;
void common_hal_neopixel_write(const digitalio_digitalinout_obj_t* digitalinout, uint8_t *pixels, uint32_t numBytes) {
// This is adapted directly from the Adafruit NeoPixel library SAMD21G18A code:
// https://github.com/adafruit/Adafruit_NeoPixel/blob/master/Adafruit_NeoPixel.cpp
uint8_t *ptr, *end, p, bitMask;
// and the asm version from https://github.com/microsoft/uf2-samdx1/blob/master/inc/neopixel.h
uint32_t pinMask;
PortGroup* port;
@ -71,21 +109,18 @@ void common_hal_neopixel_write(const digitalio_digitalinout_obj_t* digitalinout,
mp_hal_disable_all_interrupts();
#ifdef SAMD21
// Make sure the NVM cache is consistently timed.
NVMCTRL->CTRLB.bit.READMODE = NVMCTRL_CTRLB_READMODE_DETERMINISTIC_Val;
#endif
#ifdef SAMD51
// When this routine is positioned at certain addresses, the timing logic
// below can be too fast by about 2.5x. This is some kind of (un)fortunate code
// positiong with respect to a cache line.
// positioning with respect to a cache line.
// Theoretically we should turn on off the CMCC caches and the
// NVM caches to ensure consistent timing. Testing shows the the NVMCTRL
// cache disabling seems to make the difference. But turn both off to make sure.
// It's difficult to test because additions to the code before the timing loop
// below change instruction placement. Testing was done by adding cache changes
// below the loop (so only the first time through is wrong).
// below change instruction placement. (though this should be less true now that
// the main code is in the cache-aligned function neopixel_send_buffer_core)
// Testing was done by adding cache changes below the loop (so only the
// first time through is wrong).
//
// Turn off instruction, data, and NVM caches to force consistent timing.
// Invalidate existing cache entries.
@ -93,78 +128,13 @@ void common_hal_neopixel_write(const digitalio_digitalinout_obj_t* digitalinout,
hri_cmcc_write_MAINT0_reg(CMCC, CMCC_MAINT0_INVALL);
hri_nvmctrl_set_CTRLA_CACHEDIS0_bit(NVMCTRL);
hri_nvmctrl_set_CTRLA_CACHEDIS1_bit(NVMCTRL);
#endif
#endif
uint32_t pin = digitalinout->pin->number;
port = &PORT->Group[GPIO_PORT(pin)]; // Convert GPIO # to port register
pinMask = (1UL << (pin % 32)); // From port_pin_set_output_level ASF code.
ptr = pixels;
end = ptr + numBytes;
p = *ptr++;
bitMask = 0x80;
volatile uint32_t *set = &(port->OUTSET.reg),
*clr = &(port->OUTCLR.reg);
for(;;) {
*set = pinMask;
// This is the time where the line is always high regardless of the bit.
// For the SK6812 its 0.3us +- 0.15us
#ifdef SAMD21
asm("nop; nop;");
#endif
#ifdef SAMD51
delay_cycles(2);
#endif
if((p & bitMask) != 0) {
// This is the high delay unique to a one bit.
// For the SK6812 its 0.3us
#ifdef SAMD21
asm("nop; nop; nop; nop; nop; nop; nop;");
#endif
#ifdef SAMD51
delay_cycles(3);
#endif
*clr = pinMask;
} else {
*clr = pinMask;
// This is the low delay unique to a zero bit.
// For the SK6812 its 0.3us
#ifdef SAMD21
asm("nop; nop;");
#endif
#ifdef SAMD51
delay_cycles(2);
#endif
}
if((bitMask >>= 1) != 0) {
// This is the delay between bits in a byte and is the 1 code low
// level time from the datasheet.
// For the SK6812 its 0.6us +- 0.15us
#ifdef SAMD21
asm("nop; nop; nop; nop; nop;");
#endif
#ifdef SAMD51
delay_cycles(4);
#endif
} else {
if(ptr >= end) break;
p = *ptr++;
bitMask = 0x80;
// This is the delay between bytes. It's similar to the other branch
// in the if statement except its tuned to account for the time the
// above operations take.
// For the SK6812 its 0.6us +- 0.15us
#ifdef SAMD51
delay_cycles(3);
#endif
}
}
#ifdef SAMD21
// Speed up! (But inconsistent timing.)
NVMCTRL->CTRLB.bit.READMODE = NVMCTRL_CTRLB_READMODE_NO_MISS_PENALTY_Val;
#endif
volatile uint32_t *clr = &(port->OUTCLR.reg);
neopixel_send_buffer_core(clr, pinMask, pixels, numBytes);
#ifdef SAMD51
// Turn instruction, data, and NVM caches back on.
@ -189,4 +159,3 @@ void common_hal_neopixel_write(const digitalio_digitalinout_obj_t* digitalinout,
}
#pragma GCC pop_options