Merge pull request #2526 from jepler/mixer-opt

Mixer: Rework for performance, particularly of the S16 case
This commit is contained in:
sommersoft 2020-01-24 21:46:47 -06:00 committed by GitHub
commit c2fd30364e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 141 additions and 260 deletions

View File

@ -101,198 +101,141 @@ void audiomixer_mixer_reset_buffer(audiomixer_mixer_obj_t* self,
} }
} }
// add8signed: combines the four signed 8-bit lanes packed in `a` and `b` and
// returns the four result lanes packed the same way.
// On ARMv7E-M builds the result comes from a SIMD intrinsic; otherwise each
// lane is extracted, combined and re-packed by the loop below.
// NOTE(review): the first three lines of this block carry text from both
// columns of the side-by-side diff (the replacement add16signed starts here).
uint32_t add8signed(uint32_t a, uint32_t b) { __attribute__((always_inline))
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU static inline uint32_t add16signed(uint32_t a, uint32_t b) {
return __SHADD8(a, b); return __QADD16(a, b);
#else
uint32_t result = 0;
for (int8_t i = 0; i < 4; i++) {
// Narrowing to int8_t keeps only lane i and sign-extends it.
int8_t ai = a >> (sizeof(int8_t) * 8 * i);
int8_t bi = b >> (sizeof(int8_t) * 8 * i);
// NOTE(review): operator precedence makes this ai + (bi / 2); the intrinsic
// path is a halving add, so (ai + bi) / 2 was presumably intended - confirm.
int32_t intermediate = (int32_t) ai + bi / 2;
// NOTE(review): CHAR_MAX/CHAR_MIN follow the signedness of plain char, which
// is implementation-defined; SCHAR_MAX/SCHAR_MIN look like the intended bounds.
if (intermediate > CHAR_MAX) {
intermediate = CHAR_MAX;
} else if (intermediate < CHAR_MIN) {
intermediate = CHAR_MIN;
}
// Mask to 8 bits so a negative value cannot spill into the other lanes.
result |= ((uint32_t) intermediate & 0xff) << (sizeof(int8_t) * 8 * i);
}
return result;
#endif
}
// add8unsigned: averages the four unsigned 8-bit lanes packed in `a` and `b`.
// Each result lane is (lane_a + lane_b) / 2, rounded down, so the mix can
// never overflow a lane.
uint32_t add8unsigned(uint32_t a, uint32_t b) {
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
    return __UHADD8(a, b);
#else
    uint32_t packed = 0;
    for (unsigned shift = 0; shift < 32; shift += 8) {
        uint32_t lane_a = (a >> shift) & 0xffu;
        uint32_t lane_b = (b >> shift) & 0xffu;
        // Two 8-bit values sum to at most 510, so the halved value always
        // fits in a lane and needs no saturation.
        uint32_t half = (lane_a + lane_b) / 2;
        packed |= half << shift;
    }
    return packed;
#endif
}
// add16signed: halving add of the two signed 16-bit lanes packed in `a` and
// `b`. Each result lane is floor((lane_a + lane_b) / 2), which is what the
// __SHADD16 path produces, so both build variants mix identically.
//
// The previous fallback computed lane_a + (lane_b / 2) because of operator
// precedence, which left the first operand at full scale.
uint32_t add16signed(uint32_t a, uint32_t b) {
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
    return __SHADD16(a, b);
#else
    uint32_t result = 0;
    for (unsigned shift = 0; shift < 32; shift += 16) {
        // Go through uint16_t/int16_t so the lane is sign-extended.
        int32_t ai = (int16_t) (uint16_t) (a >> shift);
        int32_t bi = (int16_t) (uint16_t) (b >> shift);
        int32_t sum = ai + bi;
        // Round toward negative infinity without relying on >> of a negative
        // value. The halved sum always fits in 16 bits, so no clamp is needed.
        int32_t half = (sum - (sum < 0 ? 1 : 0)) / 2;
        result |= ((uint32_t) half & 0xffffu) << shift;
    }
    return result;
#endif
}
// add16unsigned: halving add of the two unsigned 16-bit lanes packed in `a`
// and `b`. Each result lane is (lane_a + lane_b) / 2, rounded down, matching
// the __UHADD16 path.
//
// The previous fallback re-centred each lane by subtracting 0x8000 but never
// added the offset back, and it only halved the second operand.
uint32_t add16unsigned(uint32_t a, uint32_t b) {
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
    return __UHADD16(a, b);
#else
    uint32_t result = 0;
    for (unsigned shift = 0; shift < 32; shift += 16) {
        uint32_t ai = (a >> shift) & 0xffffu;
        uint32_t bi = (b >> shift) & 0xffffu;
        // The sum is at most 0x1fffe, so the halved value fits in the lane.
        result |= ((ai + bi) / 2) << shift;
    }
    return result;
#endif
}
// mult8unsigned: scales the four unsigned 8-bit lanes packed in `val` by
// mul / 32767 and returns them re-packed.
//
//   val - four unsigned 8-bit samples, one per byte.
//   mul - gain in fixed point, where (1 << 15) - 1 represents 1.0.
//
// Each lane saturates to [0, UCHAR_MAX]. The previous version compared against
// SHRT_MAX, so a lane that overflowed 8 bits wrapped around through the
// `& 0xff` mask instead of saturating.
// NOTE(review): lanes are scaled toward 0 rather than toward the 0x80
// midpoint (mult16unsigned re-centres its lanes first) - confirm intent.
static inline uint32_t mult8unsigned(uint32_t val, int32_t mul) {
    // A zero gain yields zero in every lane; skip the floating-point work.
    if (mul == 0) {
        return 0;
    }
    // TODO: work out an ARMv7E-M SIMD implementation.
    uint32_t result = 0;
    float mod_mul = (float) mul / (float) ((1 << 15) - 1);
    for (unsigned shift = 0; shift < 32; shift += 8) {
        uint8_t ai = (uint8_t) (val >> shift);
        int32_t intermediate = (int32_t) (ai * mod_mul);
        if (intermediate > UCHAR_MAX) {
            intermediate = UCHAR_MAX;
        } else if (intermediate < 0) {
            intermediate = 0;
        }
        result |= (uint32_t) intermediate << shift;
    }
    return result;
}
// mult8signed: scales the four signed 8-bit lanes packed in `val` by
// mul / 32767 and returns them re-packed, saturating each lane to
// [INT8_MIN, INT8_MAX].
//
//   val - four signed 8-bit samples, one per byte.
//   mul - gain in fixed point, where (1 << 15) - 1 represents 1.0.
//
// Fixes relative to the previous version:
//  - each lane is read as int8_t (it was int16_t, which pulled in the
//    neighbouring byte and lost the sign of the sample);
//  - results are placed 8 bits apart (they were shifted by 16 * i, which
//    is undefined for the upper two lanes);
//  - the clamp uses INT8_MIN/INT8_MAX rather than CHAR_MIN/CHAR_MAX, whose
//    values depend on whether plain char is signed.
static inline uint32_t mult8signed(uint32_t val, int32_t mul) {
    // A zero gain yields zero in every lane; skip the floating-point work.
    if (mul == 0) {
        return 0;
    }
    // TODO: work out an ARMv7E-M SIMD implementation.
    uint32_t result = 0;
    float mod_mul = (float) mul / (float) ((1 << 15) - 1);
    for (unsigned shift = 0; shift < 32; shift += 8) {
        int8_t ai = (int8_t) (uint8_t) (val >> shift);
        int32_t intermediate = (int32_t) (ai * mod_mul);
        if (intermediate > INT8_MAX) {
            intermediate = INT8_MAX;
        } else if (intermediate < INT8_MIN) {
            intermediate = INT8_MIN;
        }
        // Mask so a negative lane cannot spill into the lanes above it.
        result |= ((uint32_t) intermediate & 0xffu) << shift;
    }
    return result;
}
// mult16unsigned: scales the two unsigned 16-bit lanes packed in `val` by
// mul / 32767 and returns them re-packed.
// Each lane is re-centred by subtracting 0x8000, multiplied, saturated to the
// int16_t range and then offset back by 0x8000.
// NOTE(review): mul == 0 returns 0 in both lanes, whereas any non-zero mul
// that scales a lane down to 0 yields 0x8000 for it - confirm which value is
// meant to represent silence here.
static inline uint32_t mult16unsigned(uint32_t val, int32_t mul) {
// A zero gain short-circuits the floating-point work below.
if (mul == 0) {
return 0;
}
/* TODO: the below ARMv7m instructions "work", but the amplitude is much higher/louder
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
// there is no unsigned equivalent to the 'SMULWx' ARMv7 Thumb function,
// so we have to do it by hand.
uint32_t lo = val & 0xffff;
uint32_t hi = val >> 16;
//mp_printf(&mp_plat_print, "pre-asm: (mul: %d)\n\tval: %x\tlo: %x\thi: %x\n", mul, val, lo, hi);
uint32_t val_lo;
asm volatile("mul %0, %1, %2" : "=r" (val_lo) : "r" (mul), "r" (lo));
asm volatile("mla %0, %1, %2, %3" : "=r" (val) : "r" (mul), "r" (hi), "r" (val_lo));
//mp_printf(&mp_plat_print, "post-asm:\n\tval: %x\tlo: %x\n\n", val, val_lo);
return val;
#else
*/
uint32_t result = 0;
// Gain as a float, where (1 << 15) - 1 corresponds to 1.0.
float mod_mul = (float)mul / (float)((1<<15)-1);
for (int8_t i = 0; i < 2; i++) {
// Narrowing to int16_t keeps only lane i, now centred on zero.
int16_t ai = (val >> (sizeof(uint16_t) * 8 * i)) - 0x8000;
int32_t intermediate = ai * mod_mul;
if (intermediate > SHRT_MAX) {
intermediate = SHRT_MAX;
} else if (intermediate < SHRT_MIN) {
intermediate = SHRT_MIN;
}
// After the clamp, adding 0x8000 in 32-bit unsigned arithmetic wraps a
// negative value back into 0..0xffff, so no extra mask is applied.
result |= (((uint32_t) intermediate) + 0x8000) << (sizeof(int16_t) * 8 * i);
}
return result;
//#endif
} }
__attribute__((always_inline))
static inline uint32_t mult16signed(uint32_t val, int32_t mul) { static inline uint32_t mult16signed(uint32_t val, int32_t mul) {
// if mul == 0, no need in wasting cycles mul <<= 16;
if (mul == 0) {
return 0;
}
#if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
int32_t hi, lo; int32_t hi, lo;
enum { bits = 16 }; // saturate to 16 bits enum { bits = 16 }; // saturate to 16 bits
enum { shift = 0 }; // shift is done automatically enum { shift = 15 }; // shift is done automatically
asm volatile("smulwb %0, %1, %2" : "=r" (lo) : "r" (mul), "r" (val)); asm volatile("smulwb %0, %1, %2" : "=r" (lo) : "r" (mul), "r" (val));
asm volatile("smulwt %0, %1, %2" : "=r" (hi) : "r" (mul), "r" (val)); asm volatile("smulwt %0, %1, %2" : "=r" (hi) : "r" (mul), "r" (val));
asm volatile("ssat %0, %1, %2, asr %3" : "=r" (lo) : "I" (bits), "r" (lo), "I" (shift)); asm volatile("ssat %0, %1, %2, asr %3" : "=r" (lo) : "I" (bits), "r" (lo), "I" (shift));
asm volatile("ssat %0, %1, %2, asr %3" : "=r" (hi) : "I" (bits), "r" (hi), "I" (shift)); asm volatile("ssat %0, %1, %2, asr %3" : "=r" (hi) : "I" (bits), "r" (hi), "I" (shift));
asm volatile("pkhbt %0, %1, %2, lsl #16" : "=r" (val) : "r" (lo), "r" (hi)); // pack asm volatile("pkhbt %0, %1, %2, lsl #16" : "=r" (val) : "r" (lo), "r" (hi)); // pack
return val; return val;
#else
uint32_t result = 0;
float mod_mul = (float)mul / (float)((1<<15)-1);
for (int8_t i = 0; i < 2; i++) {
int16_t ai = val >> (sizeof(int16_t) * 8 * i);
int32_t intermediate = ai * mod_mul;
if (intermediate > SHRT_MAX) {
intermediate = SHRT_MAX;
} else if (intermediate < SHRT_MIN) {
intermediate = SHRT_MIN;
} }
result |= (((uint32_t) intermediate) & 0xffff) << (sizeof(int16_t) * 8 * i);
// tounsigned8: offsets each of the four 8-bit lanes in `val` by 0x80 using
// the __UADD8 SIMD intrinsic, turning signed samples into unsigned ones.
static inline uint32_t tounsigned8(uint32_t val) {
    const uint32_t lane_bias = 0x80808080;
    return __UADD8(val, lane_bias);
}
// tounsigned16: offsets both 16-bit lanes in `val` by 0x8000 using the
// __UADD16 SIMD intrinsic, turning signed samples into unsigned ones.
static inline uint32_t tounsigned16(uint32_t val) {
    const uint32_t lane_bias = 0x80008000;
    return __UADD16(val, lane_bias);
}
// tosigned16: offsets both 16-bit lanes in `val` by 0x8000 using the
// __UADD16 SIMD intrinsic, turning unsigned samples into signed ones.
// Flipping the top bit of a lane is its own inverse, so this is the same
// operation tounsigned16 performs.
static inline uint32_t tosigned16(uint32_t val) {
    const uint32_t lane_bias = 0x80008000;
    return __UADD16(val, lane_bias);
}
// unpack8: widens two packed 8-bit samples into two 16-bit lanes, placing
// each byte in the upper half of its lane (low byte -> bits 8..15, high byte
// -> bits 24..31) so the 16-bit helpers can operate on them.
static inline uint32_t unpack8(uint16_t val) {
    // Widen first: a uint16_t promotes to int, and shifting a set bit 15 left
    // by 16 would overflow a signed int.
    uint32_t wide = val;
    return ((wide & 0xff00u) << 16) | ((wide & 0x00ffu) << 8);
}
// pack8: narrows two 16-bit lanes to two packed 8-bit samples by keeping
// only the upper byte of each lane (bits 24..31 -> bits 8..15, bits 8..15 ->
// bits 0..7). The low byte of each lane is discarded.
static inline uint32_t pack8(uint32_t val) {
    uint32_t upper = (val >> 16) & 0xff00;
    uint32_t lower = (val >> 8) & 0x00ff;
    return upper | lower;
}
// mix_down_one_voice: renders one voice into `word_buffer`, scaling every
// sample by the voice's level.
//
//   self          - mixer; bits_per_sample and samples_signed are read.
//   voice         - voice to pull data from. Its sample, more_data,
//                   remaining_buffer and buffer_length fields are updated as
//                   data is consumed.
//   voices_active - false while nothing has been written to word_buffer yet
//                   (samples are stored); true once a previous voice has
//                   filled it (samples are combined with add16signed()).
//   word_buffer   - destination, addressed in 32-bit words.
//   length        - number of 32-bit words to produce.
//
// word_buffer holds signed samples while mixing: unsigned sources go through
// tosigned16() before they are stored or added. 8-bit samples are widened
// with unpack8(), processed as 16-bit lanes and narrowed again with pack8().
static void mix_down_one_voice(audiomixer_mixer_obj_t* self,
        audiomixer_mixervoice_obj_t* voice, bool voices_active,
        uint32_t* word_buffer, uint32_t length) {
    bool voice_done = voice->sample == NULL;
    while (!voice_done && length != 0) {
        if (voice->buffer_length == 0) {
            if (!voice->more_data) {
                if (voice->loop) {
                    audiosample_reset_buffer(voice->sample, false, 0);
                } else {
                    // The sample is exhausted and does not loop: stop here.
                    voice->sample = NULL;
                    voice_done = true;
                    break;
                }
            }
            // Load another buffer
            audioio_get_buffer_result_t result = audiosample_get_buffer(voice->sample, false, 0, (uint8_t**) &voice->remaining_buffer, &voice->buffer_length);
            // Track length in terms of words.
            voice->buffer_length /= sizeof(uint32_t);
            voice->more_data = result == GET_BUFFER_MORE_DATA;
        }

        uint32_t n = MIN(voice->buffer_length, length);
        uint32_t *src = voice->remaining_buffer;
        uint16_t level = voice->level;

        if (!voices_active) {
            // First voice: its level-scaled samples overwrite the buffer.
            if (MP_LIKELY(self->bits_per_sample == 16)) {
                if (MP_LIKELY(self->samples_signed)) {
                    for (uint32_t i = 0; i < n; i++) {
                        uint32_t v = src[i];
                        word_buffer[i] = mult16signed(v, level);
                    }
                } else {
                    for (uint32_t i = 0; i < n; i++) {
                        uint32_t v = src[i];
                        v = tosigned16(v);
                        word_buffer[i] = mult16signed(v, level);
                    }
                }
            } else {
                // 8-bit data: each 32-bit word holds four samples, handled
                // two at a time as 16-bit halves.
                uint16_t *hword_buffer = (uint16_t*)word_buffer;
                uint16_t *hsrc = (uint16_t*)src;
                for (uint32_t i = 0; i < n * 2; i++) {
                    uint32_t word = unpack8(hsrc[i]);
                    if (MP_LIKELY(!self->samples_signed)) {
                        word = tosigned16(word);
                    }
                    word = mult16signed(word, level);
                    hword_buffer[i] = pack8(word);
                }
            }
        } else {
            // Later voices: level-scaled samples are added to what is there.
            if (MP_LIKELY(self->bits_per_sample == 16)) {
                if (MP_LIKELY(self->samples_signed)) {
                    for (uint32_t i = 0; i < n; i++) {
                        uint32_t word = src[i];
                        word_buffer[i] = add16signed(mult16signed(word, level), word_buffer[i]);
                    }
                } else {
                    for (uint32_t i = 0; i < n; i++) {
                        uint32_t word = src[i];
                        word = tosigned16(word);
                        word_buffer[i] = add16signed(mult16signed(word, level), word_buffer[i]);
                    }
                }
            } else {
                uint16_t *hword_buffer = (uint16_t*)word_buffer;
                uint16_t *hsrc = (uint16_t*)src;
                for (uint32_t i = 0; i < n * 2; i++) {
                    uint32_t word = unpack8(hsrc[i]);
                    if (MP_LIKELY(!self->samples_signed)) {
                        word = tosigned16(word);
                    }
                    word = mult16signed(word, level);
                    word = add16signed(word, unpack8(hword_buffer[i]));
                    hword_buffer[i] = pack8(word);
                }
            }
        }

        length -= n;
        word_buffer += n;
        voice->remaining_buffer += n;
        voice->buffer_length -= n;
    }

    // Nothing has been written to the rest of the buffer yet, so fill it with
    // silence. Every sample stored above is in signed form, so silence is
    // zero. The previous fill value (0x80 / 0x8000 per lane) is the most
    // negative signed sample, which put a full-scale DC offset into the mix.
    if (length && !voices_active) {
        for (uint32_t i = 0; i < length; i++) {
            word_buffer[i] = 0;
        }
    }
}
audioio_get_buffer_result_t audiomixer_mixer_get_buffer(audiomixer_mixer_obj_t* self, audioio_get_buffer_result_t audiomixer_mixer_get_buffer(audiomixer_mixer_obj_t* self,
@ -322,89 +265,27 @@ audioio_get_buffer_result_t audiomixer_mixer_get_buffer(audiomixer_mixer_obj_t*
} }
self->use_first_buffer = !self->use_first_buffer; self->use_first_buffer = !self->use_first_buffer;
bool voices_active = false; bool voices_active = false;
uint32_t length = self->len / sizeof(uint32_t);
for (int32_t v = 0; v < self->voice_count; v++) { for (int32_t v = 0; v < self->voice_count; v++) {
audiomixer_mixervoice_obj_t* voice = MP_OBJ_TO_PTR(self->voice[v]); audiomixer_mixervoice_obj_t* voice = MP_OBJ_TO_PTR(self->voice[v]);
uint32_t j = 0; mix_down_one_voice(self, voice, voices_active, word_buffer, length);
bool voice_done = voice->sample == NULL;
for (uint32_t i = 0; i < self->len / sizeof(uint32_t); i++) {
if (!voice_done && j >= voice->buffer_length) {
if (!voice->more_data) {
if (voice->loop) {
audiosample_reset_buffer(voice->sample, false, 0);
} else {
voice->sample = NULL;
voice_done = true;
}
}
if (!voice_done) {
// Load another buffer
audioio_get_buffer_result_t result = audiosample_get_buffer(voice->sample, false, 0, (uint8_t**) &voice->remaining_buffer, &voice->buffer_length);
// Track length in terms of words.
voice->buffer_length /= sizeof(uint32_t);
voice->more_data = result == GET_BUFFER_MORE_DATA;
j = 0;
}
}
// First active voice gets copied over verbatim.
uint32_t sample_value;
if (voice_done) {
// Exit early if another voice already set all samples once.
if (voices_active) {
continue;
}
sample_value = 0;
if (!self->samples_signed) {
if (self->bits_per_sample == 8) {
sample_value = 0x7f7f7f7f;
} else {
sample_value = 0x7fff7fff;
}
}
} else {
sample_value = voice->remaining_buffer[j];
}
// apply the mixer level
if (!self->samples_signed) {
if (self->bits_per_sample == 8) {
sample_value = mult8unsigned(sample_value, voice->level);
} else {
sample_value = mult16unsigned(sample_value, voice->level);
}
} else {
if (self->bits_per_sample == 8) {
sample_value = mult8signed(sample_value, voice->level);
} else {
sample_value = mult16signed(sample_value, voice->level);
}
}
if (!voices_active) {
word_buffer[i] = sample_value;
} else {
if (self->bits_per_sample == 8) {
if (self->samples_signed) {
word_buffer[i] = add8signed(word_buffer[i], sample_value);
} else {
word_buffer[i] = add8unsigned(word_buffer[i], sample_value);
}
} else {
if (self->samples_signed) {
word_buffer[i] = add16signed(word_buffer[i], sample_value);
} else {
word_buffer[i] = add16unsigned(word_buffer[i], sample_value);
}
}
}
j++;
}
voice->buffer_length -= j;
voice->remaining_buffer += j;
voices_active = true; voices_active = true;
} }
if (!self->samples_signed) {
if (self->bits_per_sample == 16) {
for (uint32_t i = 0; i < length; i++) {
word_buffer[i] = tounsigned16(word_buffer[i]);
}
} else {
for (uint32_t i = 0; i < length; i++) {
word_buffer[i] = tounsigned8(word_buffer[i]);
}
}
}
self->read_count += 1; self->read_count += 1;
} else if (!self->use_first_buffer) { } else if (!self->use_first_buffer) {
*buffer = (uint8_t*) self->first_buffer; *buffer = (uint8_t*) self->first_buffer;

View File

@ -34,7 +34,7 @@
void common_hal_audiomixer_mixervoice_construct(audiomixer_mixervoice_obj_t *self) { void common_hal_audiomixer_mixervoice_construct(audiomixer_mixervoice_obj_t *self) {
self->sample = NULL; self->sample = NULL;
self->level = ((1 << 15) - 1); self->level = 1 << 15;
} }
void common_hal_audiomixer_mixervoice_set_parent(audiomixer_mixervoice_obj_t* self, audiomixer_mixer_obj_t *parent) { void common_hal_audiomixer_mixervoice_set_parent(audiomixer_mixervoice_obj_t* self, audiomixer_mixer_obj_t *parent) {
@ -42,11 +42,11 @@ void common_hal_audiomixer_mixervoice_set_parent(audiomixer_mixervoice_obj_t* se
} }
float common_hal_audiomixer_mixervoice_get_level(audiomixer_mixervoice_obj_t* self) { float common_hal_audiomixer_mixervoice_get_level(audiomixer_mixervoice_obj_t* self) {
return ((float) self->level / ((1 << 15) - 1)); return ((float) self->level / (1 << 15));
} }
void common_hal_audiomixer_mixervoice_set_level(audiomixer_mixervoice_obj_t* self, float level) { void common_hal_audiomixer_mixervoice_set_level(audiomixer_mixervoice_obj_t* self, float level) {
self->level = level * ((1 << 15)-1); self->level = level * (1 << 15);
} }
void common_hal_audiomixer_mixervoice_play(audiomixer_mixervoice_obj_t* self, mp_obj_t sample, bool loop) { void common_hal_audiomixer_mixervoice_play(audiomixer_mixervoice_obj_t* self, mp_obj_t sample, bool loop) {

View File

@ -39,7 +39,7 @@ typedef struct {
bool more_data; bool more_data;
uint32_t* remaining_buffer; uint32_t* remaining_buffer;
uint32_t buffer_length; uint32_t buffer_length;
int16_t level; uint16_t level;
} audiomixer_mixervoice_obj_t; } audiomixer_mixervoice_obj_t;