From c284728621fa28d4ef318d98db2b11cc95417e6a Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Thu, 18 Feb 2021 11:33:13 -0600 Subject: [PATCH] bit_transpose: Support from 2 to 7 strands, not just 8 --- locale/circuitpython.pot | 10 ++- shared-bindings/_bit_transpose/__init__.c | 26 +++++--- shared-bindings/_bit_transpose/__init__.h | 2 +- shared-module/_bit_transpose/__init__.c | 78 +++++++++++++++++++++-- 4 files changed, 99 insertions(+), 17 deletions(-) diff --git a/locale/circuitpython.pot b/locale/circuitpython.pot index 11335fd325..eef1b07f29 100644 --- a/locale/circuitpython.pot +++ b/locale/circuitpython.pot @@ -1093,7 +1093,8 @@ msgid "Initialization failed due to lack of memory" msgstr "" #: shared-bindings/_bit_transpose/__init__.c -msgid "Input buffer must be a multiple of 8 bytes" +#, c-format +msgid "Input buffer length (%d) must be a multiple of the strand count (%d)" msgstr "" #: ports/atmel-samd/common-hal/pulseio/PulseIn.c @@ -1664,7 +1665,8 @@ msgid "Out of sockets" msgstr "" #: shared-bindings/_bit_transpose/__init__.c -msgid "Output buffer must be at least as big as input buffer" +#, c-format +msgid "Output buffer must be at least %d bytes" msgstr "" #: shared-bindings/audiobusio/PDMIn.c @@ -3473,6 +3475,10 @@ msgstr "" msgid "not enough arguments for format string" msgstr "" +#: shared-bindings/_bit_transpose/__init__.c +msgid "num_strands must be from 2 to 8 (inclusive)" +msgstr "" + #: extmod/ulab/code/ulab_create.c msgid "number of points must be at least 2" msgstr "" diff --git a/shared-bindings/_bit_transpose/__init__.c b/shared-bindings/_bit_transpose/__init__.c index 38f66eb778..979dff1452 100644 --- a/shared-bindings/_bit_transpose/__init__.c +++ b/shared-bindings/_bit_transpose/__init__.c @@ -43,9 +43,10 @@ //| ... 
//| STATIC mp_obj_t bit_transpose(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { - enum { ARG_input, ARG_output }; + enum { ARG_input, ARG_num_strands, ARG_output }; static const mp_arg_t allowed_args[] = { { MP_QSTR_input, MP_ARG_OBJ | MP_ARG_REQUIRED, {} }, + { MP_QSTR_num_strands, MP_ARG_INT | MP_ARG_KW_ONLY, { .u_int = 8 } }, { MP_QSTR_output, MP_ARG_OBJ | MP_ARG_KW_ONLY, { .u_obj = mp_const_none } }, }; mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; @@ -55,21 +56,28 @@ STATIC mp_obj_t bit_transpose(size_t n_args, const mp_obj_t *pos_args, mp_map_t mp_buffer_info_t output_bufinfo; mp_get_buffer_raise(args[ARG_input].u_obj, &input_bufinfo, MP_BUFFER_READ); - int n = input_bufinfo.len; - if (n % 8 != 0) { - mp_raise_ValueError(translate("Input buffer must be a multiple of 8 bytes")); + int num_strands = args[ARG_num_strands].u_int; + + if (num_strands < 2 || num_strands > 8) { + mp_raise_ValueError(translate("num_strands must be from 2 to 8 (inclusive)")); + } + + int inlen = input_bufinfo.len; + if (inlen % num_strands != 0) { + mp_raise_ValueError_varg(translate("Input buffer length (%d) must be a multiple of the strand count (%d)"), inlen, num_strands); } mp_obj_t output = args[ARG_output].u_obj; + int outlen = 8 * (inlen / num_strands); if (!output || output == mp_const_none) { - output = mp_obj_new_bytearray_of_zeros(n); + output = mp_obj_new_bytearray_of_zeros(outlen); } mp_get_buffer_raise(output, &output_bufinfo, MP_BUFFER_WRITE); - int m = output_bufinfo.len; - if (m < n) { - mp_raise_ValueError(translate("Output buffer must be at least as big as input buffer")); + int avail = output_bufinfo.len; + if (avail < outlen) { + mp_raise_ValueError_varg(translate("Output buffer must be at least %d bytes"), outlen); } - common_hal_bit_transpose_bit_transpose(output_bufinfo.buf, input_bufinfo.buf, input_bufinfo.len); + common_hal_bit_transpose_bit_transpose(output_bufinfo.buf, input_bufinfo.buf, inlen, num_strands); return output; } STATIC 
MP_DEFINE_CONST_FUN_OBJ_KW(bit_transpose_bit_transpose_obj, 1, bit_transpose); diff --git a/shared-bindings/_bit_transpose/__init__.h b/shared-bindings/_bit_transpose/__init__.h index 75d57d980c..5fcf11d45b 100644 --- a/shared-bindings/_bit_transpose/__init__.h +++ b/shared-bindings/_bit_transpose/__init__.h @@ -29,4 +29,4 @@ #include #include -void common_hal_bit_transpose_bit_transpose(uint8_t *result, const uint8_t *src, size_t n); +void common_hal_bit_transpose_bit_transpose(uint8_t *result, const uint8_t *src, size_t inlen, size_t num_strands); diff --git a/shared-module/_bit_transpose/__init__.c b/shared-module/_bit_transpose/__init__.c index e30f989b93..517d2d840e 100644 --- a/shared-module/_bit_transpose/__init__.c +++ b/shared-module/_bit_transpose/__init__.c @@ -30,6 +30,12 @@ #include #include +#ifdef __GNUC__ +#define FALLTHROUGH __attribute__((fallthrough)) +#else +#define FALLTHROUGH ((void)0) /* FALLTHROUGH */ +#endif + // adapted from "Hacker's Delight" - Figure 7-2 Transposing an 8x8-bit matrix // basic idea is: // > First, treat the 8x8-bit matrix as 16 2x2-bit matrices, and transpose each @@ -40,7 +46,57 @@ // > illustrated below. // We want a different definition of bit/byte order, deal with strides differently, etc. // so the code is heavily re-worked compared to the original. 
-static void transpose8(uint32_t *result, const uint8_t *src, int src_stride) {
+static void transpose_var(uint32_t *result, const uint8_t *src, int src_stride, int num_strands) {
+    uint32_t x = 0, y = 0, t; // rows that are never loaded below stay all-zero
+
+    src += (num_strands-1) * src_stride; // start at the last strand and walk backwards
+
+    switch(num_strands) { // handles 7..2 only; each case falls into the next lower one
+        case 7:
+            x |= (uint32_t)*src << 16;
+            src -= src_stride;
+            FALLTHROUGH;
+        case 6:
+            x |= (uint32_t)*src << 8;
+            src -= src_stride;
+            FALLTHROUGH;
+        case 5:
+            x |= *src;
+            src -= src_stride;
+            FALLTHROUGH;
+        case 4:
+            y |= (uint32_t)*src << 24; // widen first: a promoted int shifted into the sign bit is UB
+            src -= src_stride;
+            FALLTHROUGH;
+        case 3:
+            y |= (uint32_t)*src << 16;
+            src -= src_stride;
+            FALLTHROUGH;
+        case 2:
+            y |= (uint32_t)*src << 8;
+            src -= src_stride;
+            y |= *src;
+    }
+
+    t = (x ^ (x >> 7)) & 0x00AA00AA; x = x ^ t ^ (t << 7);
+    t = (y ^ (y >> 7)) & 0x00AA00AA; y = y ^ t ^ (t << 7);
+
+    t = (x ^ (x >>14)) & 0x0000CCCC; x = x ^ t ^ (t <<14);
+    t = (y ^ (y >>14)) & 0x0000CCCC; y = y ^ t ^ (t <<14);
+
+    t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
+    y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
+    x = t;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    x = __builtin_bswap32(x);
+    y = __builtin_bswap32(y);
+#endif
+    result[0] = x;
+    result[1] = y;
+}
+
+static void transpose_8(uint32_t *result, const uint8_t *src, int src_stride) {
     uint32_t x, y, t;
 
     y = *src; src += src_stride;
@@ -70,14 +126,26 @@ static void transpose8(uint32_t *result, const uint8_t *src, int src_stride) {
     result[1] = y;
 }
 
-static void bit_transpose(uint32_t *result, const uint8_t *src, size_t src_stride, size_t n) {
+static void bit_transpose_8(uint32_t *result, const uint8_t *src, size_t src_stride, size_t n) {
     for(size_t i=0; i