From c284728621fa28d4ef318d98db2b11cc95417e6a Mon Sep 17 00:00:00 2001
From: Jeff Epler <jepler@gmail.com>
Date: Thu, 18 Feb 2021 11:33:13 -0600
Subject: [PATCH] bit_transpose: Support from 2 to 7 strands, not just 8

---
 locale/circuitpython.pot                  | 10 ++-
 shared-bindings/_bit_transpose/__init__.c | 26 +++++---
 shared-bindings/_bit_transpose/__init__.h |  2 +-
 shared-module/_bit_transpose/__init__.c   | 78 +++++++++++++++++++++--
 4 files changed, 99 insertions(+), 17 deletions(-)

diff --git a/locale/circuitpython.pot b/locale/circuitpython.pot
index 11335fd325..eef1b07f29 100644
--- a/locale/circuitpython.pot
+++ b/locale/circuitpython.pot
@@ -1093,7 +1093,8 @@ msgid "Initialization failed due to lack of memory"
 msgstr ""
 
 #: shared-bindings/_bit_transpose/__init__.c
-msgid "Input buffer must be a multiple of 8 bytes"
+#, c-format
+msgid "Input buffer length (%d) must be a multiple of the strand count (%d)"
 msgstr ""
 
 #: ports/atmel-samd/common-hal/pulseio/PulseIn.c
@@ -1664,7 +1665,8 @@ msgid "Out of sockets"
 msgstr ""
 
 #: shared-bindings/_bit_transpose/__init__.c
-msgid "Output buffer must be at least as big as input buffer"
+#, c-format
+msgid "Output buffer must be at least %d bytes"
 msgstr ""
 
 #: shared-bindings/audiobusio/PDMIn.c
@@ -3473,6 +3475,10 @@ msgstr ""
 msgid "not enough arguments for format string"
 msgstr ""
 
+#: shared-bindings/_bit_transpose/__init__.c
+msgid "num_strands must be from 2 to 8 (inclusive)"
+msgstr ""
+
 #: extmod/ulab/code/ulab_create.c
 msgid "number of points must be at least 2"
 msgstr ""
diff --git a/shared-bindings/_bit_transpose/__init__.c b/shared-bindings/_bit_transpose/__init__.c
index 38f66eb778..979dff1452 100644
--- a/shared-bindings/_bit_transpose/__init__.c
+++ b/shared-bindings/_bit_transpose/__init__.c
@@ -43,9 +43,10 @@
 //|     ...
 //|
 STATIC mp_obj_t bit_transpose(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
-    enum { ARG_input, ARG_output };
+    enum { ARG_input, ARG_num_strands, ARG_output };
     static const mp_arg_t allowed_args[] = {
         { MP_QSTR_input, MP_ARG_OBJ | MP_ARG_REQUIRED, {} },
+        { MP_QSTR_num_strands, MP_ARG_INT | MP_ARG_KW_ONLY, { .u_int = 8 } },
         { MP_QSTR_output, MP_ARG_OBJ | MP_ARG_KW_ONLY, { .u_obj = mp_const_none } },
     };
     mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
@@ -55,21 +56,28 @@ STATIC mp_obj_t bit_transpose(size_t n_args, const mp_obj_t *pos_args, mp_map_t
     mp_buffer_info_t output_bufinfo;
 
     mp_get_buffer_raise(args[ARG_input].u_obj, &input_bufinfo, MP_BUFFER_READ);
-    int n = input_bufinfo.len;
-    if (n % 8 != 0) {
-        mp_raise_ValueError(translate("Input buffer must be a multiple of 8 bytes"));
+    int num_strands = args[ARG_num_strands].u_int;
+
+    if (num_strands < 2 || num_strands > 8) {
+        mp_raise_ValueError(translate("num_strands must be from 2 to 8 (inclusive)"));
+    }
+
+    int inlen = input_bufinfo.len;
+    if (inlen % num_strands != 0) {
+        mp_raise_ValueError_varg(translate("Input buffer length (%d) must be a multiple of the strand count (%d)"), inlen, num_strands);
     }
     mp_obj_t output = args[ARG_output].u_obj;
 
+    int outlen = 8 * (inlen / num_strands);
     if (!output || output == mp_const_none) {
-        output = mp_obj_new_bytearray_of_zeros(n);
+        output = mp_obj_new_bytearray_of_zeros(outlen);
     }
     mp_get_buffer_raise(output, &output_bufinfo, MP_BUFFER_WRITE);
-    int m = output_bufinfo.len;
-    if (m < n) {
-        mp_raise_ValueError(translate("Output buffer must be at least as big as input buffer"));
+    int avail = output_bufinfo.len;
+    if (avail < outlen) {
+        mp_raise_ValueError_varg(translate("Output buffer must be at least %d bytes"), outlen);
     }
-    common_hal_bit_transpose_bit_transpose(output_bufinfo.buf, input_bufinfo.buf, input_bufinfo.len);
+    common_hal_bit_transpose_bit_transpose(output_bufinfo.buf, input_bufinfo.buf, inlen, num_strands);
     return output;
 }
 STATIC MP_DEFINE_CONST_FUN_OBJ_KW(bit_transpose_bit_transpose_obj, 1, bit_transpose);
diff --git a/shared-bindings/_bit_transpose/__init__.h b/shared-bindings/_bit_transpose/__init__.h
index 75d57d980c..5fcf11d45b 100644
--- a/shared-bindings/_bit_transpose/__init__.h
+++ b/shared-bindings/_bit_transpose/__init__.h
@@ -29,4 +29,4 @@
 #include <stdint.h>
 #include <stdlib.h>
 
-void common_hal_bit_transpose_bit_transpose(uint8_t *result, const uint8_t *src, size_t n);
+void common_hal_bit_transpose_bit_transpose(uint8_t *result, const uint8_t *src, size_t inlen, size_t num_strands);
diff --git a/shared-module/_bit_transpose/__init__.c b/shared-module/_bit_transpose/__init__.c
index e30f989b93..517d2d840e 100644
--- a/shared-module/_bit_transpose/__init__.c
+++ b/shared-module/_bit_transpose/__init__.c
@@ -30,6 +30,12 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __GNUC__ // compilers that define __GNUC__: mark the fall-through with a statement attribute
+#define FALLTHROUGH __attribute__((fallthrough))
+#else // otherwise expand to a no-op expression; the trailing comment is the human-readable marker
+#define FALLTHROUGH ((void)0) /* FALLTHROUGH */
+#endif
+
 // adapted from "Hacker's Delight" - Figure 7-2 Transposing an 8x8-bit matrix
 // basic idea is:
 // > First, treat the 8x8-bit matrix as 16 2x2-bit matrices, and transpose each
@@ -40,7 +46,57 @@
 // > illustrated below.
 // We want a different definition of bit/byte order, deal with strides differently, etc.
 // so the code is heavily re-worked compared to the original.
-static void transpose8(uint32_t *result, const uint8_t *src, int src_stride) {
+// Variant for 2..7 strands: strand k (0-based) is placed in byte k of the
+// y:x pair (y holds strands 0-3, x holds strands 4-6); absent strands stay 0.
+static void transpose_var(uint32_t *result, const uint8_t *src, int src_stride, int num_strands) {
+    uint32_t x = 0, y = 0, t;
+
+    // Start at the last strand present and walk back toward strand 0.
+    src += (num_strands-1) * src_stride;
+    // Cast before shifting: a promoted int shifted left by 24 can overflow (UB).
+    switch(num_strands) {
+    case 7:
+        x |= (uint32_t)*src << 16; src -= src_stride;
+        FALLTHROUGH;
+    case 6:
+        x |= (uint32_t)*src << 8; src -= src_stride;
+        FALLTHROUGH;
+    case 5:
+        x |= (uint32_t)*src; src -= src_stride;
+        FALLTHROUGH;
+    case 4:
+        y |= (uint32_t)*src << 24; src -= src_stride;
+        FALLTHROUGH;
+    case 3:
+        y |= (uint32_t)*src << 16; src -= src_stride;
+        FALLTHROUGH;
+    case 2:
+        y |= (uint32_t)*src << 8; src -= src_stride;
+        y |= (uint32_t)*src;
+        break;
+    default:
+        break; // any other count loads nothing, so x and y stay zero
+    }
+    // Step 1 (Hacker's Delight scheme): transpose each 2x2 bit block.
+    t = (x ^ (x >> 7)) & 0x00AA00AA;  x = x ^ t ^ (t << 7);
+    t = (y ^ (y >> 7)) & 0x00AA00AA;  y = y ^ t ^ (t << 7);
+    // Step 2: transpose the 2x2 blocks inside each 4x4 block.
+    t = (x ^ (x >>14)) & 0x0000CCCC;  x = x ^ t ^ (t <<14);
+    t = (y ^ (y >>14)) & 0x0000CCCC;  y = y ^ t ^ (t <<14);
+    // Step 3: exchange nibbles between x and y (the 4x4 block swap).
+    t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
+    y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
+    x = t;
+    // Byte-swap on little-endian targets so each stored word has big-endian byte order.
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    x = __builtin_bswap32(x);
+    y = __builtin_bswap32(y);
+#endif
+    result[0] = x;
+    result[1] = y;
+}
+
+static void transpose_8(uint32_t *result, const uint8_t *src, int src_stride) {
     uint32_t x, y, t;
 
     y = *src; src += src_stride;
@@ -70,14 +126,26 @@ static void transpose8(uint32_t *result, const uint8_t *src, int src_stride) {
     result[1] = y;
 }
 
-static void bit_transpose(uint32_t *result, const uint8_t *src, size_t src_stride, size_t n) {
+static void bit_transpose_8(uint32_t *result, const uint8_t *src, size_t src_stride, size_t n) { // used when num_strands == 8
     for(size_t i=0; i<n; i++) {
-        transpose8(result, src, src_stride);
+        transpose_8(result, src, src_stride); // one source byte column -> two output words
         result += 2;
         src += 1;
     }
 }
 
-void common_hal_bit_transpose_bit_transpose(uint8_t *result, const uint8_t *src, size_t n) {
-    bit_transpose((uint32_t*)(void*)result, src, n/8, n/8);
+// Transpose n byte-columns: column i is read starting at src+i (strands are
+// src_stride bytes apart) and written to the word pair at result+2*i.
+static void bit_transpose_var(uint32_t *result, const uint8_t *src, size_t src_stride, size_t n, int num_strands) {
+    for(const uint8_t *end = src + n; src != end; src++, result += 2) {
+        transpose_var(result, src, src_stride, num_strands);
+    }
+}
+
+// Dispatch on strand count: 8 uses bit_transpose_8, anything else bit_transpose_var.
+void common_hal_bit_transpose_bit_transpose(uint8_t *result, const uint8_t *src, size_t inlen, size_t num_strands) {
+    uint32_t *words = (uint32_t*)(void*)result;
+    size_t columns = inlen / num_strands; // bytes per strand; doubles as the stride
+    if(num_strands == 8) { bit_transpose_8(words, src, columns, columns); return; }
+    bit_transpose_var(words, src, columns, columns, num_strands);
 }