From 11ef8f22fe7701cc75b6aaf2386670891eaacf92 Mon Sep 17 00:00:00 2001
From: Jim Mussared
Date: Wed, 18 Aug 2021 14:52:48 +1000
Subject: [PATCH] py/map: Add an optional cache of (map+index) to speed up map lookups.

The existing inline bytecode caching optimisation, selected by
MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE, reserves an extra byte in the
bytecode after certain opcodes, which at runtime stores the map index of
the likely location of this field when looking up the qstr. This scheme
is incompatible with bytecode-in-ROM and doesn't work with native
generated code. It also means that bytecode in .mpy files has a
different format depending on whether the feature is enabled, making
generation of .mpy files more complex.

This commit provides an alternative optimisation: a global cache of map
offsets, which all mp_map_lookup operations then use. It's less precise
than bytecode caching, but it allows the cache to be independent of and
external to the bytecode that is executing. It also works with the
native emitter, providing a similar performance boost on top of the gain
already delivered by the native emitter.

Signed-off-by: Jim Mussared
---
 py/map.c      | 35 +++++++++++++++++++++++++++++++++++
 py/mpconfig.h | 14 ++++++++++++++
 py/mpstate.h  |  5 +++++
 3 files changed, 54 insertions(+)

diff --git a/py/map.c b/py/map.c
index 54f4b0204b..1c43616364 100644
--- a/py/map.c
+++ b/py/map.c
@@ -40,6 +40,27 @@
 #define DEBUG_printf(...) (void)0
 #endif
 
+#if MICROPY_OPT_MAP_LOOKUP_CACHE
+// MP_STATE_VM(map_lookup_cache) provides a cache from index to the last known
+// position of that index in any map. On a cache hit, this allows
+// short-circuiting the full linear search in the case of an ordered map
+// (i.e. all builtin modules and objects' locals dicts), and computation of
+// the hash (and potentially some linear probing) in the case of a regular
+// map. Note the same cache is shared across all maps.
+
+// Gets the index into the cache for this index. Shift down by two to remove
+// mp_obj_t tag bits.
+#define MAP_CACHE_OFFSET(index) ((((uintptr_t)(index)) >> 2) % MICROPY_OPT_MAP_LOOKUP_CACHE_SIZE)
+// Gets the map cache entry for the corresponding index.
+#define MAP_CACHE_ENTRY(index) (MP_STATE_VM(map_lookup_cache)[MAP_CACHE_OFFSET(index)])
+// Retrieve the map element at the location suggested by the cache.
+#define MAP_CACHE_GET(map, index) (&(map)->table[MAP_CACHE_ENTRY(index) % (map)->alloc])
+// Update the cache for this index.
+#define MAP_CACHE_SET(index, pos) MAP_CACHE_ENTRY(index) = (pos) & 0xff;
+#else
+#define MAP_CACHE_SET(index, pos)
+#endif
+
 // This table of sizes is used to control the growth of hash tables.
 // The first set of sizes are chosen so the allocation fits exactly in a
 // 4-word GC block, and it's not so important for these small values to be
@@ -136,6 +157,18 @@ mp_map_elem_t *mp_map_lookup(mp_map_t *map, mp_obj_t index, mp_map_lookup_kind_t
     // If the map is a fixed array then we must only be called for a lookup
     assert(!map->is_fixed || lookup_kind == MP_MAP_LOOKUP);
 
+    #if MICROPY_OPT_MAP_LOOKUP_CACHE
+    // Try the cache for lookup or add-if-not-found.
+    if (lookup_kind != MP_MAP_LOOKUP_REMOVE_IF_FOUND && map->alloc) {
+        mp_map_elem_t *slot = MAP_CACHE_GET(map, index);
+        // Note: Just comparing the key for pointer equality will have false
+        // negatives, but these will be handled by the regular path below.
+        if (slot->key == index) {
+            return slot;
+        }
+    }
+    #endif
+
     // Work out if we can compare just pointers
     bool compare_only_ptrs = map->all_keys_are_qstrs;
     if (compare_only_ptrs) {
@@ -172,6 +205,7 @@ mp_map_elem_t *mp_map_lookup(mp_map_t *map, mp_obj_t index, mp_map_lookup_kind_t
                     elem->value = value;
                 }
                 #endif
+                MAP_CACHE_SET(index, elem - map->table);
                 return elem;
             }
         }
@@ -254,6 +288,7 @@ mp_map_elem_t *mp_map_lookup(mp_map_t *map, mp_obj_t index, mp_map_lookup_kind_t
             }
             // keep slot->value so that caller can access it if needed
         }
+        MAP_CACHE_SET(index, pos);
         return slot;
     }
 
diff --git a/py/mpconfig.h b/py/mpconfig.h
index 2fbf1490d8..97fc9bb580 100644
--- a/py/mpconfig.h
+++ b/py/mpconfig.h
@@ -533,6 +533,20 @@
 #define MICROPY_OPT_LOAD_ATTR_FAST_PATH (0)
 #endif
 
+// Use extra RAM to cache map lookups by remembering the likely location of
+// the index. Avoids the hash computation on unordered maps, and avoids the
+// linear search on ordered (especially in-ROM) maps. Can provide a +10-15%
+// performance improvement on benchmarks involving lots of attribute access
+// or dictionary lookup.
+#ifndef MICROPY_OPT_MAP_LOOKUP_CACHE
+#define MICROPY_OPT_MAP_LOOKUP_CACHE (0)
+#endif
+
+// How much RAM (in bytes) to use for the map lookup cache.
+#ifndef MICROPY_OPT_MAP_LOOKUP_CACHE_SIZE
+#define MICROPY_OPT_MAP_LOOKUP_CACHE_SIZE (128)
+#endif
+
 // Whether to use fast versions of bitwise operations (and, or, xor) when the
 // arguments are both positive. Increases Thumb2 code size by about 250 bytes.
 #ifndef MICROPY_OPT_MPZ_BITWISE
diff --git a/py/mpstate.h b/py/mpstate.h
index 07335bae4c..53b2906872 100644
--- a/py/mpstate.h
+++ b/py/mpstate.h
@@ -231,6 +231,11 @@ typedef struct _mp_state_vm_t {
     // This is a global mutex used to make the VM/runtime thread-safe.
     mp_thread_mutex_t gil_mutex;
     #endif
+
+    #if MICROPY_OPT_MAP_LOOKUP_CACHE
+    // See mp_map_lookup.
+    uint8_t map_lookup_cache[MICROPY_OPT_MAP_LOOKUP_CACHE_SIZE];
+    #endif
 } mp_state_vm_t;
 
 // This structure holds state that is specific to a given thread.
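
Illustrative sketch (not part of the patch): the following standalone C
program shows the general technique the patch implements, i.e. a small
direct-mapped cache shared across all maps, keyed by the (pointer) value of
the looked-up index, where a hit short-circuits the linear search and a miss
or stale entry falls through to the normal path. All names here (demo_*,
DEMO_CACHE_SIZE) are hypothetical and chosen for the example only.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    const char *key; // stand-in for an mp_obj_t key, compared by pointer
    int value;
} demo_elem_t;

typedef struct {
    demo_elem_t *table;
    size_t alloc; // number of slots in table
    size_t used;  // number of slots in use
} demo_map_t;

// One small direct-mapped cache shared across *all* maps, as in the patch.
#define DEMO_CACHE_SIZE 128
static uint8_t demo_cache[DEMO_CACHE_SIZE];

// Map a key pointer to a cache slot. The low bits are shifted out because
// pointers are aligned (the patch drops mp_obj_t tag bits the same way).
#define DEMO_CACHE_OFFSET(key) ((((uintptr_t)(key)) >> 2) % DEMO_CACHE_SIZE)

static demo_elem_t *demo_map_lookup(demo_map_t *map, const char *key) {
    // Fast path: the cache suggests where this key was last found in *some*
    // map. The key is re-checked, so an aliased or stale entry can never
    // return a wrong element; it only falls through to the search below.
    if (map->alloc) {
        demo_elem_t *slot = &map->table[demo_cache[DEMO_CACHE_OFFSET(key)] % map->alloc];
        if (slot->key == key) {
            return slot;
        }
    }
    // Slow path: plain linear search (an "ordered map" in MicroPython terms).
    for (size_t i = 0; i < map->used; i++) {
        if (map->table[i].key == key) {
            // Remember the position. Truncating to 8 bits is safe because
            // the reader always takes the cached value modulo map->alloc.
            demo_cache[DEMO_CACHE_OFFSET(key)] = (uint8_t)i;
            return &map->table[i];
        }
    }
    return NULL;
}

int main(void) {
    const char *k_foo = "foo", *k_bar = "bar";
    demo_elem_t table[4] = {{k_foo, 1}, {k_bar, 2}};
    demo_map_t map = {table, 4, 2};
    demo_map_lookup(&map, k_bar); // first lookup: slow path, warms the cache
    demo_elem_t *e = demo_map_lookup(&map, k_bar); // second lookup: cache hit
    if (e != NULL) {
        printf("%s = %d\n", e->key, e->value);
    }
    return 0;
}

This mirrors the trade-off the commit message describes: uint8_t entries in
one shared table keep the RAM cost fixed at MICROPY_OPT_MAP_LOOKUP_CACHE_SIZE
bytes, and because every hit is verified against the real key, a collision
between maps costs only a fall-through to the regular lookup, never an
incorrect result.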