circuitpython/py/mpstate.h
Jim Mussared b326edf68c all: Remove MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE.
This commit removes all parts of code associated with the existing
MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE optimisation option, including the
-mcache-lookup-bc option to mpy-cross.

This feature originally provided a significant performance boost for Unix,
but wasn't able to be enabled for MCU targets (due to frozen bytecode), and
added significant extra complexity to generating and distributing .mpy
files.

The equivalent performance gain is now provided by the combination of
MICROPY_OPT_LOAD_ATTR_FAST_PATH and MICROPY_OPT_MAP_LOOKUP_CACHE (which has
been enabled on the unix port in the previous commit).

It's hard to provide precise performance numbers, but tests have been run
on a wide variety of architectures (x86-64, ARM Cortex, Aarch64, RISC-V,
xtensa) and they all generally agree on the qualitative improvements seen
by the combination of MICROPY_OPT_LOAD_ATTR_FAST_PATH and
MICROPY_OPT_MAP_LOOKUP_CACHE.

For example, on a "quiet" Linux x64 environment (i3-5010U @ 2.10GHz) the
change from CACHE_MAP_LOOKUP_IN_BYTECODE, to LOAD_ATTR_FAST_PATH combined
with MAP_LOOKUP_CACHE is:

diff of scores (higher is better)
N=2000 M=2000       bccache -> attrmapcache      diff      diff% (error%)
bm_chaos.py        13742.56 ->   13905.67 :   +163.11 =  +1.187% (+/-3.75%)
bm_fannkuch.py        60.13 ->      61.34 :     +1.21 =  +2.012% (+/-2.11%)
bm_fft.py         113083.20 ->  114793.68 :  +1710.48 =  +1.513% (+/-1.57%)
bm_float.py       256552.80 ->  243908.29 : -12644.51 =  -4.929% (+/-1.90%)
bm_hexiom.py         521.93 ->     625.41 :   +103.48 = +19.826% (+/-0.40%)
bm_nqueens.py     197544.25 ->  217713.12 : +20168.87 = +10.210% (+/-3.01%)
bm_pidigits.py      8072.98 ->    8198.75 :   +125.77 =  +1.558% (+/-3.22%)
misc_aes.py        17283.45 ->   16480.52 :   -802.93 =  -4.646% (+/-0.82%)
misc_mandel.py     99083.99 ->  128939.84 : +29855.85 = +30.132% (+/-5.88%)
misc_pystone.py    83860.10 ->   82592.56 :  -1267.54 =  -1.511% (+/-2.27%)
misc_raytrace.py   21490.40 ->   22227.23 :   +736.83 =  +3.429% (+/-1.88%)

This shows that the new optimisations are at least as good as the existing
inline-bytecode-caching, and are sometimes much better (because the new
ones apply caching to a wider variety of map lookups).

The new optimisations can also benefit code generated by the native
emitter, because they apply to the runtime rather than the generated code.
The improvement for the native emitter when LOAD_ATTR_FAST_PATH and
MAP_LOOKUP_CACHE are enabled is (same Linux environment as above):

diff of scores (higher is better)
N=2000 M=2000        native -> nat-attrmapcache  diff      diff% (error%)
bm_chaos.py        14130.62 ->   15464.68 :  +1334.06 =  +9.441% (+/-7.11%)
bm_fannkuch.py        74.96 ->      76.16 :     +1.20 =  +1.601% (+/-1.80%)
bm_fft.py         166682.99 ->  168221.86 :  +1538.87 =  +0.923% (+/-4.20%)
bm_float.py       233415.23 ->  265524.90 : +32109.67 = +13.756% (+/-2.57%)
bm_hexiom.py         628.59 ->     734.17 :   +105.58 = +16.796% (+/-1.39%)
bm_nqueens.py     225418.44 ->  232926.45 :  +7508.01 =  +3.331% (+/-3.10%)
bm_pidigits.py      6322.00 ->    6379.52 :    +57.52 =  +0.910% (+/-5.62%)
misc_aes.py        20670.10 ->   27223.18 :  +6553.08 = +31.703% (+/-1.56%)
misc_mandel.py    138221.11 ->  152014.01 : +13792.90 =  +9.979% (+/-2.46%)
misc_pystone.py    85032.14 ->  105681.44 : +20649.30 = +24.284% (+/-2.25%)
misc_raytrace.py   19800.01 ->   23350.73 :  +3550.72 = +17.933% (+/-2.79%)

In summary, compared to MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE, the new
MICROPY_OPT_LOAD_ATTR_FAST_PATH and MICROPY_OPT_MAP_LOOKUP_CACHE options:
- are simpler;
- take less code size;
- are faster (generally);
- work with code generated by the native emitter;
- can be used on embedded targets with a small and constant RAM overhead;
- allow the same .mpy bytecode to run on all targets.

See #7680 for further discussion.  And see also #7653 for a discussion
about simplifying mpy-cross options.

Signed-off-by: Jim Mussared <jim.mussared@gmail.com>
2021-09-16 16:04:03 +10:00

305 lines
9.2 KiB
C

/*
* This file is part of the MicroPython project, http://micropython.org/
*
* The MIT License (MIT)
*
* Copyright (c) 2014 Damien P. George
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MICROPY_INCLUDED_PY_MPSTATE_H
#define MICROPY_INCLUDED_PY_MPSTATE_H
#include <stdint.h>
#include "py/mpconfig.h"
#include "py/mpthread.h"
#include "py/misc.h"
#include "py/nlr.h"
#include "py/obj.h"
#include "py/objlist.h"
#include "py/objexcept.h"
// This file contains structures defining the state of the MicroPython
// memory system, runtime and virtual machine. The state is a global
// variable, but in the future it is hoped that the state can become local.
// This structure contains dynamic configuration for the compiler.
#if MICROPY_DYNAMIC_COMPILER
typedef struct mp_dynamic_compiler_t {
uint8_t small_int_bits; // must be <= host small_int_bits
bool py_builtins_str_unicode;
uint8_t native_arch;
uint8_t nlr_buf_num_regs;
} mp_dynamic_compiler_t;
extern mp_dynamic_compiler_t mp_dynamic_compiler;
#endif
// These are the values for sched_state
#define MP_SCHED_IDLE (1)
#define MP_SCHED_LOCKED (-1)
#define MP_SCHED_PENDING (0) // 0 so it's a quick check in the VM
typedef struct _mp_sched_item_t {
mp_obj_t func;
mp_obj_t arg;
} mp_sched_item_t;
// This structure hold information about the memory allocation system.
typedef struct _mp_state_mem_t {
#if MICROPY_MEM_STATS
size_t total_bytes_allocated;
size_t current_bytes_allocated;
size_t peak_bytes_allocated;
#endif
byte *gc_alloc_table_start;
size_t gc_alloc_table_byte_len;
#if MICROPY_ENABLE_FINALISER
byte *gc_finaliser_table_start;
#endif
byte *gc_pool_start;
byte *gc_pool_end;
int gc_stack_overflow;
MICROPY_GC_STACK_ENTRY_TYPE gc_stack[MICROPY_ALLOC_GC_STACK_SIZE];
// This variable controls auto garbage collection. If set to 0 then the
// GC won't automatically run when gc_alloc can't find enough blocks. But
// you can still allocate/free memory and also explicitly call gc_collect.
uint16_t gc_auto_collect_enabled;
#if MICROPY_GC_ALLOC_THRESHOLD
size_t gc_alloc_amount;
size_t gc_alloc_threshold;
#endif
size_t gc_last_free_atb_index;
#if MICROPY_PY_GC_COLLECT_RETVAL
size_t gc_collected;
#endif
#if MICROPY_PY_THREAD && !MICROPY_PY_THREAD_GIL
// This is a global mutex used to make the GC thread-safe.
mp_thread_mutex_t gc_mutex;
#endif
} mp_state_mem_t;
// This structure hold runtime and VM information. It includes a section
// which contains root pointers that must be scanned by the GC.
typedef struct _mp_state_vm_t {
//
// CONTINUE ROOT POINTER SECTION
// This must start at the start of this structure and follows
// the state in the mp_state_thread_t structure, continuing
// the root pointer section from there.
//
qstr_pool_t *last_pool;
// non-heap memory for creating an exception if we can't allocate RAM
mp_obj_exception_t mp_emergency_exception_obj;
// memory for exception arguments if we can't allocate RAM
#if MICROPY_ENABLE_EMERGENCY_EXCEPTION_BUF
#if MICROPY_EMERGENCY_EXCEPTION_BUF_SIZE > 0
// statically allocated buf (needs to be aligned to mp_obj_t)
mp_obj_t mp_emergency_exception_buf[MICROPY_EMERGENCY_EXCEPTION_BUF_SIZE / sizeof(mp_obj_t)];
#else
// dynamically allocated buf
byte *mp_emergency_exception_buf;
#endif
#endif
#if MICROPY_KBD_EXCEPTION
// exception object of type KeyboardInterrupt
mp_obj_exception_t mp_kbd_exception;
#endif
// dictionary with loaded modules (may be exposed as sys.modules)
mp_obj_dict_t mp_loaded_modules_dict;
#if MICROPY_ENABLE_SCHEDULER
mp_sched_item_t sched_queue[MICROPY_SCHEDULER_DEPTH];
#endif
// current exception being handled, for sys.exc_info()
#if MICROPY_PY_SYS_EXC_INFO
mp_obj_base_t *cur_exception;
#endif
#if MICROPY_PY_SYS_ATEXIT
// exposed through sys.atexit function
mp_obj_t sys_exitfunc;
#endif
// dictionary for the __main__ module
mp_obj_dict_t dict_main;
// these two lists must be initialised per port, after the call to mp_init
mp_obj_list_t mp_sys_path_obj;
mp_obj_list_t mp_sys_argv_obj;
// dictionary for overridden builtins
#if MICROPY_CAN_OVERRIDE_BUILTINS
mp_obj_dict_t *mp_module_builtins_override_dict;
#endif
#if MICROPY_PERSISTENT_CODE_TRACK_RELOC_CODE
// An mp_obj_list_t that tracks relocated native code to prevent the GC from reclaiming them.
mp_obj_t track_reloc_code_list;
#endif
// include any root pointers defined by a port
MICROPY_PORT_ROOT_POINTERS
// root pointers for extmod
#if MICROPY_REPL_EVENT_DRIVEN
vstr_t *repl_line;
#endif
#if MICROPY_PY_OS_DUPTERM
mp_obj_t dupterm_objs[MICROPY_PY_OS_DUPTERM];
#endif
#if MICROPY_PY_LWIP_SLIP
mp_obj_t lwip_slip_stream;
#endif
#if MICROPY_VFS
struct _mp_vfs_mount_t *vfs_cur;
struct _mp_vfs_mount_t *vfs_mount_table;
#endif
#if MICROPY_PY_BLUETOOTH
mp_obj_t bluetooth;
#endif
//
// END ROOT POINTER SECTION
////////////////////////////////////////////////////////////
// pointer and sizes to store interned string data
// (qstr_last_chunk can be root pointer but is also stored in qstr pool)
byte *qstr_last_chunk;
size_t qstr_last_alloc;
size_t qstr_last_used;
#if MICROPY_PY_THREAD && !MICROPY_PY_THREAD_GIL
// This is a global mutex used to make qstr interning thread-safe.
mp_thread_mutex_t qstr_mutex;
#endif
#if MICROPY_ENABLE_COMPILER
mp_uint_t mp_optimise_value;
#if MICROPY_EMIT_NATIVE
uint8_t default_emit_opt; // one of MP_EMIT_OPT_xxx
#endif
#endif
// size of the emergency exception buf, if it's dynamically allocated
#if MICROPY_ENABLE_EMERGENCY_EXCEPTION_BUF && MICROPY_EMERGENCY_EXCEPTION_BUF_SIZE == 0
mp_int_t mp_emergency_exception_buf_size;
#endif
#if MICROPY_ENABLE_SCHEDULER
volatile int16_t sched_state;
uint8_t sched_len;
uint8_t sched_idx;
#endif
#if MICROPY_PY_THREAD_GIL
// This is a global mutex used to make the VM/runtime thread-safe.
mp_thread_mutex_t gil_mutex;
#endif
#if MICROPY_OPT_MAP_LOOKUP_CACHE
// See mp_map_lookup.
uint8_t map_lookup_cache[MICROPY_OPT_MAP_LOOKUP_CACHE_SIZE];
#endif
} mp_state_vm_t;
// This structure holds state that is specific to a given thread.
// Everything in this structure is scanned for root pointers.
typedef struct _mp_state_thread_t {
// Stack top at the start of program
char *stack_top;
#if MICROPY_STACK_CHECK
size_t stack_limit;
#endif
#if MICROPY_ENABLE_PYSTACK
uint8_t *pystack_start;
uint8_t *pystack_end;
uint8_t *pystack_cur;
#endif
// Locking of the GC is done per thread.
uint16_t gc_lock_depth;
////////////////////////////////////////////////////////////
// START ROOT POINTER SECTION
// Everything that needs GC scanning must start here, and
// is followed by state in the mp_state_vm_t structure.
//
mp_obj_dict_t *dict_locals;
mp_obj_dict_t *dict_globals;
nlr_buf_t *nlr_top;
// pending exception object (MP_OBJ_NULL if not pending)
volatile mp_obj_t mp_pending_exception;
// If MP_OBJ_STOP_ITERATION is propagated then this holds its argument.
mp_obj_t stop_iteration_arg;
#if MICROPY_PY_SYS_SETTRACE
mp_obj_t prof_trace_callback;
bool prof_callback_is_executing;
struct _mp_code_state_t *current_code_state;
#endif
} mp_state_thread_t;
// This structure combines the above 3 structures.
// The order of the entries are important for root pointer scanning in the GC to work.
typedef struct _mp_state_ctx_t {
mp_state_thread_t thread;
mp_state_vm_t vm;
mp_state_mem_t mem;
} mp_state_ctx_t;
extern mp_state_ctx_t mp_state_ctx;
#define MP_STATE_VM(x) (mp_state_ctx.vm.x)
#define MP_STATE_MEM(x) (mp_state_ctx.mem.x)
#define MP_STATE_MAIN_THREAD(x) (mp_state_ctx.thread.x)
#if MICROPY_PY_THREAD
extern mp_state_thread_t *mp_thread_get_state(void);
#define MP_STATE_THREAD(x) (mp_thread_get_state()->x)
#else
#define MP_STATE_THREAD(x) MP_STATE_MAIN_THREAD(x)
#endif
#endif // MICROPY_INCLUDED_PY_MPSTATE_H