circuitpython/py/emit.h

323 lines
14 KiB
C
Raw Normal View History

/*
* This file is part of the MicroPython project, http://micropython.org/
*
* The MIT License (MIT)
*
* Copyright (c) 2013, 2014 Damien P. George
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MICROPY_INCLUDED_PY_EMIT_H
#define MICROPY_INCLUDED_PY_EMIT_H
#include "py/lexer.h"
#include "py/scope.h"
2013-10-04 14:53:11 -04:00
/* Notes on passes:
* We don't know exactly the opcodes in pass 1 because they depend on the
* closing over of variables (LOAD_CLOSURE, BUILD_TUPLE, MAKE_CLOSURE), which
* depends on determining the scope of variables in each function, and this
* is not known until the end of pass 1.
* As a consequence, we don't know the maximum stack size until the end of pass 2.
* This is problematic for some emitters (x64) since they need to know the maximum
2014-01-06 10:49:21 -05:00
* stack size to compile the entry to the function, and this affects code size.
2013-10-04 14:53:11 -04:00
*/
typedef enum {
MP_PASS_SCOPE = 1, // work out id's and their kind, and number of labels
MP_PASS_STACK_SIZE = 2, // work out maximum stack size
MP_PASS_CODE_SIZE = 3, // work out code size and label offsets
MP_PASS_EMIT = 4, // emit code
2013-10-04 14:53:11 -04:00
} pass_kind_t;
#define MP_EMIT_STAR_FLAG_SINGLE (0x01)
#define MP_EMIT_STAR_FLAG_DOUBLE (0x02)
#define MP_EMIT_BREAK_FROM_FOR (0x8000)
// Kind for emit_id_ops->local()
#define MP_EMIT_IDOP_LOCAL_FAST (0)
#define MP_EMIT_IDOP_LOCAL_DEREF (1)
// Kind for emit_id_ops->global()
#define MP_EMIT_IDOP_GLOBAL_NAME (0)
#define MP_EMIT_IDOP_GLOBAL_GLOBAL (1)
// Kind for emit->import()
#define MP_EMIT_IMPORT_NAME (0)
#define MP_EMIT_IMPORT_FROM (1)
#define MP_EMIT_IMPORT_STAR (2)
// Kind for emit->subscr()
#define MP_EMIT_SUBSCR_LOAD (0)
#define MP_EMIT_SUBSCR_STORE (1)
#define MP_EMIT_SUBSCR_DELETE (2)
// Kind for emit->attr()
#define MP_EMIT_ATTR_LOAD (0)
#define MP_EMIT_ATTR_STORE (1)
#define MP_EMIT_ATTR_DELETE (2)
// Kind for emit->setup_block()
#define MP_EMIT_SETUP_BLOCK_WITH (0)
#define MP_EMIT_SETUP_BLOCK_EXCEPT (1)
#define MP_EMIT_SETUP_BLOCK_FINALLY (2)
// Kind for emit->build()
#define MP_EMIT_BUILD_TUPLE (0)
#define MP_EMIT_BUILD_LIST (1)
#define MP_EMIT_BUILD_MAP (2)
#define MP_EMIT_BUILD_SET (3)
#define MP_EMIT_BUILD_SLICE (4)
// Kind for emit->yield()
#define MP_EMIT_YIELD_VALUE (0)
#define MP_EMIT_YIELD_FROM (1)
typedef struct _emit_t emit_t;
py: Rework bytecode and .mpy file format to be mostly static data. Background: .mpy files are precompiled .py files, built using mpy-cross, that contain compiled bytecode functions (and can also contain machine code). The benefit of using an .mpy file over a .py file is that they are faster to import and take less memory when importing. They are also smaller on disk. But the real benefit of .mpy files comes when they are frozen into the firmware. This is done by loading the .mpy file during compilation of the firmware and turning it into a set of big C data structures (the job of mpy-tool.py), which are then compiled and downloaded into the ROM of a device. These C data structures can be executed in-place, ie directly from ROM. This makes importing even faster because there is very little to do, and also means such frozen modules take up much less RAM (because their bytecode stays in ROM). The downside of frozen code is that it requires recompiling and reflashing the entire firmware. This can be a big barrier to entry, slows down development time, and makes it harder to do OTA updates of frozen code (because the whole firmware must be updated). This commit attempts to solve this problem by providing a solution that sits between loading .mpy files into RAM and freezing them into the firmware. The .mpy file format has been reworked so that it consists of data and bytecode which is mostly static and ready to run in-place. If these new .mpy files are located in flash/ROM which is memory addressable, the .mpy file can be executed (mostly) in-place. With this approach there is still a small amount of unpacking and linking of the .mpy file that needs to be done when it's imported, but it's still much better than loading an .mpy from disk into RAM (although not as good as freezing .mpy files into the firmware). The main trick to make static .mpy files is to adjust the bytecode so any qstrs that it references now go through a lookup table to convert from local qstr number in the module to global qstr number in the firmware. That means the bytecode does not need linking/rewriting of qstrs when it's loaded. Instead only a small qstr table needs to be built (and put in RAM) at import time. This means the bytecode itself is static/constant and can be used directly if it's in addressable memory. Also the qstr string data in the .mpy file, and some constant object data, can be used directly. Note that the qstr table is global to the module (ie not per function). In more detail, in the VM what used to be (schematically): qst = DECODE_QSTR_VALUE; is now (schematically): idx = DECODE_QSTR_INDEX; qst = qstr_table[idx]; That allows the bytecode to be fixed at compile time and not need relinking/rewriting of the qstr values. Only qstr_table needs to be linked when the .mpy is loaded. Incidentally, this helps to reduce the size of bytecode because what used to be 2-byte qstr values in the bytecode are now (mostly) 1-byte indices. If the module uses the same qstr more than two times then the bytecode is smaller than before. The following changes are measured for this commit compared to the previous (the baseline): - average 7%-9% reduction in size of .mpy files - frozen code size is reduced by about 5%-7% - importing .py files uses about 5% less RAM in total - importing .mpy files uses about 4% less RAM in total - importing .py and .mpy files takes about the same time as before The qstr indirection in the bytecode has only a small impact on VM performance. For stm32 on PYBv1.0 the performance change of this commit is: diff of scores (higher is better) N=100 M=100 baseline -> this-commit diff diff% (error%) bm_chaos.py 371.07 -> 357.39 : -13.68 = -3.687% (+/-0.02%) bm_fannkuch.py 78.72 -> 77.49 : -1.23 = -1.563% (+/-0.01%) bm_fft.py 2591.73 -> 2539.28 : -52.45 = -2.024% (+/-0.00%) bm_float.py 6034.93 -> 5908.30 : -126.63 = -2.098% (+/-0.01%) bm_hexiom.py 48.96 -> 47.93 : -1.03 = -2.104% (+/-0.00%) bm_nqueens.py 4510.63 -> 4459.94 : -50.69 = -1.124% (+/-0.00%) bm_pidigits.py 650.28 -> 644.96 : -5.32 = -0.818% (+/-0.23%) core_import_mpy_multi.py 564.77 -> 581.49 : +16.72 = +2.960% (+/-0.01%) core_import_mpy_single.py 68.67 -> 67.16 : -1.51 = -2.199% (+/-0.01%) core_qstr.py 64.16 -> 64.12 : -0.04 = -0.062% (+/-0.00%) core_yield_from.py 362.58 -> 354.50 : -8.08 = -2.228% (+/-0.00%) misc_aes.py 429.69 -> 405.59 : -24.10 = -5.609% (+/-0.01%) misc_mandel.py 3485.13 -> 3416.51 : -68.62 = -1.969% (+/-0.00%) misc_pystone.py 2496.53 -> 2405.56 : -90.97 = -3.644% (+/-0.01%) misc_raytrace.py 381.47 -> 374.01 : -7.46 = -1.956% (+/-0.01%) viper_call0.py 576.73 -> 572.49 : -4.24 = -0.735% (+/-0.04%) viper_call1a.py 550.37 -> 546.21 : -4.16 = -0.756% (+/-0.09%) viper_call1b.py 438.23 -> 435.68 : -2.55 = -0.582% (+/-0.06%) viper_call1c.py 442.84 -> 440.04 : -2.80 = -0.632% (+/-0.08%) viper_call2a.py 536.31 -> 532.35 : -3.96 = -0.738% (+/-0.06%) viper_call2b.py 382.34 -> 377.07 : -5.27 = -1.378% (+/-0.03%) And for unix on x64: diff of scores (higher is better) N=2000 M=2000 baseline -> this-commit diff diff% (error%) bm_chaos.py 13594.20 -> 13073.84 : -520.36 = -3.828% (+/-5.44%) bm_fannkuch.py 60.63 -> 59.58 : -1.05 = -1.732% (+/-3.01%) bm_fft.py 112009.15 -> 111603.32 : -405.83 = -0.362% (+/-4.03%) bm_float.py 246202.55 -> 247923.81 : +1721.26 = +0.699% (+/-2.79%) bm_hexiom.py 615.65 -> 617.21 : +1.56 = +0.253% (+/-1.64%) bm_nqueens.py 215807.95 -> 215600.96 : -206.99 = -0.096% (+/-3.52%) bm_pidigits.py 8246.74 -> 8422.82 : +176.08 = +2.135% (+/-3.64%) misc_aes.py 16133.00 -> 16452.74 : +319.74 = +1.982% (+/-1.50%) misc_mandel.py 128146.69 -> 130796.43 : +2649.74 = +2.068% (+/-3.18%) misc_pystone.py 83811.49 -> 83124.85 : -686.64 = -0.819% (+/-1.03%) misc_raytrace.py 21688.02 -> 21385.10 : -302.92 = -1.397% (+/-3.20%) The code size change is (firmware with a lot of frozen code benefits the most): bare-arm: +396 +0.697% minimal x86: +1595 +0.979% [incl +32(data)] unix x64: +2408 +0.470% [incl +800(data)] unix nanbox: +1396 +0.309% [incl -96(data)] stm32: -1256 -0.318% PYBV10 cc3200: +288 +0.157% esp8266: -260 -0.037% GENERIC esp32: -216 -0.014% GENERIC[incl -1072(data)] nrf: +116 +0.067% pca10040 rp2: -664 -0.135% PICO samd: +844 +0.607% ADAFRUIT_ITSYBITSY_M4_EXPRESS As part of this change the .mpy file format version is bumped to version 6. And mpy-tool.py has been improved to provide a good visualisation of the contents of .mpy files. In summary: this commit changes the bytecode to use qstr indirection, and reworks the .mpy file format to be simpler and allow .mpy files to be executed in-place. Performance is not impacted too much. Eventually it will be possible to store such .mpy files in a linear, read-only, memory- mappable filesystem so they can be executed from flash/ROM. This will essentially be able to replace frozen code for most applications. Signed-off-by: Damien George <damien@micropython.org>
2021-10-22 07:22:47 -04:00
typedef struct _mp_emit_common_t {
pass_kind_t pass;
uint16_t ct_cur_obj_base;
uint16_t ct_cur_obj;
uint16_t ct_cur_child;
mp_uint_t *const_table;
mp_raw_code_t **children;
#if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
mp_map_t qstr_map;
#endif
} mp_emit_common_t;
typedef struct _mp_emit_method_table_id_ops_t {
void (*local)(emit_t *emit, qstr qst, mp_uint_t local_num, int kind);
void (*global)(emit_t *emit, qstr qst, int kind);
} mp_emit_method_table_id_ops_t;
typedef struct _emit_method_table_t {
#if MICROPY_DYNAMIC_COMPILER
py: Rework bytecode and .mpy file format to be mostly static data. Background: .mpy files are precompiled .py files, built using mpy-cross, that contain compiled bytecode functions (and can also contain machine code). The benefit of using an .mpy file over a .py file is that they are faster to import and take less memory when importing. They are also smaller on disk. But the real benefit of .mpy files comes when they are frozen into the firmware. This is done by loading the .mpy file during compilation of the firmware and turning it into a set of big C data structures (the job of mpy-tool.py), which are then compiled and downloaded into the ROM of a device. These C data structures can be executed in-place, ie directly from ROM. This makes importing even faster because there is very little to do, and also means such frozen modules take up much less RAM (because their bytecode stays in ROM). The downside of frozen code is that it requires recompiling and reflashing the entire firmware. This can be a big barrier to entry, slows down development time, and makes it harder to do OTA updates of frozen code (because the whole firmware must be updated). This commit attempts to solve this problem by providing a solution that sits between loading .mpy files into RAM and freezing them into the firmware. The .mpy file format has been reworked so that it consists of data and bytecode which is mostly static and ready to run in-place. If these new .mpy files are located in flash/ROM which is memory addressable, the .mpy file can be executed (mostly) in-place. With this approach there is still a small amount of unpacking and linking of the .mpy file that needs to be done when it's imported, but it's still much better than loading an .mpy from disk into RAM (although not as good as freezing .mpy files into the firmware). The main trick to make static .mpy files is to adjust the bytecode so any qstrs that it references now go through a lookup table to convert from local qstr number in the module to global qstr number in the firmware. That means the bytecode does not need linking/rewriting of qstrs when it's loaded. Instead only a small qstr table needs to be built (and put in RAM) at import time. This means the bytecode itself is static/constant and can be used directly if it's in addressable memory. Also the qstr string data in the .mpy file, and some constant object data, can be used directly. Note that the qstr table is global to the module (ie not per function). In more detail, in the VM what used to be (schematically): qst = DECODE_QSTR_VALUE; is now (schematically): idx = DECODE_QSTR_INDEX; qst = qstr_table[idx]; That allows the bytecode to be fixed at compile time and not need relinking/rewriting of the qstr values. Only qstr_table needs to be linked when the .mpy is loaded. Incidentally, this helps to reduce the size of bytecode because what used to be 2-byte qstr values in the bytecode are now (mostly) 1-byte indices. If the module uses the same qstr more than two times then the bytecode is smaller than before. The following changes are measured for this commit compared to the previous (the baseline): - average 7%-9% reduction in size of .mpy files - frozen code size is reduced by about 5%-7% - importing .py files uses about 5% less RAM in total - importing .mpy files uses about 4% less RAM in total - importing .py and .mpy files takes about the same time as before The qstr indirection in the bytecode has only a small impact on VM performance. For stm32 on PYBv1.0 the performance change of this commit is: diff of scores (higher is better) N=100 M=100 baseline -> this-commit diff diff% (error%) bm_chaos.py 371.07 -> 357.39 : -13.68 = -3.687% (+/-0.02%) bm_fannkuch.py 78.72 -> 77.49 : -1.23 = -1.563% (+/-0.01%) bm_fft.py 2591.73 -> 2539.28 : -52.45 = -2.024% (+/-0.00%) bm_float.py 6034.93 -> 5908.30 : -126.63 = -2.098% (+/-0.01%) bm_hexiom.py 48.96 -> 47.93 : -1.03 = -2.104% (+/-0.00%) bm_nqueens.py 4510.63 -> 4459.94 : -50.69 = -1.124% (+/-0.00%) bm_pidigits.py 650.28 -> 644.96 : -5.32 = -0.818% (+/-0.23%) core_import_mpy_multi.py 564.77 -> 581.49 : +16.72 = +2.960% (+/-0.01%) core_import_mpy_single.py 68.67 -> 67.16 : -1.51 = -2.199% (+/-0.01%) core_qstr.py 64.16 -> 64.12 : -0.04 = -0.062% (+/-0.00%) core_yield_from.py 362.58 -> 354.50 : -8.08 = -2.228% (+/-0.00%) misc_aes.py 429.69 -> 405.59 : -24.10 = -5.609% (+/-0.01%) misc_mandel.py 3485.13 -> 3416.51 : -68.62 = -1.969% (+/-0.00%) misc_pystone.py 2496.53 -> 2405.56 : -90.97 = -3.644% (+/-0.01%) misc_raytrace.py 381.47 -> 374.01 : -7.46 = -1.956% (+/-0.01%) viper_call0.py 576.73 -> 572.49 : -4.24 = -0.735% (+/-0.04%) viper_call1a.py 550.37 -> 546.21 : -4.16 = -0.756% (+/-0.09%) viper_call1b.py 438.23 -> 435.68 : -2.55 = -0.582% (+/-0.06%) viper_call1c.py 442.84 -> 440.04 : -2.80 = -0.632% (+/-0.08%) viper_call2a.py 536.31 -> 532.35 : -3.96 = -0.738% (+/-0.06%) viper_call2b.py 382.34 -> 377.07 : -5.27 = -1.378% (+/-0.03%) And for unix on x64: diff of scores (higher is better) N=2000 M=2000 baseline -> this-commit diff diff% (error%) bm_chaos.py 13594.20 -> 13073.84 : -520.36 = -3.828% (+/-5.44%) bm_fannkuch.py 60.63 -> 59.58 : -1.05 = -1.732% (+/-3.01%) bm_fft.py 112009.15 -> 111603.32 : -405.83 = -0.362% (+/-4.03%) bm_float.py 246202.55 -> 247923.81 : +1721.26 = +0.699% (+/-2.79%) bm_hexiom.py 615.65 -> 617.21 : +1.56 = +0.253% (+/-1.64%) bm_nqueens.py 215807.95 -> 215600.96 : -206.99 = -0.096% (+/-3.52%) bm_pidigits.py 8246.74 -> 8422.82 : +176.08 = +2.135% (+/-3.64%) misc_aes.py 16133.00 -> 16452.74 : +319.74 = +1.982% (+/-1.50%) misc_mandel.py 128146.69 -> 130796.43 : +2649.74 = +2.068% (+/-3.18%) misc_pystone.py 83811.49 -> 83124.85 : -686.64 = -0.819% (+/-1.03%) misc_raytrace.py 21688.02 -> 21385.10 : -302.92 = -1.397% (+/-3.20%) The code size change is (firmware with a lot of frozen code benefits the most): bare-arm: +396 +0.697% minimal x86: +1595 +0.979% [incl +32(data)] unix x64: +2408 +0.470% [incl +800(data)] unix nanbox: +1396 +0.309% [incl -96(data)] stm32: -1256 -0.318% PYBV10 cc3200: +288 +0.157% esp8266: -260 -0.037% GENERIC esp32: -216 -0.014% GENERIC[incl -1072(data)] nrf: +116 +0.067% pca10040 rp2: -664 -0.135% PICO samd: +844 +0.607% ADAFRUIT_ITSYBITSY_M4_EXPRESS As part of this change the .mpy file format version is bumped to version 6. And mpy-tool.py has been improved to provide a good visualisation of the contents of .mpy files. In summary: this commit changes the bytecode to use qstr indirection, and reworks the .mpy file format to be simpler and allow .mpy files to be executed in-place. Performance is not impacted too much. Eventually it will be possible to store such .mpy files in a linear, read-only, memory- mappable filesystem so they can be executed from flash/ROM. This will essentially be able to replace frozen code for most applications. Signed-off-by: Damien George <damien@micropython.org>
2021-10-22 07:22:47 -04:00
emit_t *(*emit_new)(mp_emit_common_t * emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
void (*emit_free)(emit_t *emit);
#endif
void (*start_pass)(emit_t *emit, pass_kind_t pass, scope_t *scope);
void (*end_pass)(emit_t *emit);
bool (*last_emit_was_return_value)(emit_t *emit);
void (*adjust_stack_size)(emit_t *emit, mp_int_t delta);
void (*set_source_line)(emit_t *emit, mp_uint_t line);
mp_emit_method_table_id_ops_t load_id;
mp_emit_method_table_id_ops_t store_id;
mp_emit_method_table_id_ops_t delete_id;
void (*label_assign)(emit_t *emit, mp_uint_t l);
void (*import)(emit_t *emit, qstr qst, int kind);
void (*load_const_tok)(emit_t *emit, mp_token_kind_t tok);
void (*load_const_small_int)(emit_t *emit, mp_int_t arg);
void (*load_const_str)(emit_t *emit, qstr qst);
void (*load_const_obj)(emit_t *emit, mp_obj_t obj);
void (*load_null)(emit_t *emit);
void (*load_method)(emit_t *emit, qstr qst, bool is_super);
void (*load_build_class)(emit_t *emit);
void (*subscr)(emit_t *emit, int kind);
void (*attr)(emit_t *emit, qstr qst, int kind);
void (*dup_top)(emit_t *emit);
void (*dup_top_two)(emit_t *emit);
void (*pop_top)(emit_t *emit);
void (*rot_two)(emit_t *emit);
void (*rot_three)(emit_t *emit);
void (*jump)(emit_t *emit, mp_uint_t label);
void (*pop_jump_if)(emit_t *emit, bool cond, mp_uint_t label);
void (*jump_if_or_pop)(emit_t *emit, bool cond, mp_uint_t label);
void (*unwind_jump)(emit_t *emit, mp_uint_t label, mp_uint_t except_depth);
void (*setup_block)(emit_t *emit, mp_uint_t label, int kind);
void (*with_cleanup)(emit_t *emit, mp_uint_t label);
void (*end_finally)(emit_t *emit);
void (*get_iter)(emit_t *emit, bool use_stack);
void (*for_iter)(emit_t *emit, mp_uint_t label);
void (*for_iter_end)(emit_t *emit);
void (*pop_except_jump)(emit_t *emit, mp_uint_t label, bool within_exc_handler);
void (*unary_op)(emit_t *emit, mp_unary_op_t op);
void (*binary_op)(emit_t *emit, mp_binary_op_t op);
void (*build)(emit_t *emit, mp_uint_t n_args, int kind);
void (*store_map)(emit_t *emit);
void (*store_comp)(emit_t *emit, scope_kind_t kind, mp_uint_t set_stack_index);
void (*unpack_sequence)(emit_t *emit, mp_uint_t n_args);
void (*unpack_ex)(emit_t *emit, mp_uint_t n_left, mp_uint_t n_right);
void (*make_function)(emit_t *emit, scope_t *scope, mp_uint_t n_pos_defaults, mp_uint_t n_kw_defaults);
void (*make_closure)(emit_t *emit, scope_t *scope, mp_uint_t n_closed_over, mp_uint_t n_pos_defaults, mp_uint_t n_kw_defaults);
void (*call_function)(emit_t *emit, mp_uint_t n_positional, mp_uint_t n_keyword, mp_uint_t star_flags);
void (*call_method)(emit_t *emit, mp_uint_t n_positional, mp_uint_t n_keyword, mp_uint_t star_flags);
void (*return_value)(emit_t *emit);
void (*raise_varargs)(emit_t *emit, mp_uint_t n_args);
void (*yield)(emit_t *emit, int kind);
// these methods are used to control entry to/exit from an exception handler
// they may or may not emit code
void (*start_except_handler)(emit_t *emit);
void (*end_except_handler)(emit_t *emit);
} emit_method_table_t;
py: Rework bytecode and .mpy file format to be mostly static data. Background: .mpy files are precompiled .py files, built using mpy-cross, that contain compiled bytecode functions (and can also contain machine code). The benefit of using an .mpy file over a .py file is that they are faster to import and take less memory when importing. They are also smaller on disk. But the real benefit of .mpy files comes when they are frozen into the firmware. This is done by loading the .mpy file during compilation of the firmware and turning it into a set of big C data structures (the job of mpy-tool.py), which are then compiled and downloaded into the ROM of a device. These C data structures can be executed in-place, ie directly from ROM. This makes importing even faster because there is very little to do, and also means such frozen modules take up much less RAM (because their bytecode stays in ROM). The downside of frozen code is that it requires recompiling and reflashing the entire firmware. This can be a big barrier to entry, slows down development time, and makes it harder to do OTA updates of frozen code (because the whole firmware must be updated). This commit attempts to solve this problem by providing a solution that sits between loading .mpy files into RAM and freezing them into the firmware. The .mpy file format has been reworked so that it consists of data and bytecode which is mostly static and ready to run in-place. If these new .mpy files are located in flash/ROM which is memory addressable, the .mpy file can be executed (mostly) in-place. With this approach there is still a small amount of unpacking and linking of the .mpy file that needs to be done when it's imported, but it's still much better than loading an .mpy from disk into RAM (although not as good as freezing .mpy files into the firmware). The main trick to make static .mpy files is to adjust the bytecode so any qstrs that it references now go through a lookup table to convert from local qstr number in the module to global qstr number in the firmware. That means the bytecode does not need linking/rewriting of qstrs when it's loaded. Instead only a small qstr table needs to be built (and put in RAM) at import time. This means the bytecode itself is static/constant and can be used directly if it's in addressable memory. Also the qstr string data in the .mpy file, and some constant object data, can be used directly. Note that the qstr table is global to the module (ie not per function). In more detail, in the VM what used to be (schematically): qst = DECODE_QSTR_VALUE; is now (schematically): idx = DECODE_QSTR_INDEX; qst = qstr_table[idx]; That allows the bytecode to be fixed at compile time and not need relinking/rewriting of the qstr values. Only qstr_table needs to be linked when the .mpy is loaded. Incidentally, this helps to reduce the size of bytecode because what used to be 2-byte qstr values in the bytecode are now (mostly) 1-byte indices. If the module uses the same qstr more than two times then the bytecode is smaller than before. The following changes are measured for this commit compared to the previous (the baseline): - average 7%-9% reduction in size of .mpy files - frozen code size is reduced by about 5%-7% - importing .py files uses about 5% less RAM in total - importing .mpy files uses about 4% less RAM in total - importing .py and .mpy files takes about the same time as before The qstr indirection in the bytecode has only a small impact on VM performance. For stm32 on PYBv1.0 the performance change of this commit is: diff of scores (higher is better) N=100 M=100 baseline -> this-commit diff diff% (error%) bm_chaos.py 371.07 -> 357.39 : -13.68 = -3.687% (+/-0.02%) bm_fannkuch.py 78.72 -> 77.49 : -1.23 = -1.563% (+/-0.01%) bm_fft.py 2591.73 -> 2539.28 : -52.45 = -2.024% (+/-0.00%) bm_float.py 6034.93 -> 5908.30 : -126.63 = -2.098% (+/-0.01%) bm_hexiom.py 48.96 -> 47.93 : -1.03 = -2.104% (+/-0.00%) bm_nqueens.py 4510.63 -> 4459.94 : -50.69 = -1.124% (+/-0.00%) bm_pidigits.py 650.28 -> 644.96 : -5.32 = -0.818% (+/-0.23%) core_import_mpy_multi.py 564.77 -> 581.49 : +16.72 = +2.960% (+/-0.01%) core_import_mpy_single.py 68.67 -> 67.16 : -1.51 = -2.199% (+/-0.01%) core_qstr.py 64.16 -> 64.12 : -0.04 = -0.062% (+/-0.00%) core_yield_from.py 362.58 -> 354.50 : -8.08 = -2.228% (+/-0.00%) misc_aes.py 429.69 -> 405.59 : -24.10 = -5.609% (+/-0.01%) misc_mandel.py 3485.13 -> 3416.51 : -68.62 = -1.969% (+/-0.00%) misc_pystone.py 2496.53 -> 2405.56 : -90.97 = -3.644% (+/-0.01%) misc_raytrace.py 381.47 -> 374.01 : -7.46 = -1.956% (+/-0.01%) viper_call0.py 576.73 -> 572.49 : -4.24 = -0.735% (+/-0.04%) viper_call1a.py 550.37 -> 546.21 : -4.16 = -0.756% (+/-0.09%) viper_call1b.py 438.23 -> 435.68 : -2.55 = -0.582% (+/-0.06%) viper_call1c.py 442.84 -> 440.04 : -2.80 = -0.632% (+/-0.08%) viper_call2a.py 536.31 -> 532.35 : -3.96 = -0.738% (+/-0.06%) viper_call2b.py 382.34 -> 377.07 : -5.27 = -1.378% (+/-0.03%) And for unix on x64: diff of scores (higher is better) N=2000 M=2000 baseline -> this-commit diff diff% (error%) bm_chaos.py 13594.20 -> 13073.84 : -520.36 = -3.828% (+/-5.44%) bm_fannkuch.py 60.63 -> 59.58 : -1.05 = -1.732% (+/-3.01%) bm_fft.py 112009.15 -> 111603.32 : -405.83 = -0.362% (+/-4.03%) bm_float.py 246202.55 -> 247923.81 : +1721.26 = +0.699% (+/-2.79%) bm_hexiom.py 615.65 -> 617.21 : +1.56 = +0.253% (+/-1.64%) bm_nqueens.py 215807.95 -> 215600.96 : -206.99 = -0.096% (+/-3.52%) bm_pidigits.py 8246.74 -> 8422.82 : +176.08 = +2.135% (+/-3.64%) misc_aes.py 16133.00 -> 16452.74 : +319.74 = +1.982% (+/-1.50%) misc_mandel.py 128146.69 -> 130796.43 : +2649.74 = +2.068% (+/-3.18%) misc_pystone.py 83811.49 -> 83124.85 : -686.64 = -0.819% (+/-1.03%) misc_raytrace.py 21688.02 -> 21385.10 : -302.92 = -1.397% (+/-3.20%) The code size change is (firmware with a lot of frozen code benefits the most): bare-arm: +396 +0.697% minimal x86: +1595 +0.979% [incl +32(data)] unix x64: +2408 +0.470% [incl +800(data)] unix nanbox: +1396 +0.309% [incl -96(data)] stm32: -1256 -0.318% PYBV10 cc3200: +288 +0.157% esp8266: -260 -0.037% GENERIC esp32: -216 -0.014% GENERIC[incl -1072(data)] nrf: +116 +0.067% pca10040 rp2: -664 -0.135% PICO samd: +844 +0.607% ADAFRUIT_ITSYBITSY_M4_EXPRESS As part of this change the .mpy file format version is bumped to version 6. And mpy-tool.py has been improved to provide a good visualisation of the contents of .mpy files. In summary: this commit changes the bytecode to use qstr indirection, and reworks the .mpy file format to be simpler and allow .mpy files to be executed in-place. Performance is not impacted too much. Eventually it will be possible to store such .mpy files in a linear, read-only, memory- mappable filesystem so they can be executed from flash/ROM. This will essentially be able to replace frozen code for most applications. Signed-off-by: Damien George <damien@micropython.org>
2021-10-22 07:22:47 -04:00
#if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
qstr_short_t mp_emit_common_use_qstr(mp_emit_common_t *emit, qstr qst);
#else
static inline qstr_short_t mp_emit_common_use_qstr(mp_emit_common_t *emit, qstr qst) {
return qst;
}
#endif
static inline size_t mp_emit_common_alloc_const_obj(mp_emit_common_t *emit, mp_obj_t obj) {
if (emit->pass == MP_PASS_EMIT) {
emit->const_table[emit->ct_cur_obj] = (mp_uint_t)obj;
}
return emit->ct_cur_obj++;
}
static inline size_t mp_emit_common_alloc_const_child(mp_emit_common_t *emit, mp_raw_code_t *rc) {
if (emit->pass == MP_PASS_EMIT) {
emit->children[emit->ct_cur_child] = rc;
}
return emit->ct_cur_child++;
}
static inline void mp_emit_common_get_id_for_load(scope_t *scope, qstr qst) {
scope_find_or_add_id(scope, qst, ID_INFO_KIND_GLOBAL_IMPLICIT);
}
void mp_emit_common_get_id_for_modification(scope_t *scope, qstr qst);
void mp_emit_common_id_op(emit_t *emit, const mp_emit_method_table_id_ops_t *emit_method_table, scope_t *scope, qstr qst);
extern const emit_method_table_t emit_bc_method_table;
extern const emit_method_table_t emit_native_x64_method_table;
extern const emit_method_table_t emit_native_x86_method_table;
extern const emit_method_table_t emit_native_thumb_method_table;
2014-08-16 16:55:53 -04:00
extern const emit_method_table_t emit_native_arm_method_table;
extern const emit_method_table_t emit_native_xtensa_method_table;
extern const emit_method_table_t emit_native_xtensawin_method_table;
extern const mp_emit_method_table_id_ops_t mp_emit_bc_method_table_load_id_ops;
extern const mp_emit_method_table_id_ops_t mp_emit_bc_method_table_store_id_ops;
extern const mp_emit_method_table_id_ops_t mp_emit_bc_method_table_delete_id_ops;
py: Rework bytecode and .mpy file format to be mostly static data. Background: .mpy files are precompiled .py files, built using mpy-cross, that contain compiled bytecode functions (and can also contain machine code). The benefit of using an .mpy file over a .py file is that they are faster to import and take less memory when importing. They are also smaller on disk. But the real benefit of .mpy files comes when they are frozen into the firmware. This is done by loading the .mpy file during compilation of the firmware and turning it into a set of big C data structures (the job of mpy-tool.py), which are then compiled and downloaded into the ROM of a device. These C data structures can be executed in-place, ie directly from ROM. This makes importing even faster because there is very little to do, and also means such frozen modules take up much less RAM (because their bytecode stays in ROM). The downside of frozen code is that it requires recompiling and reflashing the entire firmware. This can be a big barrier to entry, slows down development time, and makes it harder to do OTA updates of frozen code (because the whole firmware must be updated). This commit attempts to solve this problem by providing a solution that sits between loading .mpy files into RAM and freezing them into the firmware. The .mpy file format has been reworked so that it consists of data and bytecode which is mostly static and ready to run in-place. If these new .mpy files are located in flash/ROM which is memory addressable, the .mpy file can be executed (mostly) in-place. With this approach there is still a small amount of unpacking and linking of the .mpy file that needs to be done when it's imported, but it's still much better than loading an .mpy from disk into RAM (although not as good as freezing .mpy files into the firmware). The main trick to make static .mpy files is to adjust the bytecode so any qstrs that it references now go through a lookup table to convert from local qstr number in the module to global qstr number in the firmware. That means the bytecode does not need linking/rewriting of qstrs when it's loaded. Instead only a small qstr table needs to be built (and put in RAM) at import time. This means the bytecode itself is static/constant and can be used directly if it's in addressable memory. Also the qstr string data in the .mpy file, and some constant object data, can be used directly. Note that the qstr table is global to the module (ie not per function). In more detail, in the VM what used to be (schematically): qst = DECODE_QSTR_VALUE; is now (schematically): idx = DECODE_QSTR_INDEX; qst = qstr_table[idx]; That allows the bytecode to be fixed at compile time and not need relinking/rewriting of the qstr values. Only qstr_table needs to be linked when the .mpy is loaded. Incidentally, this helps to reduce the size of bytecode because what used to be 2-byte qstr values in the bytecode are now (mostly) 1-byte indices. If the module uses the same qstr more than two times then the bytecode is smaller than before. The following changes are measured for this commit compared to the previous (the baseline): - average 7%-9% reduction in size of .mpy files - frozen code size is reduced by about 5%-7% - importing .py files uses about 5% less RAM in total - importing .mpy files uses about 4% less RAM in total - importing .py and .mpy files takes about the same time as before The qstr indirection in the bytecode has only a small impact on VM performance. For stm32 on PYBv1.0 the performance change of this commit is: diff of scores (higher is better) N=100 M=100 baseline -> this-commit diff diff% (error%) bm_chaos.py 371.07 -> 357.39 : -13.68 = -3.687% (+/-0.02%) bm_fannkuch.py 78.72 -> 77.49 : -1.23 = -1.563% (+/-0.01%) bm_fft.py 2591.73 -> 2539.28 : -52.45 = -2.024% (+/-0.00%) bm_float.py 6034.93 -> 5908.30 : -126.63 = -2.098% (+/-0.01%) bm_hexiom.py 48.96 -> 47.93 : -1.03 = -2.104% (+/-0.00%) bm_nqueens.py 4510.63 -> 4459.94 : -50.69 = -1.124% (+/-0.00%) bm_pidigits.py 650.28 -> 644.96 : -5.32 = -0.818% (+/-0.23%) core_import_mpy_multi.py 564.77 -> 581.49 : +16.72 = +2.960% (+/-0.01%) core_import_mpy_single.py 68.67 -> 67.16 : -1.51 = -2.199% (+/-0.01%) core_qstr.py 64.16 -> 64.12 : -0.04 = -0.062% (+/-0.00%) core_yield_from.py 362.58 -> 354.50 : -8.08 = -2.228% (+/-0.00%) misc_aes.py 429.69 -> 405.59 : -24.10 = -5.609% (+/-0.01%) misc_mandel.py 3485.13 -> 3416.51 : -68.62 = -1.969% (+/-0.00%) misc_pystone.py 2496.53 -> 2405.56 : -90.97 = -3.644% (+/-0.01%) misc_raytrace.py 381.47 -> 374.01 : -7.46 = -1.956% (+/-0.01%) viper_call0.py 576.73 -> 572.49 : -4.24 = -0.735% (+/-0.04%) viper_call1a.py 550.37 -> 546.21 : -4.16 = -0.756% (+/-0.09%) viper_call1b.py 438.23 -> 435.68 : -2.55 = -0.582% (+/-0.06%) viper_call1c.py 442.84 -> 440.04 : -2.80 = -0.632% (+/-0.08%) viper_call2a.py 536.31 -> 532.35 : -3.96 = -0.738% (+/-0.06%) viper_call2b.py 382.34 -> 377.07 : -5.27 = -1.378% (+/-0.03%) And for unix on x64: diff of scores (higher is better) N=2000 M=2000 baseline -> this-commit diff diff% (error%) bm_chaos.py 13594.20 -> 13073.84 : -520.36 = -3.828% (+/-5.44%) bm_fannkuch.py 60.63 -> 59.58 : -1.05 = -1.732% (+/-3.01%) bm_fft.py 112009.15 -> 111603.32 : -405.83 = -0.362% (+/-4.03%) bm_float.py 246202.55 -> 247923.81 : +1721.26 = +0.699% (+/-2.79%) bm_hexiom.py 615.65 -> 617.21 : +1.56 = +0.253% (+/-1.64%) bm_nqueens.py 215807.95 -> 215600.96 : -206.99 = -0.096% (+/-3.52%) bm_pidigits.py 8246.74 -> 8422.82 : +176.08 = +2.135% (+/-3.64%) misc_aes.py 16133.00 -> 16452.74 : +319.74 = +1.982% (+/-1.50%) misc_mandel.py 128146.69 -> 130796.43 : +2649.74 = +2.068% (+/-3.18%) misc_pystone.py 83811.49 -> 83124.85 : -686.64 = -0.819% (+/-1.03%) misc_raytrace.py 21688.02 -> 21385.10 : -302.92 = -1.397% (+/-3.20%) The code size change is (firmware with a lot of frozen code benefits the most): bare-arm: +396 +0.697% minimal x86: +1595 +0.979% [incl +32(data)] unix x64: +2408 +0.470% [incl +800(data)] unix nanbox: +1396 +0.309% [incl -96(data)] stm32: -1256 -0.318% PYBV10 cc3200: +288 +0.157% esp8266: -260 -0.037% GENERIC esp32: -216 -0.014% GENERIC[incl -1072(data)] nrf: +116 +0.067% pca10040 rp2: -664 -0.135% PICO samd: +844 +0.607% ADAFRUIT_ITSYBITSY_M4_EXPRESS As part of this change the .mpy file format version is bumped to version 6. And mpy-tool.py has been improved to provide a good visualisation of the contents of .mpy files. In summary: this commit changes the bytecode to use qstr indirection, and reworks the .mpy file format to be simpler and allow .mpy files to be executed in-place. Performance is not impacted too much. Eventually it will be possible to store such .mpy files in a linear, read-only, memory- mappable filesystem so they can be executed from flash/ROM. This will essentially be able to replace frozen code for most applications. Signed-off-by: Damien George <damien@micropython.org>
2021-10-22 07:22:47 -04:00
emit_t *emit_bc_new(mp_emit_common_t *emit_common);
emit_t *emit_native_x64_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
emit_t *emit_native_x86_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
emit_t *emit_native_thumb_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
emit_t *emit_native_arm_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
emit_t *emit_native_xtensa_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
emit_t *emit_native_xtensawin_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
2013-10-05 18:17:28 -04:00
void emit_bc_set_max_num_labels(emit_t *emit, mp_uint_t max_num_labels);
void emit_bc_free(emit_t *emit);
void emit_native_x64_free(emit_t *emit);
void emit_native_x86_free(emit_t *emit);
void emit_native_thumb_free(emit_t *emit);
2014-08-16 16:55:53 -04:00
void emit_native_arm_free(emit_t *emit);
void emit_native_xtensa_free(emit_t *emit);
void emit_native_xtensawin_free(emit_t *emit);
void mp_emit_bc_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope);
void mp_emit_bc_end_pass(emit_t *emit);
bool mp_emit_bc_last_emit_was_return_value(emit_t *emit);
void mp_emit_bc_adjust_stack_size(emit_t *emit, mp_int_t delta);
void mp_emit_bc_set_source_line(emit_t *emit, mp_uint_t line);
void mp_emit_bc_load_local(emit_t *emit, qstr qst, mp_uint_t local_num, int kind);
void mp_emit_bc_load_global(emit_t *emit, qstr qst, int kind);
void mp_emit_bc_store_local(emit_t *emit, qstr qst, mp_uint_t local_num, int kind);
void mp_emit_bc_store_global(emit_t *emit, qstr qst, int kind);
void mp_emit_bc_delete_local(emit_t *emit, qstr qst, mp_uint_t local_num, int kind);
void mp_emit_bc_delete_global(emit_t *emit, qstr qst, int kind);
void mp_emit_bc_label_assign(emit_t *emit, mp_uint_t l);
void mp_emit_bc_import(emit_t *emit, qstr qst, int kind);
void mp_emit_bc_load_const_tok(emit_t *emit, mp_token_kind_t tok);
void mp_emit_bc_load_const_small_int(emit_t *emit, mp_int_t arg);
void mp_emit_bc_load_const_str(emit_t *emit, qstr qst);
void mp_emit_bc_load_const_obj(emit_t *emit, mp_obj_t obj);
void mp_emit_bc_load_null(emit_t *emit);
void mp_emit_bc_load_method(emit_t *emit, qstr qst, bool is_super);
void mp_emit_bc_load_build_class(emit_t *emit);
void mp_emit_bc_subscr(emit_t *emit, int kind);
void mp_emit_bc_attr(emit_t *emit, qstr qst, int kind);
void mp_emit_bc_dup_top(emit_t *emit);
void mp_emit_bc_dup_top_two(emit_t *emit);
void mp_emit_bc_pop_top(emit_t *emit);
void mp_emit_bc_rot_two(emit_t *emit);
void mp_emit_bc_rot_three(emit_t *emit);
void mp_emit_bc_jump(emit_t *emit, mp_uint_t label);
void mp_emit_bc_pop_jump_if(emit_t *emit, bool cond, mp_uint_t label);
void mp_emit_bc_jump_if_or_pop(emit_t *emit, bool cond, mp_uint_t label);
void mp_emit_bc_unwind_jump(emit_t *emit, mp_uint_t label, mp_uint_t except_depth);
void mp_emit_bc_setup_block(emit_t *emit, mp_uint_t label, int kind);
void mp_emit_bc_with_cleanup(emit_t *emit, mp_uint_t label);
void mp_emit_bc_end_finally(emit_t *emit);
void mp_emit_bc_get_iter(emit_t *emit, bool use_stack);
void mp_emit_bc_for_iter(emit_t *emit, mp_uint_t label);
void mp_emit_bc_for_iter_end(emit_t *emit);
void mp_emit_bc_pop_except_jump(emit_t *emit, mp_uint_t label, bool within_exc_handler);
void mp_emit_bc_unary_op(emit_t *emit, mp_unary_op_t op);
void mp_emit_bc_binary_op(emit_t *emit, mp_binary_op_t op);
void mp_emit_bc_build(emit_t *emit, mp_uint_t n_args, int kind);
void mp_emit_bc_store_map(emit_t *emit);
void mp_emit_bc_store_comp(emit_t *emit, scope_kind_t kind, mp_uint_t list_stack_index);
void mp_emit_bc_unpack_sequence(emit_t *emit, mp_uint_t n_args);
void mp_emit_bc_unpack_ex(emit_t *emit, mp_uint_t n_left, mp_uint_t n_right);
void mp_emit_bc_make_function(emit_t *emit, scope_t *scope, mp_uint_t n_pos_defaults, mp_uint_t n_kw_defaults);
void mp_emit_bc_make_closure(emit_t *emit, scope_t *scope, mp_uint_t n_closed_over, mp_uint_t n_pos_defaults, mp_uint_t n_kw_defaults);
void mp_emit_bc_call_function(emit_t *emit, mp_uint_t n_positional, mp_uint_t n_keyword, mp_uint_t star_flags);
void mp_emit_bc_call_method(emit_t *emit, mp_uint_t n_positional, mp_uint_t n_keyword, mp_uint_t star_flags);
void mp_emit_bc_return_value(emit_t *emit);
void mp_emit_bc_raise_varargs(emit_t *emit, mp_uint_t n_args);
void mp_emit_bc_yield(emit_t *emit, int kind);
void mp_emit_bc_start_except_handler(emit_t *emit);
void mp_emit_bc_end_except_handler(emit_t *emit);
2013-10-05 18:17:28 -04:00
typedef struct _emit_inline_asm_t emit_inline_asm_t;
typedef struct _emit_inline_asm_method_table_t {
#if MICROPY_DYNAMIC_COMPILER
emit_inline_asm_t *(*asm_new)(mp_uint_t max_num_labels);
void (*asm_free)(emit_inline_asm_t *emit);
#endif
void (*start_pass)(emit_inline_asm_t *emit, pass_kind_t pass, mp_obj_t *error_slot);
void (*end_pass)(emit_inline_asm_t *emit, mp_uint_t type_sig);
mp_uint_t (*count_params)(emit_inline_asm_t *emit, mp_uint_t n_params, mp_parse_node_t *pn_params);
bool (*label)(emit_inline_asm_t *emit, mp_uint_t label_num, qstr label_id);
void (*op)(emit_inline_asm_t *emit, qstr op, mp_uint_t n_args, mp_parse_node_t *pn_args);
2013-10-05 18:17:28 -04:00
} emit_inline_asm_method_table_t;
extern const emit_inline_asm_method_table_t emit_inline_thumb_method_table;
extern const emit_inline_asm_method_table_t emit_inline_xtensa_method_table;
2013-10-05 18:17:28 -04:00
emit_inline_asm_t *emit_inline_thumb_new(mp_uint_t max_num_labels);
emit_inline_asm_t *emit_inline_xtensa_new(mp_uint_t max_num_labels);
void emit_inline_thumb_free(emit_inline_asm_t *emit);
void emit_inline_xtensa_free(emit_inline_asm_t *emit);
#if MICROPY_WARNINGS
void mp_emitter_warning(pass_kind_t pass, const char *msg);
#else
#define mp_emitter_warning(pass, msg)
#endif
#endif // MICROPY_INCLUDED_PY_EMIT_H