py/emitnative: Put a pointer to the native prelude in child_table array.

Some architectures (like esp32 xtensa) cannot read byte-wise from
executable memory.  This means the prelude for native functions -- which is
usually located after the machine code for the native function -- must be
placed in separate memory that can be read byte-wise.  Prior to this commit
this was achieved by enabling N_PRELUDE_AS_BYTES_OBJ for the emitter and
MICROPY_EMIT_NATIVE_PRELUDE_AS_BYTES_OBJ for the runtime.  The prelude was
then placed in a bytes object, pointed to by the module's constant table.
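
For reference, the lookup under the old scheme, as used by the generator
wrapper (see the code removed from native_gen_wrap_call further down), was
roughly the following; the helper function here is illustrative only and is
not part of any commit:

    #include "py/obj.h"
    #include "py/objfun.h"
    #include "py/objstr.h"

    // Old scheme (prelude-as-bytes-object configuration): the first machine
    // word of the function's code holds an index into the module's constant
    // table, where the prelude is stored as a bytes object.
    static const uint8_t *prelude_via_const_table(const mp_obj_fun_bc_t *fun) {
        uintptr_t index = ((const uintptr_t *)fun->bytecode)[0];
        mp_obj_str_t *prelude_bytes = MP_OBJ_TO_PTR(fun->context->constants.obj_table[index]);
        return prelude_bytes->data;
    }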

This commit changes that behaviour so that a pointer to the prelude is
stored either directly in mp_obj_fun_bc_t.child_table (when the function
has no children), or in mp_obj_fun_bc_t.child_table[num_children] (when
num_children > 0); a sketch of the new lookup follows the list below.  The
reasons for doing this are:

1. It decouples the native emitter from runtime requirements: the emitted
   code no longer needs to know whether the system it runs on can read
   byte-wise from executable memory.

2. It makes all ports share the same emitter behaviour: there is no longer
   an N_PRELUDE_AS_BYTES_OBJ option.

3. The module's constant table is now used only for actual constants in the
   Python code.  This allows further optimisations to be done with the
   constants (eg constant deduplication).
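
As a rough sketch, the new lookup (mirroring the changes to
native_gen_wrap_call and to the .mpy loader below; the helper function
itself is illustrative and not part of the commit) becomes:

    #include "py/obj.h"
    #include "py/objfun.h"

    // New scheme: the prelude pointer lives in the function's child_table.
    // With no children, child_table itself is the (type-punned) prelude
    // pointer; otherwise the pointer is appended after the real children.
    static const uint8_t *prelude_via_child_table(const mp_obj_fun_bc_t *fun, size_t num_children) {
        if (num_children == 0) {
            return (const uint8_t *)fun->child_table;
        } else {
            return (const uint8_t *)fun->child_table[num_children];
        }
    }

This is the same prelude_ptr_index == 0 test that appears in the runtime
and loader changes below.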

Code size change for those ports that enable the native emitter:
   unix x64:   +80 +0.015%
      stm32:   +24 +0.004% PYBV10
    esp8266:   +88 +0.013% GENERIC
      esp32:   -20 -0.002% GENERIC[incl -112(data)]
        rp2:   +32 +0.005% PICO

Signed-off-by: Damien George <damien@micropython.org>
Author: Damien George <damien@micropython.org>
Date:   2022-05-10 13:56:24 +10:00
Commit: 1fb01bd6c5 (parent 8725a32f41)
7 changed files with 93 additions and 83 deletions

diff --git a/py/compile.c b/py/compile.c

@@ -3031,14 +3031,6 @@ STATIC bool compile_scope(compiler_t *comp, scope_t *scope, pass_kind_t pass) {
         // they will be computed in this first pass
         scope->stack_size = 0;
         scope->exc_stack_size = 0;
-
-        #if MICROPY_EMIT_NATIVE
-        if (scope->emit_options == MP_EMIT_OPT_NATIVE_PYTHON || scope->emit_options == MP_EMIT_OPT_VIPER) {
-            // allow native code to perfom basic tasks during the pass scope
-            // note: the first argument passed here is mp_emit_common_t, not the native emitter context
-            NATIVE_EMITTER_TABLE->start_pass((void *)&comp->emit_common, comp->pass, scope);
-        }
-        #endif
     }

     // compile

diff --git a/py/emitnative.c b/py/emitnative.c

@@ -231,9 +231,7 @@ struct _emit_t {
     exc_stack_entry_t *exc_stack;

     int prelude_offset;
-    #if N_PRELUDE_AS_BYTES_OBJ
-    size_t prelude_const_table_offset;
-    #endif
+    int prelude_ptr_index;
     int start_offset;
     int n_state;
     uint16_t code_state_start;
@@ -349,16 +347,6 @@ STATIC void emit_native_mov_reg_qstr_obj(emit_t *emit, int reg_dest, qstr qst) {
 STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) {
     DEBUG_printf("start_pass(pass=%u, scope=%p)\n", pass, scope);

-    if (pass == MP_PASS_SCOPE) {
-        // Note: the first argument passed here is mp_emit_common_t, not the native emitter context
-        #if N_PRELUDE_AS_BYTES_OBJ
-        if (scope->emit_options == MP_EMIT_OPT_NATIVE_PYTHON) {
-            mp_emit_common_alloc_const_obj((mp_emit_common_t *)emit, mp_const_none);
-        }
-        #endif
-        return;
-    }
-
     emit->pass = pass;
     emit->do_viper_types = scope->emit_options == MP_EMIT_OPT_VIPER;
     emit->stack_size = 0;
@@ -511,12 +499,7 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) {
         if (emit->scope->scope_flags & MP_SCOPE_FLAG_GENERATOR) {
             emit->code_state_start = 0;
             emit->stack_start = SIZEOF_CODE_STATE;
-            #if N_PRELUDE_AS_BYTES_OBJ
-            // Load index of prelude bytes object in const_table
-            mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->prelude_const_table_offset);
-            #else
-            mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->prelude_offset);
-            #endif
+            mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->prelude_ptr_index);
             mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->start_offset);

             ASM_ENTRY(emit->as, SIZEOF_NLR_BUF);
@@ -562,41 +545,19 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) {
             // Set code_state.fun_bc
             ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_FUN_OBJ(emit), REG_PARENT_ARG_1);

-            // Set code_state.ip, a pointer to the beginning of the prelude
+            // Set code_state.ip, a pointer to the beginning of the prelude.  This pointer is found
+            // either directly in mp_obj_fun_bc_t.child_table (if there are no children), or in
+            // mp_obj_fun_bc_t.child_table[num_children] (if num_children > 0).
             // Need to use some locals for this, so assert that they are available for use
             MP_STATIC_ASSERT(REG_LOCAL_3 != REG_PARENT_ARG_1);
             MP_STATIC_ASSERT(REG_LOCAL_3 != REG_PARENT_ARG_2);
             MP_STATIC_ASSERT(REG_LOCAL_3 != REG_PARENT_ARG_3);
             MP_STATIC_ASSERT(REG_LOCAL_3 != REG_PARENT_ARG_4);
-            int code_state_ip_local = emit->code_state_start + OFFSETOF_CODE_STATE_IP;
-            #if N_PRELUDE_AS_BYTES_OBJ
-            // Prelude is a bytes object in const_table[prelude_const_table_offset].
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_CONTEXT);
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_LOCAL_3, OFFSETOF_MODULE_CONTEXT_OBJ_TABLE);
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_LOCAL_3, emit->prelude_const_table_offset);
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_LOCAL_3, offsetof(mp_obj_str_t, data) / sizeof(uintptr_t));
-            #else
-            MP_STATIC_ASSERT(REG_LOCAL_2 != REG_PARENT_ARG_1);
-            MP_STATIC_ASSERT(REG_LOCAL_2 != REG_PARENT_ARG_2);
-            MP_STATIC_ASSERT(REG_LOCAL_2 != REG_PARENT_ARG_3);
-            MP_STATIC_ASSERT(REG_LOCAL_2 != REG_PARENT_ARG_4);
-            // Prelude is at the end of the machine code
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_BYTECODE);
-            if (emit->pass == MP_PASS_CODE_SIZE) {
-                // Commit to the encoding size based on the value of prelude_offset in this pass.
-                // By using 32768 as the cut-off it is highly unlikely that prelude_offset will
-                // grow beyond 65535 by the end of thiss pass, and so require the larger encoding.
-                emit->prelude_offset_uses_u16_encoding = emit->prelude_offset < 32768;
-            }
-            if (emit->prelude_offset_uses_u16_encoding) {
-                assert(emit->prelude_offset <= 65535);
-                ASM_MOV_REG_IMM_FIX_U16((emit)->as, REG_LOCAL_2, emit->prelude_offset);
-            } else {
-                ASM_MOV_REG_IMM_FIX_WORD((emit)->as, REG_LOCAL_2, emit->prelude_offset);
-            }
-            ASM_ADD_REG_REG(emit->as, REG_LOCAL_3, REG_LOCAL_2);
-            #endif
-            emit_native_mov_state_reg(emit, code_state_ip_local, REG_LOCAL_3);
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_CHILD_TABLE);
+            if (emit->prelude_ptr_index != 0) {
+                ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_LOCAL_3, emit->prelude_ptr_index);
+            }
+            emit_native_mov_state_reg(emit, emit->code_state_start + OFFSETOF_CODE_STATE_IP, REG_LOCAL_3);

             // Set code_state.n_state (only works on little endian targets due to n_state being uint16_t)
             emit_native_mov_state_imm_via(emit, emit->code_state_start + OFFSETOF_CODE_STATE_N_STATE, emit->n_state, REG_ARG_1);
@@ -657,6 +618,7 @@ STATIC bool emit_native_end_pass(emit_t *emit) {
     if (!emit->do_viper_types) {
         emit->prelude_offset = mp_asm_base_get_code_pos(&emit->as->base);
+        emit->prelude_ptr_index = emit->emit_common->ct_cur_child;

         size_t n_state = emit->n_state;
         size_t n_exc_stack = 0; // exc-stack not needed for native code
@@ -693,16 +655,6 @@ STATIC bool emit_native_end_pass(emit_t *emit) {
         }
         emit->n_cell = mp_asm_base_get_code_pos(&emit->as->base) - cell_start;

-        #if N_PRELUDE_AS_BYTES_OBJ
-        // Create the prelude as a bytes object, and store it in the constant table
-        mp_obj_t prelude = mp_const_none;
-        if (emit->pass == MP_PASS_EMIT) {
-            void *buf = emit->as->base.code_base + emit->prelude_offset;
-            size_t n = emit->as->base.code_offset - emit->prelude_offset;
-            prelude = mp_obj_new_bytes(buf, n);
-        }
-        emit->prelude_const_table_offset = mp_emit_common_alloc_const_obj(emit->emit_common, prelude);
-        #endif
     }

     ASM_END_PASS(emit->as);
@@ -725,10 +677,33 @@ STATIC bool emit_native_end_pass(emit_t *emit) {
         void *f = mp_asm_base_get_code(&emit->as->base);
         mp_uint_t f_len = mp_asm_base_get_code_size(&emit->as->base);

+        mp_raw_code_t **children = emit->emit_common->children;
+        if (!emit->do_viper_types) {
+            #if MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE
+            // Executable code cannot be accessed byte-wise on this architecture, so copy
+            // the prelude to a separate memory region that is byte-wise readable.
+            void *buf = emit->as->base.code_base + emit->prelude_offset;
+            size_t n = emit->as->base.code_offset - emit->prelude_offset;
+            const uint8_t *prelude_ptr = memcpy(m_new(uint8_t, n), buf, n);
+            #else
+            // Point to the prelude directly, at the end of the machine code data.
+            const uint8_t *prelude_ptr = (const uint8_t *)f + emit->prelude_offset;
+            #endif
+
+            // Store the pointer to the prelude using the child_table.
+            assert(emit->prelude_ptr_index == emit->emit_common->ct_cur_child);
+            if (emit->prelude_ptr_index == 0) {
+                children = (void *)prelude_ptr;
+            } else {
+                children = m_renew(mp_raw_code_t *, children, emit->prelude_ptr_index, emit->prelude_ptr_index + 1);
+                children[emit->prelude_ptr_index] = (void *)prelude_ptr;
+            }
+        }
+
         mp_emit_glue_assign_native(emit->scope->raw_code,
             emit->do_viper_types ? MP_CODE_NATIVE_VIPER : MP_CODE_NATIVE_PY,
             f, f_len,
-            emit->emit_common->children,
+            children,
             #if MICROPY_PERSISTENT_CODE_SAVE
             emit->emit_common->ct_cur_child,
             emit->prelude_offset,

diff --git a/py/emitnxtensawin.c b/py/emitnxtensawin.c

@@ -15,7 +15,6 @@
 #define NLR_BUF_IDX_LOCAL_3 (2 + 6) // a6

 #define N_NLR_SETJMP (1)
-#define N_PRELUDE_AS_BYTES_OBJ (1)
 #define N_XTENSAWIN (1)
 #define EXPORT_FUN(name) emit_native_xtensawin_##name
 #include "py/emitnative.c"

diff --git a/py/mpconfig.h b/py/mpconfig.h

@@ -394,8 +394,10 @@
 // Convenience definition for whether any native emitter is enabled
 #define MICROPY_EMIT_NATIVE (MICROPY_EMIT_X64 || MICROPY_EMIT_X86 || MICROPY_EMIT_THUMB || MICROPY_EMIT_ARM || MICROPY_EMIT_XTENSA || MICROPY_EMIT_XTENSAWIN)

-// Select prelude-as-bytes-object for certain emitters
-#define MICROPY_EMIT_NATIVE_PRELUDE_AS_BYTES_OBJ (MICROPY_EMIT_XTENSAWIN)
+// Some architectures cannot read byte-wise from executable memory.  In this case
+// the prelude for a native function (which usually sits after the machine code)
+// must be separated and placed somewhere where it can be read byte-wise.
+#define MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE (MICROPY_EMIT_XTENSAWIN)

 // Convenience definition for whether any inline assembler emitter is enabled
 #define MICROPY_EMIT_INLINE_ASM (MICROPY_EMIT_INLINE_THUMB || MICROPY_EMIT_INLINE_XTENSA)

diff --git a/py/nativeglue.c b/py/nativeglue.c

@@ -98,14 +98,13 @@ STATIC mp_obj_t native_gen_wrap_call(mp_obj_t self_in, size_t n_args, size_t n_kw, const mp_obj_t *args) {
     mp_obj_fun_bc_t *self_fun = MP_OBJ_TO_PTR(self_in);

     // Determine start of prelude.
-    uintptr_t prelude_offset = ((uintptr_t *)self_fun->bytecode)[0];
-    #if MICROPY_EMIT_NATIVE_PRELUDE_AS_BYTES_OBJ
-    // Prelude is in bytes object in const_table, at index prelude_offset
-    mp_obj_str_t *prelude_bytes = MP_OBJ_TO_PTR(self_fun->context->constants.obj_table[prelude_offset]);
-    const uint8_t *prelude_ptr = prelude_bytes->data;
-    #else
-    const uint8_t *prelude_ptr = self_fun->bytecode + prelude_offset;
-    #endif
+    uintptr_t prelude_ptr_index = ((uintptr_t *)self_fun->bytecode)[0];
+    const uint8_t *prelude_ptr;
+    if (prelude_ptr_index == 0) {
+        prelude_ptr = (void *)self_fun->child_table;
+    } else {
+        prelude_ptr = (void *)self_fun->child_table[prelude_ptr_index];
+    }

     // Extract n_state from the prelude.
     const uint8_t *ip = prelude_ptr;

diff --git a/py/persistentcode.c b/py/persistentcode.c

@@ -348,7 +348,7 @@ STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader) {
     // Load children if any.
     if (has_children) {
         n_children = read_uint(reader);
-        children = m_new(mp_raw_code_t *, n_children);
+        children = m_new(mp_raw_code_t *, n_children + (kind == MP_CODE_NATIVE_PY));
         for (size_t i = 0; i < n_children; ++i) {
             children[i] = load_raw_code(reader);
         }
@@ -372,6 +372,17 @@ STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader) {
     #if MICROPY_EMIT_MACHINE_CODE
     } else {
+        const uint8_t *prelude_ptr;
+        #if MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE
+        if (kind == MP_CODE_NATIVE_PY) {
+            // Executable code cannot be accessed byte-wise on this architecture, so copy
+            // the prelude to a separate memory region that is byte-wise readable.
+            void *buf = fun_data + prelude_offset;
+            size_t n = fun_data_len - prelude_offset;
+            prelude_ptr = memcpy(m_new(uint8_t, n), buf, n);
+        }
+        #endif
+
         // Relocate and commit code to executable address space
         reloc_info_t ri = {reader, rodata, bss};
         #if defined(MP_PLAT_COMMIT_EXEC)
@@ -395,6 +406,17 @@ STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader) {
         }
         #endif

+        if (kind == MP_CODE_NATIVE_PY) {
+            #if !MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE
+            prelude_ptr = fun_data + prelude_offset;
+            #endif
+            if (n_children == 0) {
+                children = (void *)prelude_ptr;
+            } else {
+                children[n_children] = (void *)prelude_ptr;
+            }
+        }
+
         // Assign native code to raw code object
         mp_emit_glue_assign_native(rc, kind,
             fun_data, fun_data_len,

diff --git a/tools/mpy-tool.py b/tools/mpy-tool.py

@@ -824,7 +824,7 @@ class RawCode(object):
         for rc in self.children:
             rc.disassemble()

-    def freeze_children(self):
+    def freeze_children(self, prelude_ptr=None):
         # Freeze children and generate table of children.
         if len(self.children):
             for rc in self.children:
@@ -834,10 +834,12 @@ class RawCode(object):
             print("static const mp_raw_code_t *const children_%s[] = {" % self.escaped_name)
             for rc in self.children:
                 print("    &raw_code_%s," % rc.escaped_name)
+            if prelude_ptr:
+                print("    (void *)%s," % prelude_ptr)
             print("};")
             print()

-    def freeze_raw_code(self, qstr_links=(), type_sig=0):
+    def freeze_raw_code(self, prelude_ptr=None, qstr_links=(), type_sig=0):
         # Generate mp_raw_code_t.
         print("static const mp_raw_code_t raw_code_%s = {" % self.escaped_name)
         print("    .kind = %s," % RawCode.code_kind_str[self.code_kind])
@@ -849,6 +851,8 @@ class RawCode(object):
         print("    #endif")
         if len(self.children):
             print("    .children = (void *)&children_%s," % self.escaped_name)
+        elif prelude_ptr:
+            print("    .children = (void *)%s," % prelude_ptr)
         else:
             print("    .children = NULL,")
         print("    #if MICROPY_PERSISTENT_CODE_SAVE")
@@ -1112,8 +1116,25 @@ class RawCodeNative(RawCode):
         print("};")

-        self.freeze_children()
-        self.freeze_raw_code(self.qstr_links, self.type_sig)
+        prelude_ptr = None
+        if self.code_kind == MP_CODE_NATIVE_PY:
+            prelude_ptr = "fun_data_%s_prelude_macro" % self.escaped_name
+            print("#if MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE")
+            n = len(self.fun_data) - self.prelude_offset
+            print("static const byte fun_data_%s_prelude[%u] = {" % (self.escaped_name, n), end="")
+            for i in range(n):
+                print(" 0x%02x," % self.fun_data[self.prelude_offset + i], end="")
+            print("};")
+            print("#define %s &fun_data_%s_prelude[0]" % (prelude_ptr, self.escaped_name))
+            print("#else")
+            print(
+                "#define %s &fun_data_%s[%u]"
+                % (prelude_ptr, self.escaped_name, self.prelude_offset)
+            )
+            print("#endif")
+
+        self.freeze_children(prelude_ptr)
+        self.freeze_raw_code(prelude_ptr, self.qstr_links, self.type_sig)


 class MPYSegment: