# Test that native code loaded from a .mpy file is retained after a GC.

try:
    import gc
    import io
    import os
    import sys

    # Probe for the MicroPython-specific features this test needs; any
    # missing one raises AttributeError and we skip.
    sys.implementation._mpy
    io.IOBase
    os.mount
except (ImportError, AttributeError):
    print("SKIP")
    raise SystemExit
|
|
|
|
2023-08-22 11:15:46 -04:00
|
|
|
class UserFile(io.IOBase):
|
2020-07-27 09:52:38 -04:00
|
|
|
def __init__(self, data):
|
|
|
|
self.data = memoryview(data)
|
|
|
|
self.pos = 0
|
|
|
|
|
|
|
|
def readinto(self, buf):
|
|
|
|
n = min(len(buf), len(self.data) - self.pos)
|
|
|
|
buf[:n] = self.data[self.pos : self.pos + n]
|
|
|
|
self.pos += n
|
|
|
|
return n
|
|
|
|
|
|
|
|
def ioctl(self, req, arg):
|
|
|
|
return 0
|
|
|
|
|
|
|
|
|
|
|
class UserFS:
    """Minimal virtual filesystem exposing a fixed dict of path -> bytes."""

    def __init__(self, files):
        self.files = files

    def mount(self, readonly, mksfs):
        # Nothing to prepare for an in-memory filesystem.
        pass

    def umount(self):
        # Nothing to tear down.
        pass

    def stat(self, path):
        """Return a stat tuple for a known path, else raise OSError."""
        if path not in self.files:
            raise OSError
        # 32768 (S_IFREG): report every known path as a regular file.
        return (32768, 0, 0, 0, 0, 0, 0, 0, 0, 0)

    def open(self, path, mode):
        """Open a stored file for reading (mode is ignored)."""
        return UserFile(self.files[path])
# Pre-compiled examples/natmod/features0 example for various architectures, keyed
# by the required value of sys.implementation._mpy (without sub-version).
# Regenerate the payloads with:
#   cd examples/natmod/features0
#   make clean
#   make ARCH=x64 # or ARCH=armv6m
#   cat features0.mpy | python -c 'import sys; print(sys.stdin.buffer.read())'
features0_file_contents = {
    # -march=x64
    0x806: b'C\x06\t\x1f\x02\x004build/features0.native.mpy\x00\x12factorial\x00\x8a\x02\xe9/\x00\x00\x00SH\x8b\x1d\x83\x00\x00\x00\xbe\x02\x00\x00\x00\xffS\x18\xbf\x01\x00\x00\x00H\x85\xc0u\x0cH\x8bC \xbe\x02\x00\x00\x00[\xff\xe0H\x0f\xaf\xf8H\xff\xc8\xeb\xe6ATUSH\x8b\x1dQ\x00\x00\x00H\x8bG\x08L\x8bc(H\x8bx\x08A\xff\xd4H\x8d5+\x00\x00\x00H\x89\xc5H\x8b\x059\x00\x00\x00\x0f\xb7x\x02\xffShH\x89\xefA\xff\xd4H\x8b\x03[]A\\\xc3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00 \x11$\r&\xa5 \x01"\xff',
    # -march=armv6m
    0x1006: b'C\x06\x11\x1f\x02\x004build/features0.native.mpy\x00\x12factorial\x00\x88"\x1a\xe0\x00\x00\x13\xb5\nK\nJ{D\x9cX\x02!\xe3h\x01\x93\x98G\x03\x00\x01 \x00+\x02\xd0XC\x01;\xfa\xe7#i\x02!\x01\x93\x98G\x16\xbd\xc0Fn\x00\x00\x00\x00\x00\x00\x00\xf7\xb5\nN\nK~D\xf4XChgiXh\xb8G\x05\x00\x07K\x08I\xf3XyDX\x88\x01\x93ck\x98G(\x00\xb8G h\xfe\xbd:\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00 \x11>\r@\xa5:\x01<\xff',
}

# Populate armv7m-derived archs based on armv6m (same payload object).
features0_file_contents.update(
    {arch: features0_file_contents[0x1006] for arch in (0x1406, 0x1806, 0x1C06, 0x2006)}
)
# Check that a .mpy exists for the target (ignore sub-version in lookup).
# Bits 8-9 of sys.implementation._mpy carry the sub-version; mask them off
# so the lookup matches the keys above.
sys_implementation_mpy = sys.implementation._mpy & ~0x300
if sys_implementation_mpy not in features0_file_contents:
    print("SKIP")
    raise SystemExit
# These are the test .mpy files.
user_files = {"/features0.mpy": features0_file_contents[sys_implementation_mpy]}

# Create and mount a user filesystem, then make it importable.
fs = UserFS(user_files)
os.mount(fs, "/userfs")
sys.path.append("/userfs")
# Import the native function.
gc.collect()
from features0 import factorial

# Free the module that contained the function.
del sys.modules["features0"]

# Run a GC cycle which should reclaim the module but not the function.
gc.collect()

# Allocate lots of fragmented memory to overwrite anything that was just
# freed by the GC.
for _ in range(1000):
    []

# Run the native function, it should not have been freed or overwritten.
print(factorial(10))

# Unmount and undo path addition.
os.umount("/userfs")
sys.path.pop()