From 607e4a905ae8654bd8450ae137ffa24367039277 Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Sun, 11 Oct 2020 15:19:55 -0500 Subject: [PATCH 1/3] build: parallelize the creation of qstr.i.last Rather than simply invoking gcc in preprocessor mode with a list of files, use a Python script with the (python3) ThreadPoolExecutor to invoke the preprocessor in parallel. The amount of concurrency is the number of system CPUs, not the makefile "-j" parallelism setting, because there is no simple and correct way for a Python program to correctly work together with make's idea of parallelism. This reduces the build time of stm32f405 feather (a non-LTO build) from 16s to 12s on my 16-thread Ryzen machine. --- py/genlast.py | 38 ++++++++++++++++++++++++++++++++++++++ py/mkrules.mk | 2 +- 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 py/genlast.py diff --git a/py/genlast.py b/py/genlast.py new file mode 100644 index 0000000000..e14e7ef1e0 --- /dev/null +++ b/py/genlast.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 + +import sys +from concurrent.futures import ThreadPoolExecutor +import multiprocessing +import threading +import subprocess + + +def checkoutput1(args): + info = subprocess.run(args, check=True, stdout=subprocess.PIPE, input='') + return info.stdout + +idx1 = sys.argv.index('--') +idx2 = sys.argv.index('--', idx1+1) +check = sys.argv[1:idx1] +always = sys.argv[idx1+1:idx2] +command = sys.argv[idx2+1:] + +output_lock = threading.Lock() +def preprocess(fn): + output = checkoutput1(command + [fn]) + # Ensure our output doesn't interleave with others + # a threading.Lock is not a context manager object :( + try: + output_lock.acquire() + sys.stdout.buffer.write(output) + finally: + output_lock.release() + +def maybe_preprocess(fn): + if subprocess.call(["grep", "-lqE", "(MP_QSTR|translate)", fn]) == 0: + preprocess(fn) + +executor = ThreadPoolExecutor(max_workers=multiprocessing.cpu_count() + 1) +executor.map(maybe_preprocess, check) +executor.map(preprocess, always) +executor.shutdown() diff --git a/py/mkrules.mk b/py/mkrules.mk index 0b3ecc0e17..84e16ec960 100644 --- a/py/mkrules.mk +++ b/py/mkrules.mk @@ -78,7 +78,7 @@ $(OBJ): | $(HEADER_BUILD)/qstrdefs.enum.h $(HEADER_BUILD)/mpversion.h # - else, process all source files ($^) [this covers "make -B" which can set $? to empty] $(HEADER_BUILD)/qstr.i.last: $(SRC_QSTR) $(SRC_QSTR_PREPROCESSOR) $(QSTR_GLOBAL_DEPENDENCIES) | $(HEADER_BUILD)/mpversion.h $(STEPECHO) "GEN $@" - $(Q)grep -lE "(MP_QSTR|translate)" $(if $(filter $?,$(QSTR_GLOBAL_DEPENDENCIES)),$^,$(if $?,$?,$^)) | xargs $(CPP) $(QSTR_GEN_EXTRA_CFLAGS) $(CFLAGS) $(SRC_QSTR_PREPROCESSOR) >$(HEADER_BUILD)/qstr.i.last; + $(Q)$(PYTHON3) $(PY_SRC)/genlast.py $(if $(filter $?,$(QSTR_GLOBAL_DEPENDENCIES)),$^,$(if $?,$?,$^)) -- $(SRC_QSTR_PREPROCESSOR) -- $(CPP) $(QSTR_GEN_EXTRA_CFLAGS) $(CFLAGS) >$(HEADER_BUILD)/qstr.i.last; $(HEADER_BUILD)/qstr.split: $(HEADER_BUILD)/qstr.i.last $(PY_SRC)/makeqstrdefs.py $(STEPECHO) "GEN $@" From 479552ce564410f2d96855f5dbca223317c4cbe0 Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Sun, 11 Oct 2020 16:03:46 -0500 Subject: [PATCH 2/3] build: Make genlast write the "split" files This gets a further speedup of about 2s (12s -> 9.5s elapsed build time) for stm32f405_feather For what are probably historical reasons, the qstr process involves preprocessing a large number of source files into a single "qstr.i.last" file, then reading this and splitting it into one "qstr" file for each original source ("*.c") file. By eliminating the step of writing qstr.i.last as well as making the regular-expression-matching part be parallelized, build speed is further improved. Because the step to build QSTR_DEFS_COLLECTED does not access qstr.i.last, the path is replaced with "-" in the Makefile. --- py/genlast.py | 77 ++++++++++++++++++++++++++++++++++++--------------- py/mkrules.mk | 10 ++----- 2 files changed, 58 insertions(+), 29 deletions(-) diff --git a/py/genlast.py b/py/genlast.py index e14e7ef1e0..1df2a24825 100644 --- a/py/genlast.py +++ b/py/genlast.py @@ -1,38 +1,71 @@ #!/usr/bin/env python3 import sys -from concurrent.futures import ThreadPoolExecutor +import re +import os +import itertools +from concurrent.futures import ProcessPoolExecutor import multiprocessing import threading import subprocess +from makeqstrdefs import qstr_unescape, QSTRING_BLACK_LIST + +re_line = re.compile(r"#[line]*\s(\d+)\s\"([^\"]+)\"", re.DOTALL) +re_qstr = re.compile(r'MP_QSTR_[_a-zA-Z0-9]+', re.DOTALL) +re_translate = re.compile(r'translate\(\"((?:(?=(\\?))\2.)*?)\"\)', re.DOTALL) + +def write_out(fname, output_dir, output): + if output: + for m, r in [("/", "__"), ("\\", "__"), (":", "@"), ("..", "@@")]: + fname = fname.replace(m, r) + with open(output_dir + "/" + fname + ".qstr", "w") as f: + f.write("\n".join(output) + "\n") + +def process_file(fname, output_dir, content): + content = content.decode('utf-8', errors='ignore') + output = [] + for match in re_qstr.findall(content): + name = match.replace('MP_QSTR_', '') + if name not in QSTRING_BLACK_LIST: + output.append('Q(' + qstr_unescape(name) + ')') + for match in re_translate.findall(content): + output.append('TRANSLATE("' + match[0] + '")') + + write_out(fname, output_dir, output) + def checkoutput1(args): info = subprocess.run(args, check=True, stdout=subprocess.PIPE, input='') return info.stdout -idx1 = sys.argv.index('--') -idx2 = sys.argv.index('--', idx1+1) -check = sys.argv[1:idx1] -always = sys.argv[idx1+1:idx2] -command = sys.argv[idx2+1:] - -output_lock = threading.Lock() -def preprocess(fn): - output = checkoutput1(command + [fn]) - # Ensure our output doesn't interleave with others - # a threading.Lock is not a context manager object :( +def preprocess(command, output_dir, fn): try: - output_lock.acquire() - sys.stdout.buffer.write(output) - finally: - output_lock.release() + output = checkoutput1(command + [fn]) + process_file(fn, output_dir, output) + except Exception as e: + print(e, file=sys.stderr) -def maybe_preprocess(fn): +def maybe_preprocess(command, output_dir, fn): if subprocess.call(["grep", "-lqE", "(MP_QSTR|translate)", fn]) == 0: - preprocess(fn) + preprocess(command, output_dir, fn) -executor = ThreadPoolExecutor(max_workers=multiprocessing.cpu_count() + 1) -executor.map(maybe_preprocess, check) -executor.map(preprocess, always) -executor.shutdown() +if __name__ == '__main__': + + + idx1 = sys.argv.index('--') + idx2 = sys.argv.index('--', idx1+1) + output_dir = sys.argv[1] + check = sys.argv[2:idx1] + always = sys.argv[idx1+1:idx2] + command = sys.argv[idx2+1:] + + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + # Mac and Windows use 'spawn'. Uncomment this during testing to catch spawn-specific problems on Linux. + #multiprocessing.set_start_method("spawn") + executor = ProcessPoolExecutor(max_workers=multiprocessing.cpu_count() + 1) + executor.map(maybe_preprocess, itertools.repeat(command), itertools.repeat(output_dir), check) + executor.map(preprocess, itertools.repeat(command), itertools.repeat(output_dir), always) + executor.shutdown() diff --git a/py/mkrules.mk b/py/mkrules.mk index 84e16ec960..549b0e5e42 100644 --- a/py/mkrules.mk +++ b/py/mkrules.mk @@ -76,18 +76,14 @@ $(OBJ): | $(HEADER_BUILD)/qstrdefs.enum.h $(HEADER_BUILD)/mpversion.h # - if anything in QSTR_GLOBAL_DEPENDENCIES is newer, then process all source files ($^) # - else, if list of newer prerequisites ($?) is not empty, then process just these ($?) # - else, process all source files ($^) [this covers "make -B" which can set $? to empty] -$(HEADER_BUILD)/qstr.i.last: $(SRC_QSTR) $(SRC_QSTR_PREPROCESSOR) $(QSTR_GLOBAL_DEPENDENCIES) | $(HEADER_BUILD)/mpversion.h +$(HEADER_BUILD)/qstr.split: $(SRC_QSTR) $(SRC_QSTR_PREPROCESSOR) $(QSTR_GLOBAL_DEPENDENCIES) | $(HEADER_BUILD)/mpversion.h $(PY_SRC)/genlast.py $(STEPECHO) "GEN $@" - $(Q)$(PYTHON3) $(PY_SRC)/genlast.py $(if $(filter $?,$(QSTR_GLOBAL_DEPENDENCIES)),$^,$(if $?,$?,$^)) -- $(SRC_QSTR_PREPROCESSOR) -- $(CPP) $(QSTR_GEN_EXTRA_CFLAGS) $(CFLAGS) >$(HEADER_BUILD)/qstr.i.last; - -$(HEADER_BUILD)/qstr.split: $(HEADER_BUILD)/qstr.i.last $(PY_SRC)/makeqstrdefs.py - $(STEPECHO) "GEN $@" - $(Q)$(PYTHON3) $(PY_SRC)/makeqstrdefs.py split $(HEADER_BUILD)/qstr.i.last $(HEADER_BUILD)/qstr $(QSTR_DEFS_COLLECTED) + $(Q)$(PYTHON3) $(PY_SRC)/genlast.py $(HEADER_BUILD)/qstr $(if $(filter $?,$(QSTR_GLOBAL_DEPENDENCIES)),$^,$(if $?,$?,$^)) -- $(SRC_QSTR_PREPROCESSOR) -- $(CPP) $(QSTR_GEN_EXTRA_CFLAGS) $(CFLAGS) $(Q)touch $@ $(QSTR_DEFS_COLLECTED): $(HEADER_BUILD)/qstr.split $(PY_SRC)/makeqstrdefs.py $(STEPECHO) "GEN $@" - $(Q)$(PYTHON3) $(PY_SRC)/makeqstrdefs.py cat $(HEADER_BUILD)/qstr.i.last $(HEADER_BUILD)/qstr $(QSTR_DEFS_COLLECTED) + $(Q)$(PYTHON3) $(PY_SRC)/makeqstrdefs.py cat - $(HEADER_BUILD)/qstr $(QSTR_DEFS_COLLECTED) # $(sort $(var)) removes duplicates # From 8ddbebd2c1500ceba4b6628882f5a128743d323d Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Mon, 12 Oct 2020 13:35:15 -0500 Subject: [PATCH 3/3] esp32s2: fix build after qstr rule changes --- ports/esp32s2/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ports/esp32s2/Makefile b/ports/esp32s2/Makefile index a97ab0b00f..8d891edd02 100644 --- a/ports/esp32s2/Makefile +++ b/ports/esp32s2/Makefile @@ -269,7 +269,7 @@ menuconfig: $(BUILD)/esp-idf/config $(Q)grep -Fvxf $(DEBUG_SDKCONFIG) -f $(FLASH_SDKCONFIG) $(BUILD)/sdkconfig.diff > boards/$(BOARD)/sdkconfig # qstr builds include headers so we need to make sure they are up to date -$(HEADER_BUILD)/qstr.i.last: | $(BUILD)/esp-idf/config/sdkconfig.h +$(HEADER_BUILD)/qstr.split: | $(BUILD)/esp-idf/config/sdkconfig.h ESP_IDF_COMPONENTS_LINK = freertos log hal esp_system esp_adc_cal esp32s2 bootloader_support pthread esp_timer vfs spi_flash app_update esp_common esp32s2 heap newlib driver xtensa soc esp_ringbuf esp_wifi esp_event wpa_supplicant mbedtls efuse nvs_flash esp_netif lwip esp_rom esp-tls