Merge pull request #2614 from tannewt/gc_multiblock_speedup

Track first free atbs for multiple block sizes instead of just 1
Dan Halbert 2020-02-12 16:04:34 -05:00 committed by GitHub
commit 01fb668904
6 changed files with 212 additions and 37 deletions
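
In short: the allocator previously cached a single "first possibly free" ATB index and scanned forward from it for every request. This commit keeps one such index per request size in blocks, capped at MICROPY_ATB_INDICES, so small allocations skip the fully occupied front of the heap without stealing the hint from larger sizes. Below is a minimal standalone sketch of that bookkeeping, ignoring the packing of four blocks per ATB byte; all names in it (ATB_INDICES, first_free_index, bucket_for, scan_start, note_alloc) are illustrative stand-ins, not the patch's code.

```
// Sketch only: a per-size "first possibly free" index cache. Names are
// hypothetical stand-ins (ATB_INDICES ~ MICROPY_ATB_INDICES); the packing of
// four 2-bit block states per ATB byte is ignored here.
#include <stddef.h>

#define ATB_INDICES 8

static size_t first_free_index[ATB_INDICES];

// Requests of ATB_INDICES blocks or more all share the last bucket.
static size_t bucket_for(size_t n_blocks) {
    return (n_blocks < ATB_INDICES ? n_blocks : ATB_INDICES) - 1;
}

// A search for n_blocks starts at its bucket's cached index instead of 0.
static size_t scan_start(size_t n_blocks) {
    return first_free_index[bucket_for(n_blocks)];
}

// After a run of n_blocks ending at found_block is allocated, no free run of
// n_blocks or more remains before it. Any longer run is also a run of
// n_blocks, so this bucket and every larger one can advance. Over-advancing
// is safe: the caches are heuristics and are reset to 0 on every GC sweep.
static void note_alloc(size_t found_block, size_t n_blocks) {
    for (size_t i = bucket_for(n_blocks); i < ATB_INDICES; i++) {
        first_free_index[i] = found_block + 1;
    }
}
```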

py/gc.c (81 changes)

@@ -150,9 +150,13 @@ void gc_init(void *start, void *end) {
     #endif

     // Set first free ATB index to the start of the heap.
-    MP_STATE_MEM(gc_first_free_atb_index) = 0;
+    for (size_t i = 0; i < MICROPY_ATB_INDICES; i++) {
+        MP_STATE_MEM(gc_first_free_atb_index)[i] = 0;
+    }

     // Set last free ATB index to the end of the heap.
     MP_STATE_MEM(gc_last_free_atb_index) = MP_STATE_MEM(gc_alloc_table_byte_len) - 1;

     // Set the lowest long lived ptr to the end of the heap to start. This will be lowered as long
     // lived objects are allocated.
     MP_STATE_MEM(gc_lowest_long_lived_ptr) = (void*) PTR_FROM_BLOCK(MP_STATE_MEM(gc_alloc_table_byte_len * BLOCKS_PER_ATB));
@@ -387,7 +391,9 @@ void gc_collect_root(void **ptrs, size_t len) {
 void gc_collect_end(void) {
     gc_deal_with_stack_overflow();
     gc_sweep();
-    MP_STATE_MEM(gc_first_free_atb_index) = 0;
+    for (size_t i = 0; i < MICROPY_ATB_INDICES; i++) {
+        MP_STATE_MEM(gc_first_free_atb_index)[i] = 0;
+    }
     MP_STATE_MEM(gc_last_free_atb_index) = MP_STATE_MEM(gc_alloc_table_byte_len) - 1;
     MP_STATE_MEM(gc_lock_depth)--;
     GC_EXIT();
@@ -513,14 +519,16 @@ void *gc_alloc(size_t n_bytes, bool has_finaliser, bool long_lived) {
     size_t crossover_block = BLOCK_FROM_PTR(MP_STATE_MEM(gc_lowest_long_lived_ptr));
     while (keep_looking) {
         int8_t direction = 1;
-        size_t start = MP_STATE_MEM(gc_first_free_atb_index);
+        size_t bucket = MIN(n_blocks, MICROPY_ATB_INDICES) - 1;
+        size_t first_free = MP_STATE_MEM(gc_first_free_atb_index)[bucket];
+        size_t start = first_free;
         if (long_lived) {
             direction = -1;
             start = MP_STATE_MEM(gc_last_free_atb_index);
         }
         n_free = 0;
         // look for a run of n_blocks available blocks
-        for (size_t i = start; keep_looking && MP_STATE_MEM(gc_first_free_atb_index) <= i && i <= MP_STATE_MEM(gc_last_free_atb_index); i += direction) {
+        for (size_t i = start; keep_looking && first_free <= i && i <= MP_STATE_MEM(gc_last_free_atb_index); i += direction) {
             byte a = MP_STATE_MEM(gc_alloc_table_start)[i];
             // Four ATB states are packed into a single byte.
             int j = 0;
@@ -565,22 +573,24 @@ void *gc_alloc(size_t n_bytes, bool has_finaliser, bool long_lived) {
         // Found free space ending at found_block inclusive.
         // Also, set last free ATB index to block after last block we found, for start of
-        // next scan. To reduce fragmentation, we only do this if we were looking
-        // for a single free block, which guarantees that there are no free blocks
-        // before this one. Also, whenever we free or shrink a block we must check
-        // if this index needs adjusting (see gc_realloc and gc_free).
+        // next scan. Also, whenever we free or shrink a block we must check if this index needs
+        // adjusting (see gc_realloc and gc_free).
         if (!long_lived) {
             end_block = found_block;
             start_block = found_block - n_free + 1;
-            if (n_blocks == 1) {
-                MP_STATE_MEM(gc_first_free_atb_index) = (found_block + 1) / BLOCKS_PER_ATB;
+            if (n_blocks < MICROPY_ATB_INDICES) {
+                size_t next_free_atb = (found_block + n_blocks) / BLOCKS_PER_ATB;
+                // Update all atb indices for larger blocks too.
+                for (size_t i = n_blocks - 1; i < MICROPY_ATB_INDICES; i++) {
+                    MP_STATE_MEM(gc_first_free_atb_index)[i] = next_free_atb;
+                }
             }
         } else {
             start_block = found_block;
             end_block = found_block + n_free - 1;
-            if (n_blocks == 1) {
-                MP_STATE_MEM(gc_last_free_atb_index) = (found_block - 1) / BLOCKS_PER_ATB;
-            }
+            // Always update the bounds of the long lived area because we assume it is contiguous. (It
+            // can still be reset by a sweep.)
+            MP_STATE_MEM(gc_last_free_atb_index) = (found_block - 1) / BLOCKS_PER_ATB;
         }

 #ifdef LOG_HEAP_ACTIVITY
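
To make the bucket arithmetic above concrete, with the default MICROPY_ATB_INDICES of 8 the lookup behaves as follows (an illustration, not part of the patch):

```
// size_t bucket = MIN(n_blocks, MICROPY_ATB_INDICES) - 1;
//   n_blocks = 1  -> bucket 0  (single-block requests keep a dedicated index)
//   n_blocks = 3  -> bucket 2
//   n_blocks = 8  -> bucket 7
//   n_blocks = 20 -> bucket 7  (every request of 8+ blocks shares the last index)
```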
@@ -676,30 +686,37 @@ void gc_free(void *ptr) {
     }

     // get the GC block number corresponding to this pointer
     assert(VERIFY_PTR(ptr));
-    size_t block = BLOCK_FROM_PTR(ptr);
-    assert(ATB_GET_KIND(block) == AT_HEAD);
+    size_t start_block = BLOCK_FROM_PTR(ptr);
+    assert(ATB_GET_KIND(start_block) == AT_HEAD);

 #if MICROPY_ENABLE_FINALISER
-    FTB_CLEAR(block);
+    FTB_CLEAR(start_block);
 #endif

-    // set the last_free pointer to this block if it's earlier in the heap
-    if (block / BLOCKS_PER_ATB < MP_STATE_MEM(gc_first_free_atb_index)) {
-        MP_STATE_MEM(gc_first_free_atb_index) = block / BLOCKS_PER_ATB;
-    }
-    if (block / BLOCKS_PER_ATB > MP_STATE_MEM(gc_last_free_atb_index)) {
-        MP_STATE_MEM(gc_last_free_atb_index) = block / BLOCKS_PER_ATB;
-    }

     // free head and all of its tail blocks
 #ifdef LOG_HEAP_ACTIVITY
-    gc_log_change(block, 0);
+    gc_log_change(start_block, 0);
 #endif
+    size_t block = start_block;
     do {
         ATB_ANY_TO_FREE(block);
         block += 1;
     } while (ATB_GET_KIND(block) == AT_TAIL);

+    // Update the first free pointer for our size only. Not much calls gc_free directly so there
+    // is decent chance we'll want to allocate this size again. By only updating the specific
+    // size we don't risk something smaller fitting in.
+    size_t n_blocks = block - start_block;
+    size_t bucket = MIN(n_blocks, MICROPY_ATB_INDICES) - 1;
+    size_t new_free_atb = start_block / BLOCKS_PER_ATB;
+    if (new_free_atb < MP_STATE_MEM(gc_first_free_atb_index)[bucket]) {
+        MP_STATE_MEM(gc_first_free_atb_index)[bucket] = new_free_atb;
+    }
+    // set the last_free pointer to this block if it's earlier in the heap
+    if (new_free_atb > MP_STATE_MEM(gc_last_free_atb_index)) {
+        MP_STATE_MEM(gc_last_free_atb_index) = new_free_atb;
+    }

     GC_EXIT();

 #if EXTENSIVE_HEAP_PROFILING
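
A worked example of the free path above, with invented numbers: freeing a 2-block allocation whose head is block 40, on a build where BLOCKS_PER_ATB is 4.

```
// Invented numbers for illustration only:
//   n_blocks     = block - start_block = 42 - 40 = 2
//   bucket       = MIN(2, MICROPY_ATB_INDICES) - 1 = 1
//   new_free_atb = start_block / BLOCKS_PER_ATB = 40 / 4 = 10
// Only gc_first_free_atb_index[1] can drop to 10. The single-block index is
// left alone, so a later 1-block request is not steered into this 2-block
// hole, which will likely be wanted at its own size again.
```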
@@ -870,11 +887,13 @@ void *gc_realloc(void *ptr_in, size_t n_bytes, bool allow_move) {
     }

     // set the last_free pointer to end of this block if it's earlier in the heap
-    if ((block + new_blocks) / BLOCKS_PER_ATB < MP_STATE_MEM(gc_first_free_atb_index)) {
-        MP_STATE_MEM(gc_first_free_atb_index) = (block + new_blocks) / BLOCKS_PER_ATB;
+    size_t new_free_atb = (block + new_blocks) / BLOCKS_PER_ATB;
+    size_t bucket = MIN(n_blocks - new_blocks, MICROPY_ATB_INDICES) - 1;
+    if (new_free_atb < MP_STATE_MEM(gc_first_free_atb_index)[bucket]) {
+        MP_STATE_MEM(gc_first_free_atb_index)[bucket] = new_free_atb;
     }
-    if ((block + new_blocks) / BLOCKS_PER_ATB > MP_STATE_MEM(gc_last_free_atb_index)) {
-        MP_STATE_MEM(gc_last_free_atb_index) = (block + new_blocks) / BLOCKS_PER_ATB;
+    if (new_free_atb > MP_STATE_MEM(gc_last_free_atb_index)) {
+        MP_STATE_MEM(gc_last_free_atb_index) = new_free_atb;
     }

     GC_EXIT();

py/mpconfig.h

@@ -244,6 +244,14 @@
 #define alloca(x) m_malloc(x)
 #endif

+// Number of atb indices to cache. Allocations of fewer blocks will be faster
+// because the search will be accelerated by the index cache. This only applies
+// to short lived allocations because we assume the long lived allocations are
+// contiguous.
+#ifndef MICROPY_ATB_INDICES
+#define MICROPY_ATB_INDICES (8)
+#endif
+
 /*****************************************************************************/
 /* MicroPython emitters */
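
Because the default is wrapped in #ifndef, a port can tune the cache size. A hypothetical override in a port's mpconfigport.h (not part of this commit) might look like:

```
// Hypothetical port tuning: each additional index costs one size_t in
// mp_state_mem_t but accelerates one more small allocation size class.
#define MICROPY_ATB_INDICES (16)
```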

py/mpstate.h

@@ -92,7 +92,7 @@ typedef struct _mp_state_mem_t {
     size_t gc_alloc_threshold;
     #endif

-    size_t gc_first_free_atb_index;
+    size_t gc_first_free_atb_index[MICROPY_ATB_INDICES];
     size_t gc_last_free_atb_index;

     #if MICROPY_PY_GC_COLLECT_RETVAL


@@ -13,7 +13,7 @@ correct port. GDB is usually :3333 and JLink is :2331.

 Now, run gdb from your port directory:

 ```
-arm-none-eabi-gdb -x ../tools/output_gc_until_repl.txt build-metro_m0_express/firmware.elf
+arm-none-eabi-gdb -x ../../tools/output_gc_until_repl.txt build-metro_m0_express/firmware.elf
 ```


@@ -0,0 +1,141 @@
import sys
import json

# Map start block to current allocation info.
current_heap = {}
allocation_history = []
root = {}

def change_root(trace, size):
    level = root
    for frame in reversed(trace):
        file_location = frame[1]
        if file_location not in level:
            level[file_location] = {"blocks": 0,
                                    "file": file_location,
                                    "function": frame[2],
                                    "subcalls": {}}
        level[file_location]["blocks"] += size
        level = level[file_location]["subcalls"]

total_actions = 0
non_single_block_streak = 0
max_nsbs = 0
last_action = None
last_total_actions = 0
count = 0
actions = {}
last_ticks_ms = 0
ticks_ms = 0
block_sizes = {}
allocation_sources = {}

with open(sys.argv[1], "r") as f:
    # Skip the log preamble up to the first blank line.
    for line in f:
        if not line.strip():
            break
    # Each record is one breakpoint hit: a backtrace plus the p/x values
    # printed by the gdb command script.
    for line in f:
        action = None
        if line.startswith("Breakpoint 2"):
            break
        next(f)  # throw away breakpoint code line
        # print(next(f)) # first frame
        block = 0
        size = 0
        trace = []
        for line in f:
            # print(line.strip())
            if line[0] == "#":
                frame = line.strip().split()
                if frame[1].startswith("0x"):
                    trace.append((frame[1], frame[-1], frame[3]))
                else:
                    trace.append(("0x0", frame[-1], frame[1]))
            elif line[0] == "$":
                #print(line.strip().split()[-1])
                block = int(line.strip().split()[-1][2:], 16)
                next_line = next(f)
                size = int(next_line.strip().split()[-1][2:], 16)
                # next_line = next(f)
                # ticks_ms = int(next_line.strip().split()[-1][2:], 16)
            if not line.strip():
                break
        action = "unknown"
        if block not in current_heap:
            current_heap[block] = {"start_block": block, "size": size, "start_trace": trace, "start_time": total_actions}
            action = "alloc"
            if size == 1:
                max_nsbs = max(max_nsbs, non_single_block_streak)
                non_single_block_streak = 0
            else:
                non_single_block_streak += 1
            #change_root(trace, size)
            if size not in block_sizes:
                block_sizes[size] = 0
            source = trace[-1][-1]
            if source not in allocation_sources:
                print(trace)
                allocation_sources[source] = 0
            allocation_sources[source] += 1
            block_sizes[size] += 1
        else:
            alloc = current_heap[block]
            alloc["end_trace"] = trace
            alloc["end_time"] = total_actions
            change_root(alloc["start_trace"], -1 * alloc["size"])
            if size > 0:
                action = "realloc"
                current_heap[block] = {"start_block": block, "size": size, "start_trace": trace, "start_time": total_actions}
                #change_root(trace, size)
            else:
                action = "free"
                if trace[0][2] == "gc_sweep":
                    action = "sweep"
                    non_single_block_streak = 0
                if (trace[3][2] == "py_gc_collect" or (trace[3][2] == "gc_deinit" and count > 1)) and last_action != "sweep":
                    print(ticks_ms - last_ticks_ms, total_actions - last_total_actions, "gc.collect", max_nsbs)
                    print(actions)
                    print(block_sizes)
                    print(allocation_sources)
                    actions = {}
                    block_sizes = {}
                    allocation_sources = {}
                    if count % 2 == 0:
                        print()
                    count += 1
                    last_total_actions = total_actions
                    last_ticks_ms = ticks_ms
                    max_nsbs = 0
                del current_heap[block]
            alloc["end_cause"] = action
            allocation_history.append(alloc)
        if action not in actions:
            actions[action] = 0
        actions[action] += 1
        last_action = action
        #print(total_actions, non_single_block_streak, action, block, size)
        total_actions += 1

print(actions)
print(max_nsbs)
print()

for alloc in current_heap.values():
    alloc["end_trace"] = ""
    alloc["end_time"] = total_actions
    allocation_history.append(alloc)

def print_frame(frame, indent=0):
    for key in sorted(frame):
        if not frame[key]["blocks"] or key.startswith("../py/malloc.c") or key.startswith("../py/gc.c"):
            continue
        print(" " * (indent - 1), key, frame[key]["function"], frame[key]["blocks"], "blocks")
        print_frame(frame[key]["subcalls"], indent + 2)

# print_frame(root)

# total_blocks = 0
# for key in sorted(root):
#     total_blocks += root[key]["blocks"]
# print(total_blocks, "total blocks")

# with open("allocation_history.json", "w") as f:
#     json.dump(allocation_history, f)

tools/output_gc_until_repl.txt

@@ -10,16 +10,23 @@ set logging on
 set remote hardware-breakpoint-limit 4

 # gc log
-break gc.c:103
+break gc.c:106
 commands
-backtrace
 p/x start_block
 p/x length
-append binary memory ram.bin &_srelocate &_estack
+p/x ticks_ms
+# backtrace output redirect is currently broken in gdb so we use up instead.
+# https://sourceware.org/bugzilla/show_bug.cgi?id=23439
+# backtrace
+up
+up
+up
+up
+# append binary memory ram.bin &_srelocate &_estack
 continue
 end

-break main.c:179
+break main.c:251
 continue