Skip to content

Commit 01fb668

Browse files
authored
Merge pull request #2614 from tannewt/gc_multiblock_speedup
Track first free atbs for multiple block sizes instead of just 1
2 parents 35abc48 + 36e6cc8 commit 01fb668

File tree

6 files changed

+212
-37
lines changed

6 files changed

+212
-37
lines changed

py/gc.c

Lines changed: 50 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,13 @@ void gc_init(void *start, void *end) {
150150
#endif
151151

152152
// Set first free ATB index to the start of the heap.
153-
MP_STATE_MEM(gc_first_free_atb_index) = 0;
153+
for (size_t i = 0; i < MICROPY_ATB_INDICES; i++) {
154+
MP_STATE_MEM(gc_first_free_atb_index)[i] = 0;
155+
}
156+
154157
// Set last free ATB index to the end of the heap.
155158
MP_STATE_MEM(gc_last_free_atb_index) = MP_STATE_MEM(gc_alloc_table_byte_len) - 1;
159+
156160
// Set the lowest long lived ptr to the end of the heap to start. This will be lowered as long
157161
// lived objects are allocated.
158162
MP_STATE_MEM(gc_lowest_long_lived_ptr) = (void*) PTR_FROM_BLOCK(MP_STATE_MEM(gc_alloc_table_byte_len * BLOCKS_PER_ATB));
@@ -387,7 +391,9 @@ void gc_collect_root(void **ptrs, size_t len) {
387391
void gc_collect_end(void) {
388392
gc_deal_with_stack_overflow();
389393
gc_sweep();
390-
MP_STATE_MEM(gc_first_free_atb_index) = 0;
394+
for (size_t i = 0; i < MICROPY_ATB_INDICES; i++) {
395+
MP_STATE_MEM(gc_first_free_atb_index)[i] = 0;
396+
}
391397
MP_STATE_MEM(gc_last_free_atb_index) = MP_STATE_MEM(gc_alloc_table_byte_len) - 1;
392398
MP_STATE_MEM(gc_lock_depth)--;
393399
GC_EXIT();
@@ -513,14 +519,16 @@ void *gc_alloc(size_t n_bytes, bool has_finaliser, bool long_lived) {
513519
size_t crossover_block = BLOCK_FROM_PTR(MP_STATE_MEM(gc_lowest_long_lived_ptr));
514520
while (keep_looking) {
515521
int8_t direction = 1;
516-
size_t start = MP_STATE_MEM(gc_first_free_atb_index);
522+
size_t bucket = MIN(n_blocks, MICROPY_ATB_INDICES) - 1;
523+
size_t first_free = MP_STATE_MEM(gc_first_free_atb_index)[bucket];
524+
size_t start = first_free;
517525
if (long_lived) {
518526
direction = -1;
519527
start = MP_STATE_MEM(gc_last_free_atb_index);
520528
}
521529
n_free = 0;
522530
// look for a run of n_blocks available blocks
523-
for (size_t i = start; keep_looking && MP_STATE_MEM(gc_first_free_atb_index) <= i && i <= MP_STATE_MEM(gc_last_free_atb_index); i += direction) {
531+
for (size_t i = start; keep_looking && first_free <= i && i <= MP_STATE_MEM(gc_last_free_atb_index); i += direction) {
524532
byte a = MP_STATE_MEM(gc_alloc_table_start)[i];
525533
// Four ATB states are packed into a single byte.
526534
int j = 0;
@@ -565,22 +573,24 @@ void *gc_alloc(size_t n_bytes, bool has_finaliser, bool long_lived) {
565573

566574
// Found free space ending at found_block inclusive.
567575
// Also, set last free ATB index to block after last block we found, for start of
568-
// next scan. To reduce fragmentation, we only do this if we were looking
569-
// for a single free block, which guarantees that there are no free blocks
570-
// before this one. Also, whenever we free or shrink a block we must check
571-
// if this index needs adjusting (see gc_realloc and gc_free).
576+
// next scan. Also, whenever we free or shrink a block we must check if this index needs
577+
// adjusting (see gc_realloc and gc_free).
572578
if (!long_lived) {
573579
end_block = found_block;
574580
start_block = found_block - n_free + 1;
575-
if (n_blocks == 1) {
576-
MP_STATE_MEM(gc_first_free_atb_index) = (found_block + 1) / BLOCKS_PER_ATB;
581+
if (n_blocks < MICROPY_ATB_INDICES) {
582+
size_t next_free_atb = (found_block + n_blocks) / BLOCKS_PER_ATB;
583+
// Update all atb indices for larger blocks too.
584+
for (size_t i = n_blocks - 1; i < MICROPY_ATB_INDICES; i++) {
585+
MP_STATE_MEM(gc_first_free_atb_index)[i] = next_free_atb;
586+
}
577587
}
578588
} else {
579589
start_block = found_block;
580590
end_block = found_block + n_free - 1;
581-
if (n_blocks == 1) {
582-
MP_STATE_MEM(gc_last_free_atb_index) = (found_block - 1) / BLOCKS_PER_ATB;
583-
}
591+
// Always update the bounds of the long lived area because we assume it is contiguous. (It
592+
// can still be reset by a sweep.)
593+
MP_STATE_MEM(gc_last_free_atb_index) = (found_block - 1) / BLOCKS_PER_ATB;
584594
}
585595

586596
#ifdef LOG_HEAP_ACTIVITY
@@ -676,30 +686,37 @@ void gc_free(void *ptr) {
676686
}
677687
// get the GC block number corresponding to this pointer
678688
assert(VERIFY_PTR(ptr));
679-
size_t block = BLOCK_FROM_PTR(ptr);
680-
assert(ATB_GET_KIND(block) == AT_HEAD);
689+
size_t start_block = BLOCK_FROM_PTR(ptr);
690+
assert(ATB_GET_KIND(start_block) == AT_HEAD);
681691

682692
#if MICROPY_ENABLE_FINALISER
683-
FTB_CLEAR(block);
693+
FTB_CLEAR(start_block);
684694
#endif
685695

686-
// set the last_free pointer to this block if it's earlier in the heap
687-
if (block / BLOCKS_PER_ATB < MP_STATE_MEM(gc_first_free_atb_index)) {
688-
MP_STATE_MEM(gc_first_free_atb_index) = block / BLOCKS_PER_ATB;
689-
}
690-
if (block / BLOCKS_PER_ATB > MP_STATE_MEM(gc_last_free_atb_index)) {
691-
MP_STATE_MEM(gc_last_free_atb_index) = block / BLOCKS_PER_ATB;
692-
}
693-
694696
// free head and all of its tail blocks
695-
#ifdef LOG_HEAP_ACTIVITY
696-
gc_log_change(block, 0);
697-
#endif
697+
#ifdef LOG_HEAP_ACTIVITY
698+
gc_log_change(start_block, 0);
699+
#endif
700+
size_t block = start_block;
698701
do {
699702
ATB_ANY_TO_FREE(block);
700703
block += 1;
701704
} while (ATB_GET_KIND(block) == AT_TAIL);
702705

706+
// Update the first free pointer for our size only. Few callers invoke gc_free directly, so there
707+
// is a decent chance we'll want to allocate this size again. By only updating the specific
708+
// size we don't risk something smaller fitting in.
709+
size_t n_blocks = block - start_block;
710+
size_t bucket = MIN(n_blocks, MICROPY_ATB_INDICES) - 1;
711+
size_t new_free_atb = start_block / BLOCKS_PER_ATB;
712+
if (new_free_atb < MP_STATE_MEM(gc_first_free_atb_index)[bucket]) {
713+
MP_STATE_MEM(gc_first_free_atb_index)[bucket] = new_free_atb;
714+
}
715+
// set the last_free pointer to this block if it's later in the heap
716+
if (new_free_atb > MP_STATE_MEM(gc_last_free_atb_index)) {
717+
MP_STATE_MEM(gc_last_free_atb_index) = new_free_atb;
718+
}
719+
703720
GC_EXIT();
704721

705722
#if EXTENSIVE_HEAP_PROFILING
@@ -870,11 +887,13 @@ void *gc_realloc(void *ptr_in, size_t n_bytes, bool allow_move) {
870887
}
871888

872889
// update the first_free index (if the end of this block is earlier in the heap) or the last_free index (if it is later)
873-
if ((block + new_blocks) / BLOCKS_PER_ATB < MP_STATE_MEM(gc_first_free_atb_index)) {
874-
MP_STATE_MEM(gc_first_free_atb_index) = (block + new_blocks) / BLOCKS_PER_ATB;
890+
size_t new_free_atb = (block + new_blocks) / BLOCKS_PER_ATB;
891+
size_t bucket = MIN(n_blocks - new_blocks, MICROPY_ATB_INDICES) - 1;
892+
if (new_free_atb < MP_STATE_MEM(gc_first_free_atb_index)[bucket]) {
893+
MP_STATE_MEM(gc_first_free_atb_index)[bucket] = new_free_atb;
875894
}
876-
if ((block + new_blocks) / BLOCKS_PER_ATB > MP_STATE_MEM(gc_last_free_atb_index)) {
877-
MP_STATE_MEM(gc_last_free_atb_index) = (block + new_blocks) / BLOCKS_PER_ATB;
895+
if (new_free_atb > MP_STATE_MEM(gc_last_free_atb_index)) {
896+
MP_STATE_MEM(gc_last_free_atb_index) = new_free_atb;
878897
}
879898

880899
GC_EXIT();

py/mpconfig.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,14 @@
244244
#define alloca(x) m_malloc(x)
245245
#endif
246246

247+
// Number of atb indices to cache. Allocations of fewer blocks than this will be faster
248+
// because the search will be accelerated by the index cache. This only applies
249+
// to short lived allocations because we assume the long lived allocations are
250+
// contiguous.
251+
#ifndef MICROPY_ATB_INDICES
252+
#define MICROPY_ATB_INDICES (8)
253+
#endif
254+
247255
/*****************************************************************************/
248256
/* MicroPython emitters */
249257

py/mpstate.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ typedef struct _mp_state_mem_t {
9292
size_t gc_alloc_threshold;
9393
#endif
9494

95-
size_t gc_first_free_atb_index;
95+
size_t gc_first_free_atb_index[MICROPY_ATB_INDICES];
9696
size_t gc_last_free_atb_index;
9797

9898
#if MICROPY_PY_GC_COLLECT_RETVAL

tools/gc_activity.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ correct port. GDB is usually :3333 and JLink is :2331.
1313
Now, run gdb from your port directory:
1414

1515
```
16-
arm-none-eabi-gdb -x ../tools/output_gc_until_repl.txt build-metro_m0_express/firmware.elf
16+
arm-none-eabi-gdb -x ../../tools/output_gc_until_repl.txt build-metro_m0_express/firmware.elf
1717
```
1818

1919
This will take a little time while it breaks, backtraces and continues for every

tools/gc_activity_between_collects.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
import sys
2+
import json
3+
4+
# Map start block to current allocation info.
5+
current_heap = {}
6+
allocation_history = []
7+
root = {}
8+
9+
def change_root(trace, size):
10+
level = root
11+
for frame in reversed(trace):
12+
file_location = frame[1]
13+
if file_location not in level:
14+
level[file_location] = {"blocks": 0,
15+
"file": file_location,
16+
"function": frame[2],
17+
"subcalls": {}}
18+
level[file_location]["blocks"] += size
19+
level = level[file_location]["subcalls"]
20+
21+
total_actions = 0
22+
non_single_block_streak = 0
23+
max_nsbs = 0
24+
last_action = None
25+
last_total_actions = 0
26+
count = 0
27+
actions = {}
28+
last_ticks_ms = 0
29+
ticks_ms = 0
30+
block_sizes = {}
31+
allocation_sources = {}
32+
with open(sys.argv[1], "r") as f:
33+
for line in f:
34+
if not line.strip():
35+
break
36+
for line in f:
37+
action = None
38+
if line.startswith("Breakpoint 2"):
39+
break
40+
next(f) # throw away breakpoint code line
41+
# print(next(f)) # first frame
42+
block = 0
43+
size = 0
44+
trace = []
45+
for line in f:
46+
# print(line.strip())
47+
if line[0] == "#":
48+
frame = line.strip().split()
49+
if frame[1].startswith("0x"):
50+
trace.append((frame[1], frame[-1], frame[3]))
51+
else:
52+
trace.append(("0x0", frame[-1], frame[1]))
53+
elif line[0] == "$":
54+
#print(line.strip().split()[-1])
55+
block = int(line.strip().split()[-1][2:], 16)
56+
next_line = next(f)
57+
size = int(next_line.strip().split()[-1][2:], 16)
58+
# next_line = next(f)
59+
# ticks_ms = int(next_line.strip().split()[-1][2:], 16)
60+
if not line.strip():
61+
break
62+
63+
action = "unknown"
64+
if block not in current_heap:
65+
current_heap[block] = {"start_block": block, "size": size, "start_trace": trace, "start_time": total_actions}
66+
action = "alloc"
67+
if size == 1:
68+
max_nsbs = max(max_nsbs, non_single_block_streak)
69+
non_single_block_streak = 0
70+
else:
71+
non_single_block_streak += 1
72+
#change_root(trace, size)
73+
if size not in block_sizes:
74+
block_sizes[size] = 0
75+
source = trace[-1][-1]
76+
if source not in allocation_sources:
77+
print(trace)
78+
allocation_sources[source] = 0
79+
allocation_sources[source] += 1
80+
block_sizes[size] += 1
81+
else:
82+
alloc = current_heap[block]
83+
alloc["end_trace"] = trace
84+
alloc["end_time"] = total_actions
85+
change_root(alloc["start_trace"], -1 * alloc["size"])
86+
if size > 0:
87+
action = "realloc"
88+
current_heap[block] = {"start_block": block, "size": size, "start_trace": trace, "start_time": total_actions}
89+
#change_root(trace, size)
90+
else:
91+
action = "free"
92+
if trace[0][2] == "gc_sweep":
93+
action = "sweep"
94+
non_single_block_streak = 0
95+
if (trace[3][2] == "py_gc_collect" or (trace[3][2] == "gc_deinit" and count > 1)) and last_action != "sweep":
96+
print(ticks_ms - last_ticks_ms, total_actions - last_total_actions, "gc.collect", max_nsbs)
97+
print(actions)
98+
print(block_sizes)
99+
print(allocation_sources)
100+
actions = {}
101+
block_sizes = {}
102+
allocation_sources = {}
103+
if count % 2 == 0:
104+
print()
105+
count += 1
106+
last_total_actions = total_actions
107+
last_ticks_ms = ticks_ms
108+
max_nsbs = 0
109+
del current_heap[block]
110+
alloc["end_cause"] = action
111+
allocation_history.append(alloc)
112+
if action not in actions:
113+
actions[action] = 0
114+
actions[action] += 1
115+
last_action = action
116+
#print(total_actions, non_single_block_streak, action, block, size)
117+
total_actions += 1
118+
print(actions)
119+
print(max_nsbs)
120+
print()
121+
122+
for alloc in current_heap.values():
123+
alloc["end_trace"] = ""
124+
alloc["end_time"] = total_actions
125+
allocation_history.append(alloc)
126+
127+
def print_frame(frame, indent=0):
128+
for key in sorted(frame):
129+
if not frame[key]["blocks"] or key.startswith("../py/malloc.c") or key.startswith("../py/gc.c"):
130+
continue
131+
print(" " * (indent - 1), key, frame[key]["function"], frame[key]["blocks"], "blocks")
132+
print_frame(frame[key]["subcalls"], indent + 2)
133+
134+
# print_frame(root)
135+
# total_blocks = 0
136+
# for key in sorted(root):
137+
# total_blocks += root[key]["blocks"]
138+
# print(total_blocks, "total blocks")
139+
140+
# with open("allocation_history.json", "w") as f:
141+
# json.dump(allocation_history, f)

tools/output_gc_until_repl.txt

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,23 @@ set logging on
1010
set remote hardware-breakpoint-limit 4
1111

1212
# gc log
13-
break gc.c:103
13+
break gc.c:106
1414
commands
15-
backtrace
1615
p/x start_block
1716
p/x length
18-
append binary memory ram.bin &_srelocate &_estack
17+
p/x ticks_ms
18+
# backtrace output redirect is currently broken in gdb so we use up instead.
19+
# https://sourceware.org/bugzilla/show_bug.cgi?id=23439
20+
# backtrace
21+
up
22+
up
23+
up
24+
up
25+
# append binary memory ram.bin &_srelocate &_estack
1926
continue
2027
end
2128

22-
break main.c:179
29+
break main.c:251
2330

2431
continue
2532

0 commit comments

Comments
 (0)