|
3 | 3 | #include "Python.h"
|
4 | 4 |
|
5 | 5 | #include "pycore_abstract.h"
|
| 6 | +#include "pycore_bitutils.h" |
6 | 7 | #include "pycore_call.h"
|
7 | 8 | #include "pycore_ceval.h"
|
8 | 9 | #include "pycore_critical_section.h"
|
@@ -113,6 +114,21 @@ mark_executable(unsigned char *memory, size_t size)
|
113 | 114 |
|
114 | 115 | // JIT compiler stuff: /////////////////////////////////////////////////////////
|
115 | 116 |
|
// Number of 32-bit words in a symbol mask (supports up to 128 symbol ordinals).
#define SYMBOL_MASK_WORDS 4

// Bitset over symbol ordinals: bit (ordinal % 32) of word (ordinal / 32) is
// set when a trampoline for that symbol is required.
typedef uint32_t symbol_mask[SYMBOL_MASK_WORDS];

// Describes the block of memory holding the generated trampolines.
typedef struct {
    unsigned char *mem;  // Base address of the trampoline area.
    symbol_mask mask;    // Which symbol ordinals have a trampoline allocated.
    size_t size;         // Total size of the trampoline area, in bytes.
} trampoline_state;

// Per-compilation state threaded through the stencil emitters.
typedef struct {
    trampoline_state trampolines;
    // Absolute start address of each emitted instruction in the trace.
    uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH];
} jit_state;
| 131 | + |
116 | 132 | // Warning! AArch64 requires you to get your hands dirty. These are your gloves:
|
117 | 133 |
|
118 | 134 | // value[value_start : value_start + len]
|
@@ -390,66 +406,126 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value)
|
390 | 406 | patch_32r(location, value);
|
391 | 407 | }
|
392 | 408 |
|
| 409 | +void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state); |
| 410 | + |
393 | 411 | #include "jit_stencils.h"
|
394 | 412 |
|
| 413 | +#if defined(__aarch64__) || defined(_M_ARM64) |
| 414 | + #define TRAMPOLINE_SIZE 16 |
| 415 | +#else |
| 416 | + #define TRAMPOLINE_SIZE 0 |
| 417 | +#endif |
| 418 | + |
| 419 | +// Generate and patch AArch64 trampolines. The symbols to jump to are stored |
| 420 | +// in the jit_stencils.h in the symbols_map. |
| 421 | +void |
| 422 | +patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state) |
| 423 | +{ |
| 424 | + // Masking is done modulo 32 as the mask is stored as an array of uint32_t |
| 425 | + const uint32_t symbol_mask = 1 << (ordinal % 32); |
| 426 | + const uint32_t trampoline_mask = state->trampolines.mask[ordinal / 32]; |
| 427 | + assert(symbol_mask & trampoline_mask); |
| 428 | + |
| 429 | + // Count the number of set bits in the trampoline mask lower than ordinal, |
| 430 | + // this gives the index into the array of trampolines. |
| 431 | + int index = _Py_popcount32(trampoline_mask & (symbol_mask - 1)); |
| 432 | + for (int i = 0; i < ordinal / 32; i++) { |
| 433 | + index += _Py_popcount32(state->trampolines.mask[i]); |
| 434 | + } |
| 435 | + |
| 436 | + uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE); |
| 437 | + assert((size_t)(index + 1) * TRAMPOLINE_SIZE <= state->trampolines.size); |
| 438 | + |
| 439 | + uint64_t value = (uintptr_t)symbols_map[ordinal]; |
| 440 | + |
| 441 | + /* Generate the trampoline |
| 442 | + 0: 58000048 ldr x8, 8 |
| 443 | + 4: d61f0100 br x8 |
| 444 | + 8: 00000000 // The next two words contain the 64-bit address to jump to. |
| 445 | + c: 00000000 |
| 446 | + */ |
| 447 | + p[0] = 0x58000048; |
| 448 | + p[1] = 0xD61F0100; |
| 449 | + p[2] = value & 0xffffffff; |
| 450 | + p[3] = value >> 32; |
| 451 | + |
| 452 | + patch_aarch64_26r(location, (uintptr_t)p); |
| 453 | +} |
| 454 | + |
| 455 | +static void |
| 456 | +combine_symbol_mask(const symbol_mask src, symbol_mask dest) |
| 457 | +{ |
| 458 | + // Calculate the union of the trampolines required by each StencilGroup |
| 459 | + for (size_t i = 0; i < SYMBOL_MASK_WORDS; i++) { |
| 460 | + dest[i] |= src[i]; |
| 461 | + } |
| 462 | +} |
| 463 | + |
395 | 464 | // Compiles executor in-place. Don't forget to call _PyJIT_Free later!
|
396 | 465 | int
|
397 | 466 | _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], size_t length)
|
398 | 467 | {
|
399 | 468 | const StencilGroup *group;
|
400 | 469 | // Loop once to find the total compiled size:
|
401 |
| - uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; |
402 | 470 | size_t code_size = 0;
|
403 | 471 | size_t data_size = 0;
|
| 472 | + jit_state state = {}; |
404 | 473 | group = &trampoline;
|
405 | 474 | code_size += group->code_size;
|
406 | 475 | data_size += group->data_size;
|
407 | 476 | for (size_t i = 0; i < length; i++) {
|
408 | 477 | const _PyUOpInstruction *instruction = &trace[i];
|
409 | 478 | group = &stencil_groups[instruction->opcode];
|
410 |
| - instruction_starts[i] = code_size; |
| 479 | + state.instruction_starts[i] = code_size; |
411 | 480 | code_size += group->code_size;
|
412 | 481 | data_size += group->data_size;
|
| 482 | + combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); |
413 | 483 | }
|
414 | 484 | group = &stencil_groups[_FATAL_ERROR];
|
415 | 485 | code_size += group->code_size;
|
416 | 486 | data_size += group->data_size;
|
| 487 | + combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); |
| 488 | + // Calculate the size of the trampolines required by the whole trace |
| 489 | + for (size_t i = 0; i < Py_ARRAY_LENGTH(state.trampolines.mask); i++) { |
| 490 | + state.trampolines.size += _Py_popcount32(state.trampolines.mask[i]) * TRAMPOLINE_SIZE; |
| 491 | + } |
417 | 492 | // Round up to the nearest page:
|
418 | 493 | size_t page_size = get_page_size();
|
419 | 494 | assert((page_size & (page_size - 1)) == 0);
|
420 |
| - size_t padding = page_size - ((code_size + data_size) & (page_size - 1)); |
421 |
| - size_t total_size = code_size + data_size + padding; |
| 495 | + size_t padding = page_size - ((code_size + data_size + state.trampolines.size) & (page_size - 1)); |
| 496 | + size_t total_size = code_size + data_size + state.trampolines.size + padding; |
422 | 497 | unsigned char *memory = jit_alloc(total_size);
|
423 | 498 | if (memory == NULL) {
|
424 | 499 | return -1;
|
425 | 500 | }
|
426 | 501 | // Update the offsets of each instruction:
|
427 | 502 | for (size_t i = 0; i < length; i++) {
|
428 |
| - instruction_starts[i] += (uintptr_t)memory; |
| 503 | + state.instruction_starts[i] += (uintptr_t)memory; |
429 | 504 | }
|
430 | 505 | // Loop again to emit the code:
|
431 | 506 | unsigned char *code = memory;
|
432 | 507 | unsigned char *data = memory + code_size;
|
| 508 | + state.trampolines.mem = memory + code_size + data_size; |
433 | 509 | // Compile the trampoline, which handles converting between the native
|
434 | 510 | // calling convention and the calling convention used by jitted code
|
435 | 511 | // (which may be different for efficiency reasons). On platforms where
|
436 | 512 | // we don't change calling conventions, the trampoline is empty and
|
437 | 513 | // nothing is emitted here:
|
438 | 514 | group = &trampoline;
|
439 |
| - group->emit(code, data, executor, NULL, instruction_starts); |
| 515 | + group->emit(code, data, executor, NULL, &state); |
440 | 516 | code += group->code_size;
|
441 | 517 | data += group->data_size;
|
442 | 518 | assert(trace[0].opcode == _START_EXECUTOR);
|
443 | 519 | for (size_t i = 0; i < length; i++) {
|
444 | 520 | const _PyUOpInstruction *instruction = &trace[i];
|
445 | 521 | group = &stencil_groups[instruction->opcode];
|
446 |
| - group->emit(code, data, executor, instruction, instruction_starts); |
| 522 | + group->emit(code, data, executor, instruction, &state); |
447 | 523 | code += group->code_size;
|
448 | 524 | data += group->data_size;
|
449 | 525 | }
|
450 | 526 | // Protect against accidental buffer overrun into data:
|
451 | 527 | group = &stencil_groups[_FATAL_ERROR];
|
452 |
| - group->emit(code, data, executor, NULL, instruction_starts); |
| 528 | + group->emit(code, data, executor, NULL, &state); |
453 | 529 | code += group->code_size;
|
454 | 530 | data += group->data_size;
|
455 | 531 | assert(code == memory + code_size);
|
|
0 commit comments