-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode #130037
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Diana Picus (rovka) Changes: In dynamic VGPR mode, Waves must deallocate all VGPRs before exiting. If the shader program does not do this, hardware inserts `S_ALLOC_VGPR 0` before S_ENDPGM, but this may incur some performance cost. This patch extends `si-insert-waitcnts` to deallocate the VGPRs via an `S_ALLOC_VGPR 0` before any `S_ENDPGM` when in dynamic VGPR mode. Full diff: https://github.com/llvm/llvm-project/pull/130037.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7e6bce2bf5f12..42ef23e836a58 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1647,17 +1647,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
}
- // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
- // stores. In this case it can be useful to send a message to explicitly
- // release all VGPRs before the stores have completed, but it is only safe to
- // do this if:
- // * there are no outstanding scratch stores
- // * we are not in Dynamic VGPR mode
+ // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
+ // Technically the hardware will do this on its own if we don't, but that
+ // might cost extra cycles compared to doing it explicitly.
+ // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
+ // have to wait for outstanding VMEM stores. In this case it can be useful to
+ // send a message to explicitly release all VGPRs before the stores have
+ // completed, but it is only safe to do this if there are no outstanding
+ // scratch stores.
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
- if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
- ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
- !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
+ if (!WCG->isOptNone() &&
+ (ST->isDynamicVGPREnabled() ||
+ (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+ ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
+ !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
ReleaseVGPRInsts.insert(&MI);
}
// Resolve vm waits before gs-done.
@@ -2610,26 +2614,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
}
- // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
- // instructions.
+ // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
+ // This is done in different ways depending on how the VGPRs were allocated
+ // (i.e. whether we're in dynamic VGPR mode or not).
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
// waveslot limited kernel runs slower with the deallocation.
- if (!ReleaseVGPRInsts.empty() &&
- (MF.getFrameInfo().hasCalls() ||
- ST->getOccupancyWithNumVGPRs(
- TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
- AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+ if (ST->isDynamicVGPREnabled()) {
for (MachineInstr *MI : ReleaseVGPRInsts) {
- if (ST->requiresNopBeforeDeallocVGPRs()) {
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::S_NOP))
- .addImm(0);
- }
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::S_SENDMSG))
- .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+ TII->get(AMDGPU::S_ALLOC_VGPR))
+ .addImm(0);
Modified = true;
}
+ } else {
+ if (!ReleaseVGPRInsts.empty() &&
+ (MF.getFrameInfo().hasCalls() ||
+ ST->getOccupancyWithNumVGPRs(
+ TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
+ AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+ for (MachineInstr *MI : ReleaseVGPRInsts) {
+ if (ST->requiresNopBeforeDeallocVGPRs()) {
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_NOP))
+ .addImm(0);
+ }
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_SENDMSG))
+ .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+ Modified = true;
+ }
+ }
}
ReleaseVGPRInsts.clear();
PreheadersToFlush.clear();
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
new file mode 100644
index 0000000000000..884b5f8b6f018
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
@@ -0,0 +1,356 @@
+# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
+# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
+
+--- |
+ define amdgpu_ps void @tbuffer_store1() { ret void }
+ define amdgpu_ps void @tbuffer_store2() { ret void }
+ define amdgpu_ps void @flat_store() { ret void }
+ define amdgpu_ps void @global_store() { ret void }
+ define amdgpu_ps void @buffer_store_format() { ret void }
+ define amdgpu_ps void @ds_write_b32() { ret void }
+ define amdgpu_ps void @global_store_dword() { ret void }
+ define amdgpu_ps void @multiple_basic_blocks1() { ret void }
+ define amdgpu_ps void @multiple_basic_blocks2() { ret void }
+ define amdgpu_ps void @multiple_basic_blocks3() { ret void }
+ define amdgpu_ps void @recursive_loop() { ret void }
+ define amdgpu_ps void @recursive_loop_vmem() { ret void }
+ define amdgpu_ps void @image_store() { ret void }
+ define amdgpu_ps void @scratch_store() { ret void }
+ define amdgpu_ps void @buffer_atomic() { ret void }
+ define amdgpu_ps void @flat_atomic() { ret void }
+ define amdgpu_ps void @global_atomic() { ret void }
+ define amdgpu_ps void @image_atomic() { ret void }
+ define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
+...
+
+---
+name: tbuffer_store1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: tbuffer_store1
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: tbuffer_store2
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: tbuffer_store2
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: flat_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: flat_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: global_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: global_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+ S_WAIT_STORECNT 0
+ S_ENDPGM 0
+...
+
+---
+name: buffer_store_format
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: buffer_store_format
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: ds_write_b32
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: ds_write_b32
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0 = IMPLICIT_DEF
+ renamable $vgpr1 = IMPLICIT_DEF
+ DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec, implicit $m0
+ S_ENDPGM 0
+
+...
+---
+name: global_store_dword
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0_sgpr1
+
+ ; CHECK-LABEL: name: global_store_dword
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: multiple_basic_blocks1
+body: |
+ ; CHECK-LABEL: name: multiple_basic_blocks1
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+
+---
+name: multiple_basic_blocks2
+body: |
+ ; CHECK-LABEL: name: multiple_basic_blocks2
+ ; CHECK: bb.2:
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.2
+
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: multiple_basic_blocks3
+body: |
+ ; CHECK-LABEL: name: multiple_basic_blocks3
+ ; CHECK: bb.4:
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.4
+
+ S_BRANCH %bb.4
+
+ bb.3:
+ successors: %bb.4
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_ENDPGM 0
+...
+
+---
+name: recursive_loop
+body: |
+ ; CHECK-LABEL: name: recursive_loop
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: recursive_loop_vmem
+body: |
+ ; CHECK-LABEL: name: recursive_loop_vmem
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
+ S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: image_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: image_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: scratch_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: scratch_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
+ SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: buffer_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: buffer_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: flat_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: flat_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+
+---
+name: global_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: global_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: image_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: image_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; DEFAULT-NOT: S_ALLOC_VGPR
+ ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: global_store_optnone
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: global_store_optnone
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK-NOT: S_ALLOC_VGPR
+ ; CHECK: S_ENDPGM 0
+ GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+ S_WAIT_STORECNT 0
+ S_ENDPGM 0
+...
# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't need -O2
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't we? We only generate the S_ALLOC_VGPR when optimizing (there's one test where we don't because we put optnone on the function).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
-O2 is the default, and you are using run-pass anyway. Plus optnone != -O0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, right. Fair point.
# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR

--- |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the IR section needed just for the calling conventions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also for the optnone on the last function. Do we have a way to communicate that in MIR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No
b2a7bdc
to
5f73d9e
Compare
In dynamic VGPR mode, Waves must deallocate all VGPRs before exiting. If the shader program does not do this, hardware inserts `S_ALLOC_VGPR 0` before S_ENDPGM, but this may incur some performance cost. Therefore it's better if the compiler proactively generates that instruction. This patch extends `si-insert-waitcnts` to deallocate the VGPRs via a `S_ALLOC_VGPR 0` before any `S_ENDPGM` when in dynamic VGPR mode.
3d09c94
to
6b7d174
Compare
In dynamic VGPR mode, Waves must deallocate all VGPRs before exiting. If the shader program does not do this, hardware inserts `S_ALLOC_VGPR 0` before S_ENDPGM, but this may incur some performance cost. Therefore it's better if the compiler proactively generates that instruction. This patch extends `si-insert-waitcnts` to deallocate the VGPRs via an `S_ALLOC_VGPR 0` before any `S_ENDPGM` when in dynamic VGPR mode.