[AMDGPU] Fix to prevent sinking of PERMLANE_SWAP instruction #144423
Conversation
The permlane_swap instruction depends on the exec mask, so this adds the isConvergent flag to prevent the instruction from being sunk. Fixes SWDEV-537232
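For context, here is a minimal IR-level sketch of the failure mode (not part of the PR; the intrinsic name and the { i32, i32 } signature are assumptions based on the gfx950 permlane-swap intrinsics). Because the call is convergent, it must not be sunk into the divergent "then" block, where a different exec mask would swap a different set of lanes:

; Hypothetical kernel mirroring the MIR test below: the swap runs unconditionally,
; but its results are only used under divergent control flow. Sinking the call
; into %then would execute it with fewer active lanes and change which lanes
; exchange data.
define amdgpu_kernel void @no_sink_permlane_swap(i32 %v, i32 %tid, ptr addrspace(1) %out) {
entry:
  %swap = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %v, i32 %v, i1 false, i1 false)
  %cond = icmp slt i32 %tid, 1
  br i1 %cond, label %then, label %exit

then:
  %lo = extractvalue { i32, i32 } %swap, 0
  %hi = extractvalue { i32, i32 } %swap, 1
  %max = call i32 @llvm.smax.i32(i32 %lo, i32 %hi)
  store i32 %max, ptr addrspace(1) %out, align 4
  br label %exit

exit:
  ret void
}

declare { i32, i32 } @llvm.amdgcn.permlane32.swap(i32, i32, i1 immarg, i1 immarg)
declare i32 @llvm.smax.i32(i32, i32)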
@llvm/pr-subscribers-backend-amdgpu

Author: Vigneshwar Jayakumar (VigneshwarJ)

Changes: The permlane_swap instruction depends on the exec mask; this adds the isConvergent flag to prevent sinking of the instruction. Fixes SWDEV-537232

Full diff: https://github.com/llvm/llvm-project/pull/144423.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 7fdd951ecbd3c..d6718c556db1e 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -775,7 +775,8 @@ defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>;
let Constraints = "$vdst = $vdst_in, $src0_out = $src0",
DisableEncoding="$vdst_in,$src0_out",
- SchedRW = [Write32Bit, Write32Bit] in {
+ SchedRW = [Write32Bit, Write32Bit],
+ isConvergent = 1 in {
let SubtargetPredicate = HasPermlane16Swap in {
defm V_PERMLANE16_SWAP_B32 : VOP1Inst<"v_permlane16_swap_b32", VOP_PERMLANE_SWAP>;
}
@@ -1550,8 +1551,11 @@ defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
defm V_PRNG_B32 : VOP1_Real_gfx9 <0x58>;
+
+let isConvergent = 1 in {
defm V_PERMLANE16_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x059>;
defm V_PERMLANE32_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x05a>;
+}
class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat <
(vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)),
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 952ee2fe2c955..c0c874300ef0f 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -15,6 +15,7 @@ class LetDummies {
bit isConvertibleToThreeAddress;
bit isMoveImm;
bit isReMaterializable;
+ bit isConvergent;
bit isAsCheapAsAMove;
bit FPDPRounding;
Predicate SubtargetPredicate;
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index 0fc31ea9d6437..3cdba54026ccc 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -733,3 +733,64 @@ body: |
liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
S_ENDPGM 0
...
+---
+name: test_no_sink_permlane_swap
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX9-LABEL: name: test_no_sink_permlane_swap
+ ; GFX9: bb.0:
+ ; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX9-NEXT: liveins: $vgpr0
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-NEXT: [[V_PERMLANE32_SWAP_B32_e64_:%[0-9]+]]:vgpr_32, [[V_PERMLANE32_SWAP_B32_e64_1:%[0-9]+]]:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], 0, 0, implicit $exec
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY2]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-NEXT: S_BRANCH %bb.1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.1:
+ ; GFX9-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_PERMLANE32_SWAP_B32_e64_]], [[V_PERMLANE32_SWAP_B32_e64_1]], implicit $exec
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.2:
+ ; GFX9-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_MAX_I32_e64_]], %bb.1
+ ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.3:
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]]
+ bb.0:
+ liveins: $vgpr0
+ %1:vgpr_32 = COPY $vgpr0
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64 = S_MOV_B64 0
+ %7:vreg_64 = COPY %5
+ %9:vgpr_32 = GLOBAL_LOAD_DWORD killed %7, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ %10:vgpr_32, %11:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 %9:vgpr_32, %9:vgpr_32, 0, 0, implicit $exec
+ %15:vgpr_32(s32) = COPY $vgpr0
+ %16:sreg_32 = S_MOV_B32 1
+ %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec
+ %18:sreg_64 = COPY %17
+ %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ %20:vgpr_32 = V_MAX_I32_e64 %10:vgpr_32, %11:vgpr_32, implicit $exec
+
+ bb.2:
+ %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1
+ SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+ bb.3:
+ S_ENDPGM 0, implicit %22
+...
Quoting myself from #142962: This seems reasonable, given that all the permlane intrinsics are marked as convergent. (Maybe TableGen should check that instruction selection patterns preserve the "convergent" property, like it does for some other properties.)
Should also do this for:
V_PERMLANE64_B32
V_PERMLANE16_VAR_B32
V_PERMLANEX16_VAR_B32
@jayfoad, should I merge this and create a new PR for the other PERMLANE instructions you mentioned?
Sure
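As an illustration of that follow-up, a hypothetical TableGen sketch mirroring this PR's change for the other PERMLANE instructions listed above; the placement and the elided profile operands are assumptions, not verified against trunk:

// Hypothetical follow-up sketch only: wrap the existing definitions in an
// isConvergent block, exactly as this PR does for the permlane-swap pair.
// Each "..." stands for the instruction's existing, unchanged profile operands.
let isConvergent = 1 in {
  defm V_PERMLANE64_B32      : VOP1Inst<"v_permlane64_b32", ...>;
  defm V_PERMLANE16_VAR_B32  : VOP3Inst<"v_permlane16_var_b32", ...>;
  defm V_PERMLANEX16_VAR_B32 : VOP3Inst<"v_permlanex16_var_b32", ...>;
}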
%10:vgpr_32, %11:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 %9:vgpr_32, %9:vgpr_32, 0, 0, implicit $exec
%15:vgpr_32(s32) = COPY $vgpr0
%16:sreg_32 = S_MOV_B32 1
%17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec
Suggested change:
%17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec
%17:sreg_64 = V_CMP_LT_I32_e64 %15, %16, implicit $exec
Drop the accidental LLTs
; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]]
bb.0:
  liveins: $vgpr0
  %1:vgpr_32 = COPY $vgpr0
Should run -run-pass=none to compact the register numbers
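For reference, a sketch of the usual workflow (the triple and flags here are assumptions; the test's own RUN line is authoritative): round-tripping the file through llc with -run-pass=none renumbers the sparse %1, %3, %5, ... vregs into a dense sequence, after which the CHECK lines can be regenerated with update_mir_test_checks.py.

# Sketch only; match -mtriple/-mcpu to the test's RUN line.
llc -mtriple=amdgcn -run-pass=none \
    llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir -o compacted.mir
# then paste the compacted body back into the test and re-run
# llvm/utils/update_mir_test_checks.py to refresh the GFX9 CHECK lines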
This picks up a bug fix for AMDGPU v_permlane_swap: llvm/llvm-project#144423. Without this fix, the v_permlane_swap is wrongly sunk. Along the way we need to fix API changes:
Add header file for the class IRBuilder
Add missing default parameter in convertFuncOpToLLVMFuncOp
The permlane_swap instruction depends on the exec mask; the isConvergent flag is added to prevent sinking of the instruction.
Fixes: SWDEV-537232