[AMDGPU] Allocate i1 argument to SGPRs #72461
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Jun Wang (jwanggit86)

Changes: Currently i1 arguments are passed as 32-bit VGPRs. It would make more sense to make use of SGPRs and pass these values as a wavesize bool mask.

Full diff: https://github.com/llvm/llvm-project/pull/72461.diff

7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 9036b26a6f6bcb4..3f18cbd661d5b26 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -185,9 +185,12 @@ def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
// Calling convention for leaf functions
def CC_AMDGPU_Func : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
- CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i1] , CCAssignToReg<
+ !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
+ >>,
+
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
!foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
>>>,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4681004d3ba74ff..800f6cb84a806c7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3468,6 +3468,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
+ // In the code below (after the call to AnalyzeCallOperands),
+ // if (!Subtarget->enableFlatScratch()), either s[48:51] or s[0:3] will
+ // be used. Therefore, before calling AnalyzeCallOperands, we may need to
+ // reserve these registers.
+ if (!Subtarget->enableFlatScratch()) {
+ if (IsChainCallConv)
+ CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+ AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+ else
+ CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+ AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+ }
+
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0b11fd5f757cbe0..ac122107661f15e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -852,6 +852,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+ // When the calling convention allocates an SGPR for an i1 argument, we may
+ // have an SGPR_64 to SReg_32 copy for an outgoing i1 argument. Adjust
+ // the copy to avoid an illegal copy.
+ if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
+ auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
+ if (sub0 != DestReg)
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
+ return;
+ }
+
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
@@ -885,6 +895,19 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+ // When an i1 argument is allocated to an SGPR_32, we may have a COPY
+ // from SGPR_32 to SReg_64. The following handles this case to avoid
+ // an illegal copy.
+ if (AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+ auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
+ if (sub0 != SrcReg) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
+ }
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+ RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+ return;
+ }
+
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index a11f02e7db3504b..0817a2b164460a0 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -503,6 +503,12 @@ bool SILowerI1Copies::lowerCopiesFromI1() {
if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
continue;
+ // When the calling convention allocates an i1 argument to an SGPR,
+ // we may have a COPY with dst being an SGPR_32. This should
+ // not be lowered into V_CNDMASK_B32.
+ if (AMDGPU::SGPR_32RegClass.contains(DstReg))
+ continue;
+
Changed = true;
// Copy into a 32-bit vector register.
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
new file mode 100644
index 000000000000000..2fc4befa279f3e4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1(i1 %arg0) #0 {
+; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
+; Therefore, the "s_mov_b32 s5, 0" is generated.
+;
+; CIGFX89-LABEL: void_func_i1:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s5, 0
+; CIGFX89-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i1 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
new file mode 100644
index 000000000000000..faf25e407fca2c6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1(i1) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_imm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s3
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s4, -1
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_getpc_b64 s[8:9]
+; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_imm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_mov_b32 s32, 0
+; GFX11-NEXT: s_getpc_b64 s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: s_endpgm
+ call void @external_void_func_i1(i1 true)
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
new file mode 100644
index 000000000000000..e63ae50b7e91cd2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1_signext(i1 signext) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_signext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s5
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_getpc_b64 s[8:9]
+; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_signext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s32, 0
+; GFX11-NEXT: s_getpc_b64 s[4:5]
+; GFX11-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT: s_endpgm
+ %var = load volatile i1, ptr addrspace(1) undef
+ call void @external_void_func_i1_signext(i1 signext %var)
+ ret void
+}
+
+
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed 1c1086b to 3aa0909
Some explanations. First, the calling convention is changed so that an i1 arg is allocated to an SGPR_32. Then two problems have to be fixed to prevent compile errors: (1) on the callee side, handle SGPR_32 to SGPR_64 copies by copying the SGPR_32 into the lower 32 bits of the SGPR_64 and setting the upper 32 bits to 0; (2) on the caller side, handle SGPR_64 to SGPR_32 copies by ignoring the upper 32 bits of the SGPR_64. I'm not sure whether there are any potential problems with these code changes, and I can't help thinking there might be a better solution, so I'd like some comments before going any further. Thanks!
Needs test for i1 return values?
The patch doesn't seem wave-size sensitive, as it just attempts to fix up COPYs from s32 to s64 and vice versa -- but based on the test, these COPYs seem to represent a deeper problem that is not being addressed.
Force-pushed 3aa0909 to 9419956
The latest solution is quite different from the previous one. For the calling convention, a custom function is created for i1 args/returns, which allocates an i1 to either a 32-bit SGPR for wavesize 32 or a 64-bit SGPR for wavesize 64. Two additional changes are made: (1) in LowerFormalArguments(), it was previously assumed that all registers come from a 32-bit regclass; now a register may come from a 64-bit regclass. (2) In lowerCopiesToI1(), an i1 return value was previously assumed to come only from a 32-bit register; now it can come from a 64-bit register.
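For reference, a minimal sketch of what such a custom allocator could look like, assuming the standard CCCustomFn signature (the register lists and the subtarget query here are illustrative, not necessarily the exact patch contents):

static bool CC_AMDGPU_Custom_I1(unsigned ValNo, MVT ValVT, MVT LocVT,
                                CCValAssign::LocInfo LocInfo,
                                ISD::ArgFlagsTy ArgFlags, CCState &State) {
  // Wave32: one 32-bit SGPR holds the bool mask; wave64: an SGPR pair.
  static const MCPhysReg Wave32Regs[] = {AMDGPU::SGPR0, AMDGPU::SGPR1,
                                         AMDGPU::SGPR2, AMDGPU::SGPR3};
  static const MCPhysReg Wave64Regs[] = {AMDGPU::SGPR0_SGPR1,
                                         AMDGPU::SGPR2_SGPR3};
  bool IsWave64 =
      State.getMachineFunction().getSubtarget<GCNSubtarget>().isWave64();
  if (unsigned Reg = IsWave64 ? State.AllocateReg(Wave64Regs)
                              : State.AllocateReg(Wave32Regs)) {
    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
    return true; // Handled; stop scanning the convention table.
  }
  return false; // Not handled; fall through to the next entry.
}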
The tests can be in one file (and possibly should just go in the existing ABI tests). They also need quite a few more tests:
- Test i1 vectors and i1 arrays for a few different sizes
- Test the case where the SGPR argument list will run out. In particular, test the case where a wave64 argument needs to straddle the boundary
- Test some mixes with other inreg arguments, such that the alignment requirement for the boolean uses matters; e.g., a select user will require even-aligned registers, but the argument will be odd
if (!SrcReg.isVirtual() &&
    TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
  // When calling convention allocates SGPR for i1, for GPUs with
  // wavefront size 64, i1 return value is put in 64b SGPR.
  assert(ST->isWave64());
  continue;
}
You don't know this is the context at this point. I don't think you should need to touch this. I assume you encountered a problematic copy here, what exactly does it look like?
Immediately after these new lines is an assert():
assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
which makes sure SrcReg is 32-bit before lowering the copy to V_CMP_NE_U32_e64.
Because, with the calling-convention change, the i1 return value is in an SGPR_64 (for wave64 GPUs), these lines were inserted to avoid triggering the assert.
But contextually these shouldn't be getting processed. What is the MIR at the point of the assert?
The callseq_end node is always divergent. As a result, the CopyFromReg that copies the return value is also divergent. When the return type is i1, in the machine instructions the dest reg would be vreg_1. In the above code in SILowerI1Copies, we avoid lowering this COPY to avoid triggering the assert (the code following the assert assumes the src is a 32-bit phys reg).
Can you post the actual MIR you are getting? You really should not touch any code in SILowerI1Copies, you do not have the correct context here to know if you're doing something sensible.
callseq_end isn't associated with a value, it's just the SP modification and provides ordering. It's disconnected from the passed register value. In this context, the raw register copy is uniform, but the logical value is divergent. Coming out of the DAG for an incoming value, I would expect you to end up with something like:
%0:sreg_64_xexec = COPY $sgpr0_sgpr1 # This ABI copy should exist as-is
%1:vreg_1 = COPY %0 # SILowerI1Copies is supposed to deal with this
For the outgoing side, I would expect:
%2:vreg_1 = ...
%3:sreg_64_xexec = COPY %2
$sgpr0_sgpr1 = COPY %3 # Untouchable ABI copy
I still think that both:
- The code already in SILowerI1Copies is nonsense which should be fixed
- You shouldn't need to touch it
I think the code that's reaching here isn't what you want. You want to have the original copy be the copy to the SReg_64_xexec/SReg_32_xexec, and a copy from there to VReg_1. You have the opposite copy pair here
- The code already in SILowerI1Copies is nonsense which should be fixed
Maybe create a separate ticket?
- You shouldn't need to touch it
I don't see a better alternative. Do you have any suggestions?
Here is a recap of the full story. I'm listing the .ll file, the init and final DAGs, the instruction selection results and the machine code.
// .ll file
define i1 @i1_func_void() {
%val = load i1, ptr addrspace(1) undef
ret i1 %val
}
define void @test_call_external_i1_func_void() {
%val = call i1 @i1_func_void()
store volatile i1 %val, ptr addrspace(1) undef
ret void
}
// 1. Initial DAG
Initial selection DAG: %bb.0 'test_call_external_i1_func_void:'
SelectionDAG has 17 nodes:
...
t9: ch,glue = CALL t6, GlobalAddress:i64<ptr @i1_func_void> 0, TargetGlobalAddress:i64<ptr @i1_func_void> 0, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, RegisterMask:Untyped, t6:1
t10: ch,glue = callseq_end # D:1 t9, TargetConstant:i64<0>, TargetConstant:i64<0>, t9:1
- t12: i1,ch,glue = CopyFromReg # D:1 t10, Register:i1 $sgpr0_sgpr1, t10:1
t14: i64 = Constant<0>
...
+ // Note that CopyFromReg is divergent
// 2. Final DAG
Optimized legalized selection DAG: %bb.0 'test_call_external_i1_func_void:'
SelectionDAG has 21 nodes:
...
t10: ch,glue = callseq_end # D:1 t9, TargetConstant:i64<0>, TargetConstant:i64<0>, t9:1
- t12: i1,ch,glue = CopyFromReg # D:1 t10, Register:i1 $sgpr0_sgpr1, t10:1
t29: i32 = zero_extend # D:1 t12
...
// 3. Instruction selection
===== Instruction selection ends:
Selected selection DAG: %bb.0 'test_call_external_i1_func_void:'
SelectionDAG has 23 nodes:
...
t10: i1,ch,glue = ADJCALLSTACKDOWN TargetConstant:i64<0>, TargetConstant:i64<0>, t9, t9:1
- t12: i1,ch,glue = CopyFromReg # D:1 t10:1, Register:i1 $sgpr0_sgpr1, t10:2
t29: i32 = V_CNDMASK_B32_e64 # D:1 TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<1>, t12
t27: ch = GLOBAL_STORE_BYTE<Mem:(volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)> # D:1 IMPLICIT_DEF:i64, t29, TargetConstant:i16<0>, TargetConstant:i32<0>, t12:1
...
// 4. Machine code at end of ISel
*** MachineFunction at end of ISel ***
# Machine code for function test_call_external_i1_func_void: IsSSA, TracksLiveness
...
ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
- %3:vreg_1 = COPY $sgpr0_sgpr1
- %5:sreg_64_xexec = COPY %3:vreg_1
%4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %5:sreg_64_xexec, implicit $exec
%6:sreg_64 = IMPLICIT_DEF
%7:vreg_64 = COPY %6:sreg_64
GLOBAL_STORE_BYTE killed %7:vreg_64, killed %4:vgpr_32, 0, 0, implicit $exec :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
...
In InstrEmitter::EmitCopyFromReg(), the CopyFromReg insn is transformed into the following:
%3:vreg_1 = COPY $sgpr0_sgpr1
%5:sreg_64_xexec = COPY %3:vreg_1
If I understand you correctly, I think you want the "vreg_1" in %3 to be "sreg_64_xexec". The relevant code of EmitCopyFromReg() is as follows:
void InstrEmitter::EmitCopyFromReg(...)
...
// Stick to the preferred register classes for legal types.
if (TLI->isTypeLegal(VT))
- UseRC = TLI->getRegClassFor(VT, Node->isDivergent()); // UseRC is set to vreg_1 here
...
const TargetRegisterClass *RC = nullptr;
if (i + II.getNumDefs() < II.getNumOperands()) {
RC = TRI->getAllocatableClass(
TII->getRegClass(II, i + II.getNumDefs(), TRI, *MF));
}
- if (!UseRC) // UseRC is already set, so it's not overwritten here
UseRC = RC;
...
if (VRBase) {
DstRC = MRI->getRegClass(VRBase);
} else if (UseRC) {
assert(TRI->isTypeLegalForClass(*UseRC, VT) &&
"Incompatible phys register def and uses!");
- DstRC = UseRC; // DstRC is set to UseRC, which is vreg_1
} else
DstRC = SrcRC;
...
As you can see, because UseRC is set to vreg_1 near the beginning of the function, the final DstRC is set to vreg_1 as well.
The code for SITargetLowering::getRegClassFor() is as follows:
const TargetRegisterClass *SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (RC == &AMDGPU::VReg_1RegClass && !isDivergent) // Returns SReg_64 only for non-divergent SDNode.
return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
: &AMDGPU::SReg_32RegClass;
if (!TRI->isSGPRClass(RC) && !isDivergent)
return TRI->getEquivalentSGPRClass(RC);
else if (TRI->isSGPRClass(RC) && isDivergent)
return TRI->getEquivalentVGPRClass(RC);
return RC;
}
I don't think changing the behavior of SITargetLowering::getRegClassFor() is a good idea. So, I think the current solution, i.e. skipping the instruction %3:sreg_64 = COPY $sgpr0_sgpr1 in SILowerI1Copies, is preferred.
Pls let me know if you have any suggestions.
@arsenm Any suggestions on SILowerI1Copies?
The virtual register copies are getting emitted backwards, yes.
You could hack around this in LowerFormalArguments by manually copying to a virtual register with the correct class, and then emitting a copy from that virtual register.
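A rough sketch of that suggestion for the incoming case (variable names and surrounding context assumed; this is not the committed code):

// Attach the incoming lane-mask physreg to a virtual register of the
// bool register class first, so the ABI copy gets the right class:
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
Register VReg = MF.addLiveIn(VA.getLocReg(), TRI->getBoolRC());
SDValue LaneMask = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i1);
// SILowerI1Copies should then see the expected pair:
//   %0:sreg_64_xexec = COPY $sgpr0_sgpr1
//   %1:vreg_1 = COPY %0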
This part is for the i1 return value though, not i1 arguments.
.isWave64();

static const MCPhysReg I1RegList1[] = {
    AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
Do we want to only pass in even-aligned registers? We could also include all the aliasing odd-based cases.
By "odd-based cases" do you mean something like SGPR1_SGPR2? Do you have an example where such reg allocation is used?
Yes. I mean if you had (i32 inreg %arg0, i64 inreg %arg1), you could use arg0=s0, arg1=s[1:2]
This issue deals with i1 arg/return only. Inreg args are handled separately. If you are thinking about a mixed case as follows:
foo (i32 inreg %arg0, i1 %arg1)
Currently for gfx900, %arg0 is assigned to s4, and %arg1 to s[6:7]. If in this case you want %arg1 to be given s[5:6], I suppose the list of registers can be changed from { ... sgpr4_sgpr5, sgpr6_sgpr7, ... } to { ... sgpr4_sgpr5, sgpr5_sgpr6, sgpr6_sgpr7, ... }.
Is this what you had in mind?
@arsenm Pls let me know if the above is what you wanted.
Yes, the i64 is physically what is being passed, but doesn't have the alignment requirement in the argument list.
It seems pairs of SGPRs that start with an odd number, e.g., SGPR5_SGPR6, are not defined. Only pairs like SGPR4_SGPR5 are defined.
Yes, you have to unroll them into the 32-bit pieces and reassemble the virtual register
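A sketch of what that unrolling might look like when lowering the argument (the specific registers and insertion point are assumed for illustration):

// Live-in the two unaligned 32-bit halves (e.g. s5 and s6), then
// reassemble a 64-bit lane mask with a REG_SEQUENCE.
Register Lo = MF.addLiveIn(AMDGPU::SGPR5, &AMDGPU::SGPR_32RegClass);
Register Hi = MF.addLiveIn(AMDGPU::SGPR6, &AMDGPU::SGPR_32RegClass);
Register Mask = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Mask)
    .addReg(Lo)
    .addImm(AMDGPU::sub0)
    .addReg(Hi)
    .addImm(AMDGPU::sub1);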
Also needs some AMDGPUUsage changes in the ABI sections
if (!SrcReg.isVirtual() &&
    TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
  // When calling convention allocates SGPR for i1, for GPUs with
  // wavefront size 64, i1 return value is put in 64b SGPR.
  assert(ST->isWave64());
  continue;
}
But contextually these shouldn't be getting processed. What is the MIR at the point of the assert?
Here is the machine code of the caller in z_return.ll at the beginning of SILowerI1Copies:
bb.0 (%ir-block.0):
ADJCALLSTACKUP 0, 0, implicit-def dead $scc
%0:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @i1_func_void, target-flags(amdgpu-gotprel32-hi) @i1_func_void, implicit-def dead $scc
%1:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed %0:sreg_64, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
%2:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %2:sgpr_128
SI_CALL_ISEL killed %1:sreg_64_xexec, @i1_func_void, , implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
%3:vreg_1 = COPY $sgpr0_sgpr1 // srcReg is 64b
%5:sreg_64_xexec = COPY %3:vreg_1
%4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %5:sreg_64_xexec, implicit $exec
%6:sreg_64 = IMPLICIT_DEF
%7:vreg_64 = COPY %6:sreg_64
GLOBAL_STORE_BYTE killed %7:vreg_64, killed %4:vgpr_32, 0, 0, implicit $exec :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
SI_RETURN
The insn starting with %3 would trigger the assert were it not for the new lines of code.
Force-pushed 9419956 to 0602c5d
The latest commit primarily fixes problems that occur when -global-isel is used. Problems with both incoming i1 args and i1 return values are fixed.
unsigned CopyToBits = 32;

// When function return type is i1, it may be in a 64b register.
if (VA.getLocVT().getSizeInBits() == 1) {
getLocVT() == MVT::i1
Sure.
// When function return type is i1, it may be in a 64b register.
if (VA.getLocVT().getSizeInBits() == 1) {
  if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
Might as well just directly check the wave size
To get the wave size, in this function you need to get the MachineFunction from the MIRBuilder, then get the subtarget, downcast it to GCNSubtarget, and then get the wavesize. So there's also some indirection. I can make the change if you really think it's better to use the wavesize.
There are more indirections going on in the physreg lookup (plus going through MRI to get TRI). It's better to just query the wavesize
Ok then!
if (VA.getLocVT().getSizeInBits() == 1 &&
    MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
Ditto
if (VA.getLocVT().getSizeInBits() == 1 &&
    MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
  ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
There shouldn't be an extension here, it should be a direct assignment
OK!
CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,

CCIfType<[i1], CCCustom<"CC_AMDGPU_Custom_I1">>,

CCIfType<[i1], CCPromoteToType<i32>>,
Should only promote for inreg
When CC_AMDGPU_Custom_I1 fails, i.e., an i1 arg is not allocated to an SGPR, and the i1 is not promoted to i32, we end up with a COPY from a 32-bit VGPR to a 64-bit VGPR, which is rejected by SIInstrInfo::copyPhysReg():
renamable $vgpr0_vgpr1 = COPY killed renamable $vgpr0, implicit $exec
But I would still expect the first action to be covered by CCIfInReg
The new CC function, CC_AMDGPU_Custom_I1, needs to have precedence over CCIfInReg. Otherwise, an "inreg i1" would be allocated to a 32-bit SGPR even on a wavesize-64 GPU.
which you can do? Something like CCIfType<[i1], CCIfInReg<CCPromoteToType<i32>>>
I think the promotion needs to be done regardless of inreg. The problem is as follows:
foo(a bunch of args, i1 %argX, ...)
If the bunch of args (i1 or inreg) before argX take all the SGPRs, then argX is going to get a VGPR regardless of inreg or not. If it's not promoted to i32, we end up with the COPY from a 32-bit VGPR to a 64-bit VGPR as shown above.
No, inreg is changing the behavior of i1. The register decision is not a function of how many arguments are used.
What I want is "i1" -> SGPR pair. "i1 inreg" -> promoted to 32-bit SGPR
else
  llvm_unreachable("Unexpected register class in LowerFormalArguments!");
else {
  if (VT == MVT::i1 && Subtarget->isWave64())
You don't need to make this conditional on wave64, it's essentially checking the same thing twice
Complete MIR can be found above. The two relevant lines are:
%3:vreg_1 = COPY $sgpr0_sgpr1
%5:sreg_64_xexec = COPY %3:vreg_1
Note that the dest reg for the COPY from $sgpr0_sgpr1 is vreg_1, not sreg_64_xexec. This insn will trigger the assert in SILowerI1Copies. The patch checks for this and skips the insn to avoid the assert.
To operate within the constraints of SILowerI1Copies, we might need to find a way to get this broken into:
%3:sreg_64_xexec = COPY $sgpr0_sgpr1
%4:vreg_1 = COPY %3
%5:sreg_64_xexec = COPY %4
Pls see my comment on SILowerI1Copies.cpp below.
if (!Subtarget->enableFlatScratch()) {
  if (IsChainCallConv)
    CCInfo.AllocateRegBlock(
        ArrayRef<MCPhysReg>{AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50,
AllocateReg works if you pass the register tuple value. It iterates all aliases
Are you suggesting the following?
static const MCPhysReg RegList[] = {AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51};
CCInfo.AllocateReg(RegList);
No, I mean AllocateReg(AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51)
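In other words, something like the following sketch (variable names taken from the surrounding code; allocating the 128-bit tuple marks all four aliasing 32-bit SGPRs as used in one call):

if (!Subtarget->enableFlatScratch())
  CCInfo.AllocateReg(IsChainCallConv ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                     : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3);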
Force-pushed 0602c5d to 822d2cd
@arsenm Code has been updated based on your review. Pls take a look again. Some of the test files have been updated, but there are about 20 remaining.
unsigned CopyToBits = 32;

// When function return type is i1, it may be in a 64b register.
if (VA.getLocVT() == MVT::i1) {
  if (MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
    CopyToBits = 64;
}
This can all be in the initialization for CopyToBits
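I.e., roughly (a sketch):

unsigned CopyToBits =
    (VA.getLocVT() == MVT::i1 &&
     MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
        ? 64
        : 32;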
@@ -233,6 +241,13 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
  void assignValueToReg(Register ValVReg, Register PhysReg,
                        const CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);

    if (VA.getLocVT() == MVT::i1 &&
For the i1 inreg case, do you get getLocVT == MVT::i32? That should still hit the extendRegisterMin32 path.
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
The inreg case shouldn't change
@arsenm In one of your comments, you mentioned promoting "inreg i1" before it's allocated. This is done in the latest commit. Note that, on a wavefront-64 GPU, an "inreg i1" argument would be allocated to a 32-bit SGPR while an ordinary i1 arg would be allocated to a 64-bit SGPR. Is this what you had in mind? Also, I'm waiting for your further comments on SILowerI1Copies.
Yes. That is, i1 inreg is not interpreted as a boolean; plain i1 is.
Thanks @arsenm. What about SILowerI1Copies?
There should be no changes there. It cannot possibly have the context necessary to do anything sensible. The code there is nonsense as it is.
Force-pushed b2a3304 to ab2fde2
Force-pushed 876e093 to 6f2289b
MRI.setRegClass(ValVReg, MIRBuilder.getMF()
                             .getSubtarget<GCNSubtarget>()
                             .getRegisterInfo()
                             ->getBoolRC());
You shouldn't need to introduce a register class here, RegBankSelect should set it later
If I don't do that, RegBankSelect will produce undesirable results in at least the following 2 cases:
Case 1: i1 arg is stored to memory.
After IRTranslator:
%0:_(s1) = COPY $sgpr4_sgpr5
%1:_(p1) = G_IMPLICIT_DEF
G_STORE %0:_(s1), %1:_(p1) :: (store (s1) into
`ptr addrspace(1) undef`, addrspace 1)
After RegBankSelect:
%0:sgpr(s1) = COPY $sgpr4_sgpr5
%1:sgpr(p1) = G_IMPLICIT_DEF
%2:sgpr(s32) = G_ZEXT %0:sgpr(s1)
%5:vgpr(s32) = COPY %2:sgpr(s32) // result of ZEXT
// copied to a vgpr
%6:vgpr(p1) = COPY %1:sgpr(p1)
G_STORE %5:vgpr(s32), %6:vgpr(p1) :: (store (s8)
into `ptr addrspace(1) undef`, addrspace 1)
After InstructionSelect:
%0:sreg_32 = COPY $sgpr4_sgpr5 //will cause "illegal copy" later
%1:sreg_64 = IMPLICIT_DEF
%2:sreg_32 = S_AND_B32 %0:sreg_32, 1, implicit-def dead $scc
%5:vgpr_32 = COPY %2:sreg_32
%6:vreg_64 = COPY %1:sreg_64
GLOBAL_STORE_BYTE %6:vreg_64, %5:vgpr_32, 0, 0, implicit $exec
:: (store (s8) into `ptr addrspace(1) undef`, addrspace 1)
Case2: i1 arg used in a branch:
After IRTranslator:
%0:_(s1) = COPY $sgpr4_sgpr5
%1:_(s1) = G_CONSTANT i1 true
%5:_(s32) = G_CONSTANT i32 0
%6:_(p1) = G_GLOBAL_VALUE @static.gv2
%9:_(p1) = G_GLOBAL_VALUE @static.gv0
%2:_(s1) = G_XOR %0:_, %1:_
%3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS
intrinsic(@llvm.amdgcn.if), %2:_(s1)
G_BRCOND %3:_(s1), %bb.4
After RegBankSelect:
%0:sgpr(s1) = COPY $sgpr4_sgpr5
%20:sgpr(s32) = G_CONSTANT i32 1
%1:sgpr(s1) = G_TRUNC %20:sgpr(s32)
%21:vcc(s1) = COPY %0:sgpr(s1)
%22:vcc(s1) = COPY %1:sgpr(s1)
%2:sreg_64_xexec(s1) = G_XOR %21:vcc, %22:vcc
%4:sreg_64_xexec(s64) = SI_IF %2:sreg_64_xexec(s1), %bb.2,
implicit-def $exec, implicit-def $scc, implicit $exec
After InstructionSelect:
%0:sreg_32 = COPY $sgpr4_sgpr5 // will cause "illegal copy"
%29:sreg_32 = S_AND_B32 1, %0:sreg_32, implicit-def
dead $scc
%21:sreg_64_xexec = V_CMP_NE_U32_e64 0, %29:sreg_32,
implicit $exec
%22:sreg_64_xexec = S_MOV_B64 -1
%2:sreg_64_xexec = S_XOR_B64 %21:sreg_64_xexec,
%22:sreg_64_xexec, implicit-def dead $scc
%4:sreg_64_xexec = SI_IF %2:sreg_64_xexec, %bb.2,
implicit-def $exec, implicit-def $scc, implicit $exec
S_BRANCH %bb.4
Setting the reg class when lowering the i1 arg works in both cases above.
The correct way to fix this is to fix RegBankSelect's mapping for COPY with s1 from SGPR physregs
Modified AMDGPURegisterBankInfo::getInstrMapping() to set the reg bank for the i1 arg, instead of setting the reg class in IRTranslator.
/// TODO: Investigate if reserving ScratchRSrcReg can be moved to calling conv
/// functions. If so, then this function is not needed anymore -- we can just
/// use CallLowering::determineAndHandleAssignments() as before.
bool AMDGPUCallLowering::determineAndHandleAssignmentsLocal(
Instead of creating determineAndHandleAssignmentsLocal, it would be better to just add an overload of determineAndHandleAssignments in the base class that takes the CCInfo as an argument. IIRC the DAG has a similar split.
An overloaded function with a CCState argument has been created in the base class CallLowering.
AMDGPU::SGPR28, AMDGPU::SGPR29};

assert(LocVT == MVT::i1);
if (unsigned Reg = IsWave64 ? State.AllocateReg(SGPRArgsWave64)
Could you avoid the determineAndHandleAssignments changes by using drop_front if !enableFlatScratch?
If we use drop_front, doesn't that mean we are assuming that the reserved reg is s[0:3]? How about putting in here something like the following:
if (!Subtarget.enableFlatScratch())
CCInfo.AllocateReg(Info->getScratchRSrcReg());
Yes, could also do that
Decided to create an overload function determineAndHandleAssignments() instead.
See above, might as well just use the 2 pieces of determineAndHandleAssignments instead
if (SrcReg.isPhysical() && SrcReg != AMDGPU::SCC) {
  const TargetRegisterClass *DstRC = MRI->getRegClassOrNull(DstReg);
  if (DstRC)
    return DstRC->contains(SrcReg);
}
Can you split this into a separate change, with MIR tests? I suspect you can just directly return true.
This should be split to a separate change
if (N3->getOpcode() != ISD::CALLSEQ_END)
  return false;
return true;
Return bool expression
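That is, collapse the final branch into the expression itself (sketch):

return N3->getOpcode() == ISD::CALLSEQ_END;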
Done.
@@ -15887,7 +15889,14 @@ static bool isCopyFromRegForI1Return(const SDNode *N) {
  SDNode *N2 = N1->getOperand(0).getNode();
  if (N2->getOpcode() != ISD::CopyFromReg)
    return false;
  SDNode *N3 = N2->getOperand(0).getNode();
ping here, if UniformityAnalysis was correct I don't think you need this
@@ -407,6 +407,12 @@ class CallLowering {
                        CallingConv::ID CallConv, bool IsVarArg,
                        ArrayRef<Register> ThisReturnRegs = std::nullopt) const;

  bool determineAndHandleAssignments(
Now that I think of it, it would be better to just use separate determineAssignments and handleAssignments. determineAndHandleAssignments is already the simple case where the CCState doesn't need any custom handling.
Still should undo this and just use the 2 separate pieces
Done. See commit df3a52e
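A sketch of what using the two pieces directly can look like, with the custom CCState setup done in between (handler and variable names are assumed):

SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, F.getContext());
// Target-specific setup before assignment, e.g. reserving the scratch
// RSRC registers when flat scratch is disabled.
if (!Subtarget.enableFlatScratch())
  CCInfo.AllocateReg(Info->getScratchRSrcReg());
if (!determineAssignments(Assigner, SplitArgs, CCInfo) ||
    !handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, MIRBuilder))
  return false;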
// For i1 function arguments, the call of getRegBank() currently gives an
// incorrect result. We set both src and dst banks to VCCRegBank.
if (!MI.getOperand(1).getReg().isVirtual() &&
    MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(1)) {
  DstBank = SrcBank = &AMDGPU::VCCRegBank;
}

// For the i1 return value, the dst reg is an SReg but we need to set the
// reg bank to VCCRegBank.
if (!MI.getOperand(0).getReg().isVirtual() &&
    SrcBank == &AMDGPU::VCCRegBank)
  DstBank = SrcBank;
Can you also make this a separate PR (I want 2, one for the RegBankSelect and one for the InstructionSelect parts)
Ping
Are you saying that the changes in AMDGPURegisterBankInfo.cpp should be a separate PR? Without those changes, I think some testcases will fail.
@@ -15887,7 +15889,14 @@ static bool isCopyFromRegForI1Return(const SDNode *N) {
  SDNode *N2 = N1->getOperand(0).getNode();
  if (N2->getOpcode() != ISD::CopyFromReg)
    return false;
  SDNode *N3 = N2->getOperand(0).getNode();
Yes, this whole function should be unnecessary. isArgPassedInSGPR needs to be updated, and the use for uniformity needs to be updated to account for this case, which is non-uniform but also in an SGPR.
ensure that the COPY instruction generated later on has sreg_64 as the destination reg class instead of vreg_1. This commit fixes that CopyFromReg node such that it uses the physical register assigned to the return value instead of the new virtual register. The purpose of doing this is to avoid having to create the function isCopyFromRegForI1Return() that is used in SITargetLowering::isSDNodeSourceOfDivergence() to get around the assert therein.
should split out the pieces I mentioned and needs to update UniformityAnalysis hooks