[AMDGPU] Allocate i1 argument to SGPRs #72461

Open · jwanggit86 wants to merge 25 commits into main from alloc-i1-arg-to-sgpr
Conversation

jwanggit86 (Contributor)

Currently i1 arguments are passed as 32-bit VGPRs. It would make more sense to make use of SGPRs and pass these values as a wavesize bool mask.
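
A minimal sketch (hypothetical function, not taken from the patch) of the kind of signature affected: today %flag is promoted to i32 and passed in a VGPR such as v0, while with this change it would instead arrive as a wave-size bool mask in SGPRs.

  define void @use_flag(i1 %flag) {
    %sel = select i1 %flag, i32 1, i32 0
    store i32 %sel, ptr addrspace(1) undef
    ret void
  }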

@llvmbot (Member)

llvmbot commented Nov 16, 2023

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Jun Wang (jwanggit86)

Changes

Currently i1 arguments are passed as 32-bit VGPRs. It would make more sense to make use of SGPRs and pass these values as a wavesize bool mask.


Full diff: https://github.com/llvm/llvm-project/pull/72461.diff

7 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td (+4-1)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+13)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+23)
  • (modified) llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp (+6)
  • (added) llvm/test/CodeGen/AMDGPU/z_callee.ll (+33)
  • (added) llvm/test/CodeGen/AMDGPU/z_caller.ll (+43)
  • (added) llvm/test/CodeGen/AMDGPU/z_caller2.ll (+57)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 9036b26a6f6bcb4..3f18cbd661d5b26 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -185,9 +185,12 @@ def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
 // Calling convention for leaf functions
 def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
-  CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
+  CCIfType<[i1] , CCAssignToReg<
+    !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
+  >>,
+
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
   >>>,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4681004d3ba74ff..800f6cb84a806c7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3468,6 +3468,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
   }
 
+  // The code below (after the call to AnalyzeCallOperands) uses either
+  // s[48:51] or s[0:3] when !Subtarget->enableFlatScratch(). Therefore,
+  // these registers may need to be reserved before calling
+  // AnalyzeCallOperands.
+  if (!Subtarget->enableFlatScratch()) {
+    if (IsChainCallConv)
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+          AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51}, 4);
+    else
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{
+          AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+  }
+
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0b11fd5f757cbe0..ac122107661f15e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -852,6 +852,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+      // When the calling convention allocates an SGPR for an i1 argument,
+      // we may have an SGPR_64 to SReg_32 copy for an outgoing i1 argument.
+      // Adjust the copy to avoid an illegal copy.
+      if (AMDGPU::SGPR_64RegClass.contains(SrcReg)) {
+        auto sub0 = RI.getSubReg(SrcReg, AMDGPU::sub0);
+        if (sub0 != DestReg)
+          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addReg(sub0);
+        return;
+      }
+
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
@@ -885,6 +895,19 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     }
 
     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+      // When an i1 argument is allocated to an SGPR_32, we may have a COPY
+      // from SGPR_32 to SReg_64. The following handles this case to avoid
+      // an illegal copy.
+      if(AMDGPU::SGPR_32RegClass.contains(SrcReg)) {
+        auto sub0 = RI.getSubReg(DestReg, AMDGPU::sub0);
+        if (sub0 != SrcReg) {
+          BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), sub0).addReg(SrcReg);
+        }
+        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+                RI.getSubReg(DestReg, AMDGPU::sub1)).addImm(0);
+        return;
+      }
+
       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
       return;
     }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index a11f02e7db3504b..0817a2b164460a0 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -503,6 +503,12 @@ bool SILowerI1Copies::lowerCopiesFromI1() {
       if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
         continue;
 
+      // When the calling convention allocates i1 argument to SGPR,
+      // we may have a COPY with dst being an SGPR_32. This should
+      // not be lowered into V_CNDMASK_B32.
+      if(AMDGPU::SGPR_32RegClass.contains(DstReg))
+        continue;
+
       Changed = true;
 
       // Copy into a 32-bit vector register.
diff --git a/llvm/test/CodeGen/AMDGPU/z_callee.ll b/llvm/test/CodeGen/AMDGPU/z_callee.ll
new file mode 100644
index 000000000000000..2fc4befa279f3e4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_callee.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1(i1 %arg0) #0 {
+; For CIGFX89, the i1 arg is passed in s4, but the v_cndmask insn uses s[4:5].
+; Therefore, the "s_mov_b32 s5, 0" is generated.
+;
+; CIGFX89-LABEL: void_func_i1:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s5, 0
+; CIGFX89-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller.ll b/llvm/test/CodeGen/AMDGPU/z_caller.ll
new file mode 100644
index 000000000000000..faf25e407fca2c6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1(i1) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s4, -1
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, -1
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+  call void @external_void_func_i1(i1 true)
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/z_caller2.ll b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
new file mode 100644
index 000000000000000..e63ae50b7e91cd2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/z_caller2.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+
+declare hidden void @external_void_func_i1_signext(i1 signext) #0
+
+define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GFX11-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
+; GFX11-NEXT:    s_mov_b32 s0, s2
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT:    s_endpgm
+  %var = load volatile i1, ptr addrspace(1) undef
+  call void @external_void_func_i1_signext(i1 signext %var)
+  ret void
+}
+
+
+
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }


github-actions bot commented Nov 16, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

@jwanggit86 (Contributor Author)

Some explanations. First, the calling convention is changed so that an i1 arg is allocated to an SGPR_32. Then two problems have to be fixed to prevent compile errors: (1) on the callee side, handle SGPR_32 to SGPR_64 copies by copying the SGPR_32 to the lower 32b of the SGPR_64 and setting the upper 32b to 0; (2) on the caller side, handle SGPR_64 to SGPR_32 copies by ignoring the upper 32b of the SGPR_64.

I'm not sure if there are any potential problems with all the code changes and can't help thinking there might be a better solution. So, I'd like to have some comments before going any further. Thanks!

@perlfu perlfu (Contributor) left a comment

Needs test for i1 return values?

Patch doesn't seem wave size sensitive as it just attempts to fix up COPYs from s32 to s64 and vice versa -- but based on the test these COPYs seem to represent a deeper problem that is not being addressed.

jwanggit86 force-pushed the alloc-i1-arg-to-sgpr branch from 3aa0909 to 9419956 on December 1, 2023 02:06
@jwanggit86 (Contributor Author)

The latest solution is quite different from the previous one. For the calling convention, a custom function is created for i1 arg/return, which allocates the i1 to either a 32b SGPR for wavesize 32 or a 64b SGPR for wavesize 64.

Two additional changes are made: (1) in LowerFormalArguments(), it was previously assumed that all registers come from a 32b regclass; now they may come from a 64b regclass. (2) In lowerCopiesToI1(), the i1 return value was previously assumed to come only from a 32b register; now it can come from a 64b register.

@arsenm arsenm (Contributor) left a comment

The tests can be in one file (and possibly should just go in the existing ABI tests). They also need quite a few more tests:

  1. Test i1 vectors and i1 arrays for a few different sizes
  2. Test the case where the SGPR argument list will run out. In particular, test the case where a wave64 argument needs to straddle the boundary
  3. Test some mixed other inreg arguments, such that the alignment requirement for the boolean uses matters. e.g. a select user will require even aligned registers, but the argument will be odd

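A rough sketch of the requested coverage, written as hypothetical declarations (names and shapes are illustrative only, not taken from the patch):

  declare void @external_void_func_v2i1(<2 x i1>)                   ; i1 vector
  declare void @external_void_func_a2i1([2 x i1])                   ; i1 array
  declare void @external_void_func_many_i1(i1, i1, i1, i1, i1, i1, i1, i1,
                                           i1, i1, i1, i1, i1, i1, i1, i1)  ; enough i1 args to run past the SGPR argument list
  declare void @external_void_func_mixed(i32 inreg, i1, i64 inreg)  ; boolean alignment interacting with other inreg args
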
Comment on lines 718 to 697
if (!SrcReg.isVirtual() &&
TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
// When calling convention allocates SGPR for i1, for GPUs with
// wavefront size 64, i1 return value is put in 64b SGPR.
assert(ST->isWave64());
continue;
}

Contributor

You don't know this is the context at this point. I don't think you should need to touch this. I assume you encountered a problematic copy here, what exactly does it look like?

Contributor Author

Immediately after these new lines is an assert():
assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
which makes sure SrcReg is 32b before lowering the copy to V_CMP_NE_U32_e64.

Because, with the callingconv change, the i1 return value is in an SGPR_64 (for wave64 GPUs), these lines were inserted to avoid triggering the assert.

Contributor

But contextually these shouldn't be getting processed. What is the MIR at the point of the assert?

Contributor Author

The IR instruction callseq_end is always divergent. As a result, the CopyFromReg to copy the return value is also divergent. When the return type is i1, in the machine instruction the dest reg would be vreg_1. In the above code in SILowerI1Copies, we avoid lowering this COPY to avoid triggering the assert (the code following the assert assumes the src is a 32b phys reg).

Contributor

Can you post the actual MIR you are getting? You really should not touch any code in SILowerI1Copies, you do not have the correct context here to know if you're doing something sensible.

callseq_end isn't associated with a value, it's just the SP modification and provides ordering. It's disconnected from the passed register value. In this context, the raw register copy is uniform, but the logical value is divergent. Coming out of the DAG for an incoming value, I would expect you to end up with something like:

  %0:sreg_64_exec = COPY $sgpr0_sgpr1 # This ABI copy should exist as-is
  %1:vreg_1 = COPY %0 # SILowerI1Copies is supposed to deal with this 

For the outgoing side, I would expect:

  %2:vreg_1 = ...
  %3:sreg_64_exec = COPY %2 
  $sgpr0_sgpr1 = COPY %3 # Untouchable ABI copy

Contributor

I still think that both:

  1. The code already in SILowerI1Copies is nonsense which should be fixed
  2. You shouldn't need to touch it

I think the code that's reaching here isn't what you want. You want to have the original copy be the copy to the SReg_64_xexec/SReg_32_xexec, and a copy from there to VReg_1. You have the opposite copy pair here

Contributor Author

@jwanggit86 jwanggit86 Feb 7, 2024

  1. The code already in SILowerI1Copies is nonsense which should be fixed

Maybe create a separate ticket?

  1. You shouldn't need to touch it

I don't see a better alternative. Do you have any suggestions?

Here is a recap of the full story. I'm listing the .ll file, the initial and final DAGs, the instruction selection results, and the machine code.

// .ll file
define i1 @i1_func_void() {
  %val = load i1, ptr addrspace(1) undef
  ret i1 %val
}

define void @test_call_external_i1_func_void() {
  %val = call i1 @i1_func_void()
  store volatile i1 %val, ptr addrspace(1) undef
  ret void
}

// 1. Initial DAG
Initial selection DAG: %bb.0 'test_call_external_i1_func_void:'
SelectionDAG has 17 nodes:
...
  t9: ch,glue = CALL t6, GlobalAddress:i64<ptr @i1_func_void> 0, TargetGlobalAddress:i64<ptr @i1_func_void> 0, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, RegisterMask:Untyped, t6:1
  t10: ch,glue = callseq_end # D:1 t9, TargetConstant:i64<0>, TargetConstant:i64<0>, t9:1
-   t12: i1,ch,glue = CopyFromReg # D:1 t10, Register:i1 $sgpr0_sgpr1, t10:1
  t14: i64 = Constant<0>
...

+ // Note that CopyFromReg is divergent

// 2. Final DAG
Optimized legalized selection DAG: %bb.0 'test_call_external_i1_func_void:'
SelectionDAG has 21 nodes:
...
  t10: ch,glue = callseq_end # D:1 t9, TargetConstant:i64<0>, TargetConstant:i64<0>, t9:1
-   t12: i1,ch,glue = CopyFromReg # D:1 t10, Register:i1 $sgpr0_sgpr1, t10:1
      t29: i32 = zero_extend # D:1 t12
...

// 3. Instruction selection
===== Instruction selection ends:

Selected selection DAG: %bb.0 'test_call_external_i1_func_void:'
SelectionDAG has 23 nodes:
...
  t10: i1,ch,glue = ADJCALLSTACKDOWN TargetConstant:i64<0>, TargetConstant:i64<0>, t9, t9:1
-   t12: i1,ch,glue = CopyFromReg # D:1 t10:1, Register:i1 $sgpr0_sgpr1, t10:2
      t29: i32 = V_CNDMASK_B32_e64 # D:1 TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<1>, t12
    t27: ch = GLOBAL_STORE_BYTE<Mem:(volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)> # D:1 IMPLICIT_DEF:i64, t29, TargetConstant:i16<0>, TargetConstant:i32<0>, t12:1
...

// 4. Machine code at end of ISel
*** MachineFunction at end of ISel ***
# Machine code for function test_call_external_i1_func_void: IsSSA, TracksLiveness

...
  ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
-   %3:vreg_1 = COPY $sgpr0_sgpr1
-   %5:sreg_64_xexec = COPY %3:vreg_1
  %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %5:sreg_64_xexec, implicit $exec
  %6:sreg_64 = IMPLICIT_DEF
  %7:vreg_64 = COPY %6:sreg_64
  GLOBAL_STORE_BYTE killed %7:vreg_64, killed %4:vgpr_32, 0, 0, implicit $exec :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
...

In InstrEmitter::EmitCopyFromReg() the CopyFromReg insn is transformed into the following:

  %3:vreg_1 = COPY $sgpr0_sgpr1
  %5:sreg_64_xexec = COPY %3:vreg_1

If I understand you correctly, I think you want the "vreg_1" in %3 to be "sreg_64_xexec". The relevant code of EmitCopyFromReg() is as follows:

void InstrEmitter::EmitCopyFromReg(...)
  ...
  // Stick to the preferred register classes for legal types.
  if (TLI->isTypeLegal(VT))
-   UseRC = TLI->getRegClassFor(VT, Node->isDivergent());  // UseRC is set to vreg_1 here
...
          const TargetRegisterClass *RC = nullptr;
          if (i + II.getNumDefs() < II.getNumOperands()) {
            RC = TRI->getAllocatableClass(
                TII->getRegClass(II, i + II.getNumDefs(), TRI, *MF));
          }
-          if (!UseRC)  // UseRC is already set, so it's not overwritten here
            UseRC = RC;
...
  if (VRBase) {
    DstRC = MRI->getRegClass(VRBase);
  } else if (UseRC) {
    assert(TRI->isTypeLegalForClass(*UseRC, VT) &&
           "Incompatible phys register def and uses!");
-     DstRC = UseRC;   // DstRC is set to UseRC, which is vreg_1
  } else
    DstRC = SrcRC;
...

As you can see, because UseRC is set to vreg_1 near the beginning of the function, the final DstRC is set to vreg_1 as well.

The code for SITargetLowering::getRegClassFor() is as follows:

const TargetRegisterClass *SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)  // Returns SReg_64 only for non-divergent SDNode.  
    return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
                                               : &AMDGPU::SReg_32RegClass;
  if (!TRI->isSGPRClass(RC) && !isDivergent)
    return TRI->getEquivalentSGPRClass(RC);
  else if (TRI->isSGPRClass(RC) && isDivergent)
    return TRI->getEquivalentVGPRClass(RC);

  return RC;
}

I don't think changing the behavior of SITargetLowering::getRegClassFor() is a good idea. So, I think the current solution, i.e. skipping the instruction, %3:sreg_64 = COPY $sgpr0_sgpr1 in SILowerI1Copies is preferred.

Pls let me know if you have any suggestions.

Contributor Author

@arsenm Any suggestions on SILowerI1Copies?

Contributor

The virtual register copies are getting emitted backwards, yes.

You could hack around this in LowerFormalArguments by manually copying to a virtual register with the correct class, and then emitting a copy from that virtual register.

Contributor Author

This part is for i1 return value though, not i1 arguments.

.isWave64();

static const MCPhysReg I1RegList1[] = {
AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
Contributor

Do we want to only pass in even aligned registers? We could also include all the aliasing odd-based cases

Contributor Author

By "odd-based cases" do you mean something like SGPR1_SGPR2? Do you have an example where such reg allocation is used?

Contributor

Yes. I mean if you had (i32 inreg %arg0, i64 inreg %arg1), you could use arg0=s0, arg1=s[1:2]

Contributor Author

This issue deals with i1 arg/return only. Inreg args are handled separately. If you are thinking about a mixed case as follows:

foo (i32 inreg %arg0, i1 %arg1)

Currently for gfx900, %arg0 is assigned to s4, and %arg1 to s[6:7]. If in this case you want %arg1 to be given s[5:6], I suppose the list of registers can be changed from { ... sgpr4_sgpr5, sgpr6_sgpr7, ...} to {... sgpr4_sgpr5, sgpr5_sgpr6, sgpr6_sgpr7, ...}.
Is this what you had in mind?
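
Restating that mixed case in IR form (hypothetical declaration; the gfx900 assignments are the ones quoted above):

  ; %arg0 (i32 inreg) -> s4
  ; %arg1 (i1)        -> s[6:7] today; s[5:6] if odd-aligned pairs were allowed
  declare void @foo(i32 inreg, i1)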

Contributor Author

@arsenm Pls let me know if the above is what you wanted.

Contributor

Yes, the i64 is physically what is being passed, but doesn't have the alignment requirement in the argument list.

Contributor Author

It seems pairs of SGPRs that start with an odd number, e.g., SGPR5_SGPR6, are
not defined. Only pairs like SGPR4_SGPR5 are defined.

Contributor

Yes, you have to unroll them into the 32-bit pieces and reassemble the virtual register

@arsenm arsenm (Contributor) left a comment

Also needs some AMDGPUUsage changes in the ABI sections

Comment on lines 718 to 697
if (!SrcReg.isVirtual() &&
TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
// When calling convention allocates SGPR for i1, for GPUs with
// wavefront size 64, i1 return value is put in 64b SGPR.
assert(ST->isWave64());
continue;
}

Contributor

But contextually these shouldn't be getting processed. What is the MIR at the point of the assert?

@jwanggit86 (Contributor Author)

jwanggit86 commented Dec 5, 2023

Here is the machine code of the caller in z_return.ll at the beginning of SILowerI1Copies:

bb.0 (%ir-block.0):
  ADJCALLSTACKUP 0, 0, implicit-def dead $scc
  %0:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @i1_func_void, target-flags(amdgpu-gotprel32-hi) @i1_func_void, implicit-def dead $scc
  %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed %0:sreg_64, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
  %2:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
  $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %2:sgpr_128
  SI_CALL_ISEL killed %1:sreg_64_xexec, @i1_func_void, , implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr0_sgpr1
  ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
  %3:vreg_1 = COPY $sgpr0_sgpr1  // srcReg is 64b
  %5:sreg_64_xexec = COPY %3:vreg_1
  %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %5:sreg_64_xexec, implicit $exec
  %6:sreg_64 = IMPLICIT_DEF
  %7:vreg_64 = COPY %6:sreg_64
  GLOBAL_STORE_BYTE killed %7:vreg_64, killed %4:vgpr_32, 0, 0, implicit $exec :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1)
  SI_RETURN

The insn starting with %3 would trigger the assert were it not for the new lines of code.

@jwanggit86 (Contributor Author)

The latest commit primarily fixes problems that occur when -global-isel is used. Problems with both incoming i1 args and i1 return values are fixed.

unsigned CopyToBits = 32;

// When function return type is i1, it may be in a 64b register.
if (VA.getLocVT().getSizeInBits() == 1) {
Contributor

getLocVT() == MVT::i1

Contributor Author

Sure.


// When function return type is i1, it may be in a 64b register.
if (VA.getLocVT().getSizeInBits() == 1) {
if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
Contributor

Might as well just directly check the wave size

Contributor Author

To get the wave size, in this function you need to get the MachineFunction from the MIRBuilder, then get the subtarget, then downcast the subtarget to GCNSubtarget, and then get the wavesize. So there are also some indirections. I can make the change if you really think it's better to use wavesize.

Contributor

There are more indirections going on in the physreg lookup (plus going through MRI to get TRI). It's better to just query the wavesize

Contributor Author

Ok then!

Comment on lines 246 to 247
if (VA.getLocVT().getSizeInBits() == 1 &&
MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
Contributor

Ditto


if (VA.getLocVT().getSizeInBits() == 1 &&
MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64) {
ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
Contributor

There shouldn't be an extension here, it should be a direct assignment

Contributor Author

OK!

CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,

CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,

CCIfType<[i1], CCPromoteToType<i32>>,
Contributor

Should only promote for inreg

Contributor Author

When CC_AMDGPU_Custom_I1 fails, i.e., an i1 arg is not allocated to an SGPR, and the i1 is not promoted to i32, we end up with a COPY from a 32b VGPR to a 64b VGPR, which is rejected by SIInstrInfo::copyPhysReg():

renamable $vgpr0_vgpr1 = COPY killed renamable $vgpr0, implicit $exec

Contributor

But I would still expect the first action to be covered by CCIfInReg

Contributor Author

The new CC function, CC_AMDGPU_Custom_I1, needs to have precedence over CCIfInReg. Otherwise, an "inreg i1" would be allocated to a 32b SGPR even on a wavesize64 GPU.

Contributor

which you can do? Something like CCIfType<[i1], CCIfInReg<CCPromoteToType>

Contributor Author

I think the promotion needs to be done regardless of inreg. The problem is as follows:

foo(a bunch of args, i1 %argX, ...)

If the args before %argX (i1 or inreg) take all the SGPRs, then %argX is going to get a VGPR whether it is inreg or not. If it's not promoted to i32, we end up with the COPY from a 32b VGPR to a 64b VGPR as shown above.

Contributor

No, inreg is changing the behavior of i1. The register decision is not a function of how many arguments are used.

What I want is "i1" -> SGPR pair. "i1 inreg" -> promoted to 32-bit SGPR
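
In IR terms the requested distinction is roughly the following (hypothetical declarations; the wave64 register choices are examples only):

  declare void @takes_bool_mask(i1)         ; i1       -> wave-wide mask in an SGPR pair, e.g. s[0:1]
  declare void @takes_scalar_bit(i1 inreg)  ; i1 inreg -> promoted to i32 in a single SGPR, e.g. s0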

Comment on lines 718 to 697
if (!SrcReg.isVirtual() &&
TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
// When calling convention allocates SGPR for i1, for GPUs with
// wavefront size 64, i1 return value is put in 64b SGPR.
assert(ST->isWave64());
continue;
}

Contributor

Can you post the actual MIR you are getting? You really should not touch any code in SILowerI1Copies, you do not have the correct context here to know if you're doing something sensible.

callseq_end isn't associated with a value, it's just the SP modification and provides ordering. It's disconnected from the passed register value. In this context, the raw register copy is uniform, but the logical value is divergent. Coming out of the DAG for an incoming value, I would expect you to end up with something like:

  %0:sreg_64_exec = COPY $sgpr0_sgpr1 # This ABI copy should exist as-is
  %1:vreg_1 = COPY %0 # SILowerI1Copies is supposed to deal with this 

For the outgoing side, I would expect:

  %2:vreg_1 = ...
  %3:sreg_64_exec = COPY %2 
  $sgpr0_sgpr1 = COPY %3 # Untouchable ABI copy

else
llvm_unreachable("Unexpected register class in LowerFormalArguments!");
else {
if (VT == MVT::i1 && Subtarget->isWave64())
Contributor

You don't need to make this conditional on wave64, it's essentially checking the same thing twice

Contributor Author

Complete MIR can be found above. The two relevant lines are:

%3:vreg_1 = COPY $sgpr0_sgpr1
%5:sreg_64_xexec = COPY %3:vreg_1

Note that the dest reg for the COPY from $sgpr0_sgpr1 is vreg_1, not sreg_64_xexec. This insn will trigger the assert in SILowerI1Copies. The patch checks for this and skips the insn to avoid the assert.

Contributor

To operate within the constraints of SILowerI1Copies, we might need to find a way to get this broken into:

  %3:sreg_64_xexec = COPY $sgpr0_sgpr1
  %4:vreg_1 = COPY %3
  %5:sreg_64_xexec = COPY %4

Contributor Author

Pls see my comment on SILowerI1Copies.cpp below.


// When function return type is i1, it may be in a 64b register.
if (VA.getLocVT().getSizeInBits() == 1) {
if (MRI.getTargetRegisterInfo()->getRegSizeInBits(PhysReg, MRI) == 64)
Contributor

There are more indirections going on in the physreg lookup (plus going through MRI to get TRI). It's better to just query the wavesize

CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,

CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,

CCIfType<[i1], CCPromoteToType<i32>>,
Contributor

But I would still expect the first action to be covered by CCIfInReg

else
llvm_unreachable("Unexpected register class in LowerFormalArguments!");
else {
if (VT == MVT::i1 && Subtarget->isWave64())
Contributor

To operate within the constraints of SILowerI1Copies, we might need to find a way to get this broken into:

  %3:sreg_64_xexec = COPY $sgpr0_sgpr1
  %4:vreg_1 = COPY %3
  %5:sreg_64_xexec = COPY %4

if (!Subtarget->enableFlatScratch()) {
if (IsChainCallConv)
CCInfo.AllocateRegBlock(
ArrayRef<MCPhysReg>{AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50,
Contributor

AllocateReg works if you pass the register tuple value. It iterates all aliases

Contributor Author

Are you suggesting the following?

static const MCPhysReg RegList[] = {AMDGPU::SGPR48, AMDGPU::SGPR49, AMDGPU::SGPR50, AMDGPU::SGPR51};
CCInfo.AllocateReg(RegList);

Contributor

No, I mean AllocateReg(AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51)

jwanggit86 force-pushed the alloc-i1-arg-to-sgpr branch from 0602c5d to 822d2cd on February 2, 2024 22:09
@jwanggit86 (Contributor Author)

@arsenm Code has been updated based on your review. Pls take a look again. Some of the test files have been updated, but there are about 20 remaining.

Comment on lines 127 to 134
unsigned CopyToBits = 32;

// When function return type is i1, it may be in a 64b register.
if (VA.getLocVT() == MVT::i1) {
if (MIRBuilder.getMF().getSubtarget<GCNSubtarget>().isWave64())
CopyToBits = 64;
}

Contributor

This can all be in the initialization for CopyToBits

@@ -233,6 +241,13 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
const CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);

if (VA.getLocVT() == MVT::i1 &&
Contributor

For the i1 inreg case, do you get getLocVT == MVT::i32? that should still hit the extendRegisterMin32 path

CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,

CCIfType<[i1] , CCCustom<"CC_AMDGPU_Custom_I1">>,

CCIfType<[i1], CCPromoteToType<i32>>,
Contributor

No, inreg is changing the behavior of i1. The register decision is not a function of how many arguments are used.

What I want is "i1" -> SGPR pair. "i1 inreg" -> promoted to 32-bit SGPR

.isWave64();

static const MCPhysReg I1RegList1[] = {
AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR2_SGPR3, AMDGPU::SGPR4_SGPR5,
Contributor

Yes, you have to unroll them into the 32-bit pieces and reassemble the virtual register

Comment on lines 718 to 697
if (!SrcReg.isVirtual() &&
TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 64) {
// When calling convention allocates SGPR for i1, for GPUs with
// wavefront size 64, i1 return value is put in 64b SGPR.
assert(ST->isWave64());
continue;
}

Contributor

I still think that both:

  1. The code already in SILowerI1Copies is nonsense which should be fixed
  2. You shouldn't need to touch it

I think the code that's reaching here isn't what you want. You want to have the original copy be the copy to the SReg_64_xexec/SReg_32_xexec, and a copy from there to VReg_1. You have the opposite copy pair here

Comment on lines 2784 to 2785
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr16_sgpr17
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
Contributor

The inreg case shouldn't change

@jwanggit86 (Contributor Author)

@arsenm In one of your comments, you mentioned promoting "inreg i1" before it is allocated. This is done in the latest commit. Note that, on a wavefront64 GPU, an "inreg i1" argument would be allocated to a 32b sgpr while an ordinary i1 arg would be allocated to a 64b sgpr. Is this what you had in mind?

Also I'm waiting for your further comments on SILowerI1Copies.

jwanggit86 requested a review from arsenm on February 20, 2024 00:32
@arsenm (Contributor)

arsenm commented Feb 28, 2024

@arsenm In one of your comments, you mentioned promoting "inreg i1" before it is allocated. This is done in the latest commit. Note that, on a wavefront64 GPU, an "inreg i1" argument would be allocated to a 32b sgpr while an ordinary i1 arg would be allocated to a 64b sgpr. Is this what you had in mind?

Yes. That is, i1 inreg is not interpreted as a boolean; i1 is.

@jwanggit86 (Contributor Author)

Thanks @arsenm What about SILowerI1Copies?

@arsenm (Contributor)

arsenm commented Feb 29, 2024

Thanks @arsenm What about SILowerI1Copies?

There should be no changes there. It cannot possibly have the context necessary to do anything sensible. The code there is nonsense as it is

@jwanggit86 (Contributor Author)

Thanks @arsenm What about SILowerI1Copies?

There should be no changes there. It cannot possibly have the context necessary to do anything sensible. The code there is nonsense as it is

@arsenm So how do you want to proceed?

jwanggit86 force-pushed the alloc-i1-arg-to-sgpr branch from b2a3304 to ab2fde2 on March 11, 2024 00:09
jwanggit86 force-pushed the alloc-i1-arg-to-sgpr branch from 876e093 to 6f2289b on May 13, 2024 22:46
jwanggit86 requested a review from arsenm on May 14, 2024 22:24
Comment on lines 132 to 135
MRI.setRegClass(ValVReg, MIRBuilder.getMF()
.getSubtarget<GCNSubtarget>()
.getRegisterInfo()
->getBoolRC());
Contributor

You shouldn't need to introduce a register class here, RegBankSelect should set it later

Contributor Author

@jwanggit86 jwanggit86 May 20, 2024

If I don't do that, RegBankSelect will produce undesirable results in at least the following 2 cases:

Case 1: i1 arg is stored to memory.
After IRTranslator:

%0:_(s1) = COPY $sgpr4_sgpr5
%1:_(p1) = G_IMPLICIT_DEF
G_STORE %0:_(s1), %1:_(p1) :: (store (s1) into
  `ptr addrspace(1) undef`, addrspace 1)

After RegBankSelect:

%0:sgpr(s1) = COPY $sgpr4_sgpr5
%1:sgpr(p1) = G_IMPLICIT_DEF
%2:sgpr(s32) = G_ZEXT %0:sgpr(s1)
%5:vgpr(s32) = COPY %2:sgpr(s32) // result of ZEXT
                                 // copied to a vgpr
%6:vgpr(p1) = COPY %1:sgpr(p1)
G_STORE %5:vgpr(s32), %6:vgpr(p1) :: (store (s8)
  into `ptr addrspace(1) undef`, addrspace 1)

After InstructionSelect:

%0:sreg_32 = COPY $sgpr4_sgpr5 //will cause "illegal copy" later
  %1:sreg_64 = IMPLICIT_DEF
  %2:sreg_32 = S_AND_B32 %0:sreg_32, 1, implicit-def dead $scc
  %5:vgpr_32 = COPY %2:sreg_32
  %6:vreg_64 = COPY %1:sreg_64
  GLOBAL_STORE_BYTE %6:vreg_64, %5:vgpr_32, 0, 0, implicit $exec
    :: (store (s8) into `ptr addrspace(1) undef`, addrspace 1)

Contributor Author

Case 2: i1 arg used in a branch:
After IRTranslator:

%0:_(s1) = COPY $sgpr4_sgpr5
%1:_(s1) = G_CONSTANT i1 true
%5:_(s32) = G_CONSTANT i32 0
%6:_(p1) = G_GLOBAL_VALUE @static.gv2
%9:_(p1) = G_GLOBAL_VALUE @static.gv0
%2:_(s1) = G_XOR %0:_, %1:_
%3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS
  intrinsic(@llvm.amdgcn.if), %2:_(s1)
G_BRCOND %3:_(s1), %bb.4

After RegBankSelect:

%0:sgpr(s1) = COPY $sgpr4_sgpr5
%20:sgpr(s32) = G_CONSTANT i32 1
%1:sgpr(s1) = G_TRUNC %20:sgpr(s32)
%21:vcc(s1) = COPY %0:sgpr(s1)
%22:vcc(s1) = COPY %1:sgpr(s1)
%2:sreg_64_xexec(s1) = G_XOR %21:vcc, %22:vcc
%4:sreg_64_xexec(s64) = SI_IF %2:sreg_64_xexec(s1), %bb.2,
  implicit-def $exec, implicit-def $scc, implicit $exec

After InstructionSelect:

%0:sreg_32 = COPY $sgpr4_sgpr5 // will cause "illegal copy"
%29:sreg_32 = S_AND_B32 1, %0:sreg_32, implicit-def
  dead $scc
%21:sreg_64_xexec = V_CMP_NE_U32_e64 0, %29:sreg_32,
  implicit $exec
%22:sreg_64_xexec = S_MOV_B64 -1
%2:sreg_64_xexec = S_XOR_B64 %21:sreg_64_xexec,
  %22:sreg_64_xexec, implicit-def dead $scc
%4:sreg_64_xexec = SI_IF %2:sreg_64_xexec, %bb.2,
  implicit-def $exec, implicit-def $scc, implicit $exec
S_BRANCH %bb.4

Contributor Author

Setting the reg class when lowering the i1 arg works in both cases above.

Contributor

The correct way to fix this is to fix RegBankSelect's mapping for COPY with s1 from SGPR physregs

Contributor Author

Modified AMDGPURegisterBankInfo::getInstrMapping() to set reg bank for i1 arg, instead of setting reg class in IRTranslator.

/// TODO: Investigate if reserving ScratchRSrcReg can be moved to calling conv
/// functions. If so, then this function is not needed anymore -- we can just
/// use CallLowering::determineAndHandleAssignments() as before.
bool AMDGPUCallLowering::determineAndHandleAssignmentsLocal(
Contributor

Instead of creating determineAndHandleAssignmentsLocal, would be better to just add an overload of determineAndHandleAssignments in the base class that takes the CCInfo as an argument. IIRC the DAG has a similar split

Contributor Author

An overloaded function with a CCState argument has been created in the base class CallLowering.

AMDGPU::SGPR28, AMDGPU::SGPR29};

assert(LocVT == MVT::i1);
if (unsigned Reg = IsWave64 ? State.AllocateReg(SGPRArgsWave64)
Contributor

Could you avoid the determineAndHandleAssignments changes by using drop_front if !enableFlatScratch?

Contributor Author

If we use drop_front, doesn't that mean we are assuming that the reserved reg is s[0:3]? How about putting in here something like the following:

     if (!Subtarget.enableFlatScratch())
       CCInfo.AllocateReg(Info->getScratchRSrcReg());

Contributor

Yes, could also do that

Contributor Author

Decided to create an overload function determineAndHandleAssignments() instead.

Contributor

See above, might as well just use the 2 pieces of determineAndHandleAssignments instead

Comment on lines +134 to +138
if (SrcReg.isPhysical() && SrcReg != AMDGPU::SCC) {
const TargetRegisterClass *DstRC = MRI->getRegClassOrNull(DstReg);
if (DstRC)
return DstRC->contains(SrcReg);
}
Contributor

Can you split this into a separate change, with MIR tests? I suspect you can just directly return true.

Contributor

This should be split to a separate change

Comment on lines 16042 to 16044
if (N3->getOpcode() != ISD::CALLSEQ_END)
return false;
return true;
Contributor

Return bool expression

Contributor Author

Done.

@@ -15887,7 +15889,14 @@ static bool isCopyFromRegForI1Return(const SDNode *N) {
SDNode *N2 = N1->getOperand(0).getNode();
if (N2->getOpcode() != ISD::CopyFromReg)
return false;
SDNode *N3 = N2->getOperand(0).getNode();

Contributor

ping here, if UniformityAnalysis was correct I don't think you need this

jwanggit86 requested a review from arsenm on May 28, 2024 03:50
@@ -407,6 +407,12 @@ class CallLowering {
CallingConv::ID CallConv, bool IsVarArg,
ArrayRef<Register> ThisReturnRegs = std::nullopt) const;

bool determineAndHandleAssignments(
Contributor

Now that I think of it, I think it would be better to just use separate determineAssignments, and handleAssignments. determineAndHandleAssignments is already the simple case where the CCState doesn't need any custom handling

Contributor

Still should undo this and just use the 2 separate pieces

Contributor Author

Done. See commit df3a52e

AMDGPU::SGPR28, AMDGPU::SGPR29};

assert(LocVT == MVT::i1);
if (unsigned Reg = IsWave64 ? State.AllocateReg(SGPRArgsWave64)
Contributor

See above, might as well just use the 2 pieces of determineAndHandleAssignments instead

Comment on lines +134 to +138
if (SrcReg.isPhysical() && SrcReg != AMDGPU::SCC) {
const TargetRegisterClass *DstRC = MRI->getRegClassOrNull(DstReg);
if (DstRC)
return DstRC->contains(SrcReg);
}
Contributor

This should be split to a separate change

Comment on lines +3744 to +3756
// For i1 function arguments, the call of getRegBank() currently gives
// incorrect result. We set both src and dst banks to VCCRegBank.
if (!MI.getOperand(1).getReg().isVirtual() &&
MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(1)) {
DstBank = SrcBank = &AMDGPU::VCCRegBank;
}

// For i1 return value, the dst reg is an SReg but we need to set the reg
// bank to VCCRegBank.
if (!MI.getOperand(0).getReg().isVirtual() &&
SrcBank == &AMDGPU::VCCRegBank)
DstBank = SrcBank;

Contributor

Can you also make this a separate PR (I want 2, one for the RegBankSelect and one for the InstructionSelect parts)

Contributor

Ping

Contributor Author

Are you saying that the changes in AMDGPURegisterBankInfo.cpp should be a separate PR? Without those changes, I think some testcases will fail.

@@ -15887,7 +15889,14 @@ static bool isCopyFromRegForI1Return(const SDNode *N) {
SDNode *N2 = N1->getOperand(0).getNode();
if (N2->getOpcode() != ISD::CopyFromReg)
return false;
SDNode *N3 = N2->getOperand(0).getNode();

Contributor

Yes, this whole function should be unnecessary. isArgPassedInSGPR needs to be updated, and the use for uniformity needs to be updated to account for this non-uniform but still-in-SGPR case

ensure that the COPY instruction generated later on has sreg_64 as the
destination reg class instead of vreg_1. This commit fixes that
CopyFromReg node such that it uses the physical
register assigned to the return value instead of the new virtual
register. The purpose of doing this is to avoid having to create the
function isCopyFromRegForI1Return() that is used in
SITargetLowering::isSDNodeSourceOfDivergence() to get around the assert
therein.
@arsenm arsenm (Contributor) left a comment

should split out the pieces I mentioned and needs to update UniformityAnalysis hooks

@@ -407,6 +407,12 @@ class CallLowering {
CallingConv::ID CallConv, bool IsVarArg,
ArrayRef<Register> ThisReturnRegs = std::nullopt) const;

bool determineAndHandleAssignments(
Contributor

Still should undo this and just use the 2 separate pieces

Comment on lines +3744 to +3756
// For i1 function arguments, the call of getRegBank() currently gives
// incorrect result. We set both src and dst banks to VCCRegBank.
if (!MI.getOperand(1).getReg().isVirtual() &&
MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(1)) {
DstBank = SrcBank = &AMDGPU::VCCRegBank;
}

// For i1 return value, the dst reg is an SReg but we need to set the reg
// bank to VCCRegBank.
if (!MI.getOperand(0).getReg().isVirtual() &&
SrcBank == &AMDGPU::VCCRegBank)
DstBank = SrcBank;

Contributor

Ping

CallLowering::determineAndHandleAssignments(). Instead, call
determineAssignments() and handleAssignments() separately.