[AMDGPU] - Add s_bitreplicate intrinsic #69209
Conversation
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-amdgpu

Author: Jessica Del (OutOfCache)

Changes: Part of SC1-5107. Add an intrinsic for s_bitreplicate. Lower it to the S_BITREPLICATE_B64_B32 machine instruction in both GlobalISel and SelectionDAG. Support VGPR arguments by inserting a v_readfirstlane.

Full diff: https://github.com/llvm/llvm-project/pull/69209.diff

7 Files Affected:
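For context, my understanding of the operation (not part of the patch, and worth double-checking against the ISA documentation) is that s_bitreplicate copies each bit i of the 32-bit source to bits 2*i and 2*i+1 of the 64-bit result. A plain-IR sketch of that behaviour, using illustrative names only, would be:

```llvm
; Illustrative scalar reference only: spread the 32 input bits to the even
; positions of a 64-bit value, then OR with a shifted copy so each input bit
; occupies a pair of adjacent output bits.
define i64 @bitreplicate_reference(i32 %x) {
  %v = zext i32 %x to i64
  %s16 = shl i64 %v, 16
  %o16 = or i64 %v, %s16
  %m16 = and i64 %o16, u0x0000FFFF0000FFFF
  %s8 = shl i64 %m16, 8
  %o8 = or i64 %m16, %s8
  %m8 = and i64 %o8, u0x00FF00FF00FF00FF
  %s4 = shl i64 %m8, 4
  %o4 = or i64 %m8, %s4
  %m4 = and i64 %o4, u0x0F0F0F0F0F0F0F0F
  %s2 = shl i64 %m4, 2
  %o2 = or i64 %m4, %s2
  %m2 = and i64 %o2, u0x3333333333333333
  %s1 = shl i64 %m2, 1
  %o1 = or i64 %m2, %s1
  %m1 = and i64 %o1, u0x5555555555555555
  ; each input bit is now at position 2*i; duplicate it into position 2*i+1
  %hi = shl i64 %m1, 1
  %res = or i64 %m1, %hi
  ret i64 %res
}
```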
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4f42462f655e260..66aa7862893d049 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1927,6 +1927,11 @@ def int_amdgcn_inverse_ballot :
Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
[IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_bitreplicate :
+ Intrinsic<[llvm_i64_ty], [llvm_i32_ty],
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
[data_ty],
[
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b5ceaaa14b4fd5e..ff7b6724def9b1d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2568,6 +2568,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
llvm_unreachable("Unsupported size for inverse ballot mask.");
}
break;
+ case Intrinsic::amdgcn_s_bitreplicate:
+ Opcode = AMDGPU::S_BITREPLICATE_B64_B32;
+ break;
default:
SelectCode(N);
return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 31d72fb8cadd8a6..1b2f809f102d308 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1067,6 +1067,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return selectBallot(I);
case Intrinsic::amdgcn_inverse_ballot:
return selectInverseBallot(I);
+ case Intrinsic::amdgcn_s_bitreplicate:
+ return selectBitReplicate(I);
case Intrinsic::amdgcn_reloc_constant:
return selectRelocConstant(I);
case Intrinsic::amdgcn_groupstaticsize:
@@ -1470,6 +1472,18 @@ bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectBitReplicate(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register MaskReg = I.getOperand(2).getReg();
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BITREPLICATE_B64_B32), DstReg)
+ .addReg(MaskReg);
+ I.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 93e45fcd8682f07..2bea2a5fe0804d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -114,6 +114,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectIntrinsicCmp(MachineInstr &MI) const;
bool selectBallot(MachineInstr &I) const;
bool selectInverseBallot(MachineInstr &I) const;
+ bool selectBitReplicate(MachineInstr &I) const;
bool selectRelocConstant(MachineInstr &I) const;
bool selectGroupStaticSize(MachineInstr &I) const;
bool selectReturnAddress(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5b056bd9e5dba2c..bbc0bc58ade3ada 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2994,6 +2994,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingBFE(B, OpdMapper, false);
return;
case Intrinsic::amdgcn_inverse_ballot:
+ case Intrinsic::amdgcn_s_bitreplicate:
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 2); // Mask
return;
@@ -4544,6 +4545,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
break;
}
+ case Intrinsic::amdgcn_s_bitreplicate:
+ Register MaskReg = MI.getOperand(2).getReg();
+ unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
+ OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
}
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 51397cbb791469d..295e878341b438e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6297,6 +6297,14 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
return CreatedBB;
}
+ // Legalize S_BITREPLICATE
+ if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32) {
+ MachineOperand &Src = MI.getOperand(1);
+ if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
+ Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
+ return CreatedBB;
+ }
+
// Legalize MIMG and MUBUF/MTBUF for shaders.
//
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
new file mode 100644
index 000000000000000..027c9ef5e7cc349
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+declare i64 @llvm.amdgcn.s.bitreplicate(i32)
+
+define i64 @test_s_bitreplicate_constant() {
+; GFX11-LABEL: test_s_bitreplicate_constant:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], 0x85fe3a92
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 u0x85FE3A92)
+ ret i64 %br
+}
+
+define amdgpu_cs void @test_s_bitreplicate_sgpr(i32 inreg %mask, ptr addrspace(1) %out) {
+; GFX11-LABEL: test_s_bitreplicate_sgpr:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+ %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
+ store i64 %br, ptr addrspace(1) %out
+ ret void
+}
+
+define i64 @test_s_bitreplicate_vgpr(i32 %mask) {
+; GFX11-LABEL: test_s_bitreplicate_vgpr:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
+ ret i64 %br
+}
  if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32) {
    MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
    return CreatedBB;
  }
This unfortunately seems necessary. The FixSGPRCopies pass incorrectly decides to turn s_bitreplicate into a VALU instruction when there is a VGPR input. This is a similar issue to D45826.
I refrained from writing a helper method because there would be only two call sites for now. If it makes sense to add one, I will.
Emitting readfirstlane is just wrong, you have to emulate the operation with a VALU expansion if needed
> Emitting readfirstlane is just wrong, you have to emulate the operation with a VALU expansion if needed

Even with the convergent attribute on the intrinsic? I thought we could rule out any divergent inputs with that.
No, at best that prevents the optimizer from introducing new divergences. You cannot guarantee that an arbitrary input was uniform from the start.
There is more discussion and reasoning about this in the main thread of this PR, but as far as I understand the intention, the intrinsic only accepts uniform values. From the intrinsic definition:

> The argument must be uniform; otherwise, the result is undefined.

A uniform value can still end up in a VGPR, so it makes sense to me to use readfirstlane in that case.
Unfortunately this is not correct behaviour. You would need to either generate a waterfall loop, or find some other way to implement the bitreplicate operation using VALU instructions. The problem is that, even if the input program only uses bitreplicate on uniform arguments, the compiler could theoretically transform code that only ever passes it uniform values into code that passes it a divergent value, so the backend needs to be prepared to handle divergent arguments. @arsenm @nhaehnle please correct me if I am wrong about this.
I see, that makes sense. A waterfall loop seems reasonable in that case. Thanks for pointing that out!
The intrinsic is marked convergent.
I don't understand. What is the definition of the intrinsic? What result are you supposed to get if you give it a non-uniform input? (Maybe we should actually start documenting these instruction-like intrinsics, so we're not just arguing about the definition in a vacuum.)
Yes, the intrinsic should be defined by a comment. My thinking is that since there is no good vector expansion of S_BITREPLICATE, and the whole point of the intrinsic is to generate a faster code sequence when we know it can be used, we should define the intrinsic such that the argument must be uniform. And to be explicit, it is UB (or maybe returns poison, I don't really care, but it shouldn't be fully defined IMO) if the argument is not uniform. This justifies the use of readfirstlane.
I can see why that's attractive but I have previously been told that it is not possible: https://reviews.llvm.org/D139817#3991676
I don't think that works. I'm talking about transforming two separate bitreplicate calls whose results are selected between into a single bitreplicate of the selected value, where the select condition may be divergent.
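Roughly, and reusing the value names from the explanation further down in this thread (this is a sketch, not IR from the patch):

```llvm
; before: both operands of the select are results of the convergent call
%res1 = call i64 @llvm.amdgcn.s.bitreplicate(i32 %val1)
%res2 = call i64 @llvm.amdgcn.s.bitreplicate(i32 %val2)
%res  = select i1 %cond, i64 %res1, i64 %res2

; after: the select is hoisted above the call, so the call now receives a
; value that is divergent whenever %cond is divergent
%val = select i1 %cond, i32 %val1, i32 %val2
%res = call i64 @llvm.amdgcn.s.bitreplicate(i32 %val)
```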
When I write a test like that, the assembly output I get looks fine to me in the end.
Yes, that is fine. But I'm saying that a generic IR optimization would be allowed to transform that IR into the combined form (a single bitreplicate of the selected value), even if bitreplicate is marked as convergent.
The intended reasoning for why this generic transform is forbidden when the operation is convergent goes back to ConvergentOperations.rst. It seems we maybe never state this explicitly, but the set of threads that execute a dynamic instance of a convergent operation together is part of its semantics, even for an operation like bitreplicate that does not read values from other lanes.

Let's say we have two threads, 0 and 1, that both execute:

%res1 = bitreplicate(%val1)
%res2 = bitreplicate(%val2)
%res = select %cond, %res1, %res2

Thread 0 participates in two bitreplicate operations. In the transformed program,

%val = select %cond, %val1, %val2
%res = bitreplicate(%val)

Thread 0 participates only in a single bitreplicate operation, so the set of threads communicating in each operation changes, which is exactly what the convergent attribute is meant to rule out.
Thanks for explaining. It would be great if we had an Alive that understood convergent operations.
Is there a way to add a test demonstrating this behavior, or is that not possible/necessary? I could add the IR test based on this scenario, but is that enough to prove the illegality of the transform?
Should retitle to add bitreplicate intrinsic
  if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32) {
    MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
    return CreatedBB;
  }
Emitting readfirstlane is just wrong, you have to emulate the operation with a VALU expansion if needed
I think this is good, though there should still be a comment in IntrinsicsAMDGPU.td describing the constraints / definition of the intrinsic.
> Emitting readfirstlane is just wrong, you have to emulate the operation with a VALU expansion if needed
That's how we usually do it, but this case is a bit unusual. It's the combination of:
- The whole point of adding the intrinsic is to expose a corner case optimization opportunity provided by this SALU instruction
- All VALU expansions I can think of are completely horrible, leaving a massive performance footgun if the intrinsic ever happens to run into a VGPR input
- We wouldn't have good test coverage for the complex VALU expansion
The cheapest VALU expansion I can think of is:
- 2x v_perm_b32 to distribute bytes of the input
- For 3 levels of bit width (4, 2, 1), do:
  - 4x v_and_b32 to mask out low and high halves
  - 2x v_lshl_or_b32 for shifted combine of the masked parts
... which amounts to 20 VALU instructions in place of 1 SALU instruction.
For these reasons, I recommended to define the intrinsic such that it's only defined for uniform inputs.
I forgot a final lshl_or in the sequence above, but it randomly occurred to me that there is a slightly better vector code sequence, at around 16 VALU instructions. That is still a massive performance cliff, so I still think it's justified to have an intrinsic that only allows uniform inputs.
Looks good overall.
; GFX11-NEXT: s_endpgm
entry:
  %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
  store i64 %br, ptr addrspace(1) %out
Any particular reason to use store i64 instead of ret i64 here?
As I understand it, the amdgpu_cs calling convention does not return anything, so I need to store the result to keep it.
I needed this (or a similar) calling convention because the inreg attribute then loads the argument into an SGPR instead of a VGPR. If there is a better way to do so, I am open to any suggestions!
Thanks for explaining. I had not noticed that this test uses amdgpu_cs and the others do not.
I admit it is a bit odd at first glance. However, I did not want to 'clutter' the other tests with the stores, so I made an exception for this case.
; GFX11-LABEL: test_s_bitreplicate_constant:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], 0x85fe3a92
As a follow up you could implement constant folding for this in lib/Analysis/ConstantFolding.cpp.
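To illustrate what such a fold would produce (assuming, as sketched near the top, that each input bit is duplicated into two adjacent output bits; the folded value below is my own computation and should be verified):

```llvm
; before folding (the constant from the test above):
%br = call i64 @llvm.amdgcn.s.bitreplicate(i32 u0x85FE3A92)
; after folding, all uses of %br would simply see the constant
;   i64 u0xC033FFFC0FCCC30C
```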
// Lowers to S_BITREPLICATE_B64_B32.
// The argument must be uniform; otherwise, the result is undefined.
def int_amdgcn_s_bitreplicate :
  Intrinsic<[llvm_i64_ty], [llvm_i32_ty],
Use DefaultAttrsIntrinsic to imply all the properties defined with IntrinsicProperty<1> in Intrinsics.td.
Add intrinsic for s_bitreplicate. Lower to S_BITREPLICATE_B64_B32 machine instruction in both GISel and Selection DAG. Support VGPR arguments by inserting a `v_readfirstlane`.
Force-pushed from 1dc836b to e9291a9.