[AMDGPU] - Add s_bitreplicate intrinsic #69209
Conversation
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-amdgpu

Author: Jessica Del (OutOfCache)

Changes: Part of SC1-5107. Add an intrinsic for s_bitreplicate. Lower it to the S_BITREPLICATE_B64_B32 machine instruction in both GlobalISel and SelectionDAG. Support VGPR arguments by inserting a v_readfirstlane.

Full diff: https://github.com/llvm/llvm-project/pull/69209.diff

7 Files Affected:
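For context, my understanding of the operation (not part of the patch, and worth double-checking against the ISA documentation) is that s_bitreplicate copies each bit i of the 32-bit source to bits 2*i and 2*i+1 of the 64-bit result. A plain-IR sketch of that behaviour, using illustrative names only, would be:

```llvm
; Illustrative scalar reference only: spread the 32 input bits to the even
; positions of a 64-bit value, then OR with a shifted copy so each input bit
; occupies a pair of adjacent output bits.
define i64 @bitreplicate_reference(i32 %x) {
  %v = zext i32 %x to i64
  %s16 = shl i64 %v, 16
  %o16 = or i64 %v, %s16
  %m16 = and i64 %o16, u0x0000FFFF0000FFFF
  %s8 = shl i64 %m16, 8
  %o8 = or i64 %m16, %s8
  %m8 = and i64 %o8, u0x00FF00FF00FF00FF
  %s4 = shl i64 %m8, 4
  %o4 = or i64 %m8, %s4
  %m4 = and i64 %o4, u0x0F0F0F0F0F0F0F0F
  %s2 = shl i64 %m4, 2
  %o2 = or i64 %m4, %s2
  %m2 = and i64 %o2, u0x3333333333333333
  %s1 = shl i64 %m2, 1
  %o1 = or i64 %m2, %s1
  %m1 = and i64 %o1, u0x5555555555555555
  ; each input bit is now at position 2*i; duplicate it into position 2*i+1
  %hi = shl i64 %m1, 1
  %res = or i64 %m1, %hi
  ret i64 %res
}
```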
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4f42462f655e260..66aa7862893d049 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1927,6 +1927,11 @@ def int_amdgcn_inverse_ballot :
Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
[IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_bitreplicate :
+ Intrinsic<[llvm_i64_ty], [llvm_i32_ty],
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
[data_ty],
[
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b5ceaaa14b4fd5e..ff7b6724def9b1d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2568,6 +2568,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
llvm_unreachable("Unsupported size for inverse ballot mask.");
}
break;
+ case Intrinsic::amdgcn_s_bitreplicate:
+ Opcode = AMDGPU::S_BITREPLICATE_B64_B32;
+ break;
default:
SelectCode(N);
return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 31d72fb8cadd8a6..1b2f809f102d308 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1067,6 +1067,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return selectBallot(I);
case Intrinsic::amdgcn_inverse_ballot:
return selectInverseBallot(I);
+ case Intrinsic::amdgcn_s_bitreplicate:
+ return selectBitReplicate(I);
case Intrinsic::amdgcn_reloc_constant:
return selectRelocConstant(I);
case Intrinsic::amdgcn_groupstaticsize:
@@ -1470,6 +1472,18 @@ bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectBitReplicate(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register MaskReg = I.getOperand(2).getReg();
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BITREPLICATE_B64_B32), DstReg)
+ .addReg(MaskReg);
+ I.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 93e45fcd8682f07..2bea2a5fe0804d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -114,6 +114,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectIntrinsicCmp(MachineInstr &MI) const;
bool selectBallot(MachineInstr &I) const;
bool selectInverseBallot(MachineInstr &I) const;
+ bool selectBitReplicate(MachineInstr &I) const;
bool selectRelocConstant(MachineInstr &I) const;
bool selectGroupStaticSize(MachineInstr &I) const;
bool selectReturnAddress(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5b056bd9e5dba2c..bbc0bc58ade3ada 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2994,6 +2994,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingBFE(B, OpdMapper, false);
return;
case Intrinsic::amdgcn_inverse_ballot:
+ case Intrinsic::amdgcn_s_bitreplicate:
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 2); // Mask
return;
@@ -4544,6 +4545,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
break;
}
+ case Intrinsic::amdgcn_s_bitreplicate:
+ Register MaskReg = MI.getOperand(2).getReg();
+ unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
+ OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
}
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 51397cbb791469d..295e878341b438e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6297,6 +6297,14 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
return CreatedBB;
}
+ // Legalize S_BITREPLICATE
+ if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32) {
+ MachineOperand &Src = MI.getOperand(1);
+ if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
+ Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
+ return CreatedBB;
+ }
+
// Legalize MIMG and MUBUF/MTBUF for shaders.
//
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
new file mode 100644
index 000000000000000..027c9ef5e7cc349
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+declare i64 @llvm.amdgcn.s.bitreplicate(i32)
+
+define i64 @test_s_bitreplicate_constant() {
+; GFX11-LABEL: test_s_bitreplicate_constant:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], 0x85fe3a92
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 u0x85FE3A92)
+ ret i64 %br
+}
+
+define amdgpu_cs void @test_s_bitreplicate_sgpr(i32 inreg %mask, ptr addrspace(1) %out) {
+; GFX11-LABEL: test_s_bitreplicate_sgpr:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+ %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
+ store i64 %br, ptr addrspace(1) %out
+ ret void
+}
+
+define i64 @test_s_bitreplicate_vgpr(i32 %mask) {
+; GFX11-LABEL: test_s_bitreplicate_vgpr:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
+ ret i64 %br
+}
  if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32) {
    MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
    return CreatedBB;
  }
This unfortunately seems necessary. The FixSGPRCopies pass incorrectly decides to turn s_bitreplicate into a VALU instruction when there is a VGPR input. This is a similar issue to D45826.
I refrained from writing a helper method because there would be only two call sites for now. If it makes sense to add one, I will.
Emitting readfirstlane is just wrong, you have to emulate the operation with a VALU expansion if needed
> Emitting readfirstlane is just wrong, you have to emulate the operation with a VALU expansion if needed

Even with the convergent attribute on the intrinsic? I thought we could rule out any divergent inputs with that.
No, at best that prevents the optimizer from introducing new divergences. You cannot guarantee that an arbitrary input was uniform from the start.
There is more discussion and reasoning about this in the main thread of this PR, but as far as I understand the intention, the intrinsic only accepts uniform values. From the intrinsic definition:

> The argument must be uniform; otherwise, the result is undefined.

A uniform value can still end up in a VGPR, so it makes sense to me to use readfirstlane in that case.
Unfortunately this is not correct behaviour. You would need to either generate a waterfall loop, or find some other way to implement the bitreplicate operation using VALU instructions. The problem is that, even if the input program only uses bitreplicate on uniform arguments, the compiler could theoretically transform code that only ever passes it uniform values into code that passes it a divergent value, so the backend needs to be prepared to handle divergent arguments. @arsenm @nhaehnle please correct me if I am wrong about this.
I see, that makes sense. A waterfall loop seems reasonable in that case. Thanks for pointing that out!
The intrinsic is marked convergent.
I don't understand. What is the definition of the intrinsic? What result are you supposed to get if you give it a non-uniform input? (Maybe we should actually start documenting these instruction-like intrinsics, so we're not just arguing about the definition in a vacuum.)
Yes, the intrinsic should be defined by a comment. My thinking is that since there is no good vector expansion of S_BITREPLICATE, and the whole point of the intrinsic is to generate a faster code sequence when we know it can be used, we should define the intrinsic such that the argument must be uniform. And to be explicit, it is UB (or maybe returns poison, I don't really care, but it shouldn't be fully defined IMO) if the argument is not uniform. This justifies the use of readfirstlane.
I can see why that's attractive but I have previously been told that it is not possible: https://reviews.llvm.org/D139817#3991676
I don't think that works. I'm talking about transforming two separate bitreplicate calls whose results are selected between into a single bitreplicate of the selected value, where the select condition may be divergent.
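Roughly, and reusing the value names from the explanation further down in this thread (this is a sketch, not IR from the patch):

```llvm
; before: both operands of the select are results of the convergent call
%res1 = call i64 @llvm.amdgcn.s.bitreplicate(i32 %val1)
%res2 = call i64 @llvm.amdgcn.s.bitreplicate(i32 %val2)
%res  = select i1 %cond, i64 %res1, i64 %res2

; after: the select is hoisted above the call, so the call now receives a
; value that is divergent whenever %cond is divergent
%val = select i1 %cond, i32 %val1, i32 %val2
%res = call i64 @llvm.amdgcn.s.bitreplicate(i32 %val)
```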
When I write a test like that, the assembly output I get looks fine to me in the end.
Yes, that is fine. But I'm saying that a generic IR optimization would be allowed to transform that IR into the combined form (a single bitreplicate of the selected value), even if bitreplicate is marked as convergent.
The intended reasoning for why this generic transform is forbidden when the operation is convergent goes back to ConvergentOperations.rst. It seems we maybe never state this explicitly, but the set of threads that execute a dynamic instance of a convergent operation together is part of its semantics, even for an operation like bitreplicate that does not read values from other lanes.

Let's say we have two threads, 0 and 1, that both execute:

%res1 = bitreplicate(%val1)
%res2 = bitreplicate(%val2)
%res = select %cond, %res1, %res2

Thread 0 participates in two bitreplicate operations. In the transformed program,

%val = select %cond, %val1, %val2
%res = bitreplicate(%val)

Thread 0 participates only in a single bitreplicate operation, so the set of threads communicating in each operation changes, which is exactly what the convergent attribute is meant to rule out.
Thanks for explaining. It would be great if we had an Alive that understood convergent operations.
Is there a way to add a test demonstrating this behavior, or is that not possible/necessary? I could add the IR test based on this scenario, but is that enough to prove the illegality of the transform?
Should retitle to add bitreplicate intrinsic
  if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32) {
    MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
    return CreatedBB;
  }
Emitting readfirstlane is just wrong, you have to emulate the operation with a VALU expansion if needed
I think this is good, though there should still be a comment in IntrinsicsAMDGPU.td describing the constraints / definition of the intrinsic.
> Emitting readfirstlane is just wrong, you have to emulate the operation with a VALU expansion if needed
That's how we usually do it, but this case is a bit unusual. It's the combination of:
- The whole point of adding the intrinsic is to expose a corner case optimization opportunity provided by this SALU instruction
- All VALU expansions I can think of are completely horrible, leaving a massive performance footgun if the intrinsic ever happens to run into a VGPR input
- We wouldn't have good test coverage for the complex VALU expansion
The cheapest VALU expansion I can think of is:
- 2x v_perm_b32 to distribute bytes of the input
- For 3 levels of bit width (4, 2, 1), do:
  - 4x v_and_b32 to mask out low and high halves
  - 2x v_lshl_or_b32 for shifted combine of the masked parts
... which amounts to 20 VALU instructions in place of 1 SALU instruction.
For these reasons, I recommended to define the intrinsic such that it's only defined for uniform inputs.
I forgot a final lshl_or in the sequence above, but it randomly occurred to me that there is a slightly better vector code sequence, at around 16 VALU instructions. That is still a massive performance cliff, so I still think it's justified to have an intrinsic that only allows uniform inputs.
Looks good overall.
; GFX11-NEXT: s_endpgm
entry:
  %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
  store i64 %br, ptr addrspace(1) %out
Any particular reason to use store i64 instead of ret i64 here?
As I understand it, the amdgpu_cs calling convention does not return anything, so I need to store the result to keep it.
I needed this (or a similar) calling convention because the inreg attribute then loads the argument into an SGPR instead of a VGPR. If there is a better way to do so, I am open to any suggestions!
Thanks for explaining. I had not noticed that this test uses amdgpu_cs and the others do not.
I admit it is a bit odd at first glance. However, I did not want to 'clutter' the other tests with the stores, so I made an exception for this case.
; GFX11-LABEL: test_s_bitreplicate_constant:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], 0x85fe3a92
As a follow up you could implement constant folding for this in lib/Analysis/ConstantFolding.cpp.
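To illustrate what such a fold would produce (assuming, as sketched near the top, that each input bit is duplicated into two adjacent output bits; the folded value below is my own computation and should be verified):

```llvm
; before folding (the constant from the test above):
%br = call i64 @llvm.amdgcn.s.bitreplicate(i32 u0x85FE3A92)
; after folding, all uses of %br would simply see the constant
;   i64 u0xC033FFFC0FCCC30C
```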
// Lowers to S_BITREPLICATE_B64_B32.
// The argument must be uniform; otherwise, the result is undefined.
def int_amdgcn_s_bitreplicate :
  Intrinsic<[llvm_i64_ty], [llvm_i32_ty],
Use DefaultAttrsIntrinsic to imply all the properties defined with IntrinsicProperty<1> in Intrinsics.td.
Add intrinsic for s_bitreplicate. Lower to S_BITREPLICATE_B64_B32 machine instruction in both GISel and Selection DAG. Support VGPR arguments by inserting a `v_readfirstlane`.
Force-pushed from 1dc836b to e9291a9.