Skip to content

Commit b8d3ccd

Browse files
authored
[AMDGPU] - Add s_bitreplicate intrinsic (#69209)
Add an intrinsic for s_bitreplicate. It is lowered to the S_BITREPLICATE_B64_B32 machine instruction in both GlobalISel and SelectionDAG. VGPR arguments are supported by inserting a `v_readfirstlane`.
1 parent 6a06155 commit b8d3ccd

File tree

5 files changed

+66
-1
lines changed

5 files changed

+66
-1
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1927,6 +1927,11 @@ def int_amdgcn_inverse_ballot :
19271927
Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
19281928
[IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
19291929

1930+
// Takes one i32 argument and returns an i64; lowers to
// S_BITREPLICATE_B64_B32.
1931+
// The argument must be uniform; otherwise, the result is undefined.
// A VGPR argument is still accepted: codegen reads it with v_readfirstlane.
1932+
def int_amdgcn_s_bitreplicate :
1933+
DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
1934+
19301935
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
19311936
[data_ty],
19321937
[

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2994,6 +2994,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
29942994
applyMappingBFE(B, OpdMapper, false);
29952995
return;
29962996
// inverse_ballot and s_bitreplicate share one path: apply the default
// mapping, then constrain operand 2 via constrainOpWithReadfirstlane
// (presumably inserting a v_readfirstlane when that operand is not already
// in an SGPR -- confirm against the helper).
case Intrinsic::amdgcn_inverse_ballot:
2997+
case Intrinsic::amdgcn_s_bitreplicate:
29972998
applyDefaultMapping(OpdMapper);
29982999
constrainOpWithReadfirstlane(B, MI, 2); // Mask
29993000
return;
@@ -4546,6 +4547,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45464547
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
45474548
break;
45484549
}
4550+
// s_bitreplicate: the 64-bit result (operand 0) is always mapped to the SGPR
// bank. The 32-bit source (operand 2) is mapped to whatever bank
// getRegBankID reports for it (SGPR is passed as that call's third
// argument, presumably the default when no bank is assigned yet).
// NOTE(review): MaskReg/MaskBank are declared directly under the case label,
// without a braced scope, and the case relies on the `break` that follows the
// switch's closing brace. C++ only accepts this while no other case label
// follows in the same switch -- consider `{ ... break; }` as the preceding
// case does.
case Intrinsic::amdgcn_s_bitreplicate:
4551+
Register MaskReg = MI.getOperand(2).getReg();
4552+
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4553+
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4554+
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
45494555
}
45504556
break;
45514557
}

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6473,6 +6473,14 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
64736473
return CreatedBB;
64746474
}
64756475

6476+
// Legalize S_BITREPLICATE: if the source (operand 1) is a register whose
// register class contains vector registers, replace it with the register
// returned by readlaneVGPRToSGPR. Non-register sources are left as they are.
// Either way this opcode needs no further legalization, so return early.
6477+
if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32) {
6478+
MachineOperand &Src = MI.getOperand(1);
6479+
if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6480+
Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6481+
return CreatedBB;
6482+
}
6483+
64766484
// Legalize MIMG and MUBUF/MTBUF for shaders.
64776485
//
64786486
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,8 @@ let SubtargetPredicate = isGFX9Plus in {
362362
} // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
363363

364364
let isReMaterializable = 1 in
365-
def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
365+
// The pattern selects int_amdgcn_s_bitreplicate (i32 source, i64 result)
// directly to this instruction.
def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32",
366+
[(set i64:$sdst, (int_amdgcn_s_bitreplicate i32:$src0))]>;
366367
} // End SubtargetPredicate = isGFX9Plus
367368

368369
let SubtargetPredicate = isGFX10Plus in {
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
3+
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
4+
5+
declare i64 @llvm.amdgcn.s.bitreplicate(i32)
6+
7+
; Constant argument: the checks expect the constant to appear as an immediate
; operand of s_bitreplicate_b64_b32.
define i64 @test_s_bitreplicate_constant() {
8+
; GFX11-LABEL: test_s_bitreplicate_constant:
9+
; GFX11: ; %bb.0: ; %entry
10+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11+
; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], 0x85fe3a92
12+
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
13+
; GFX11-NEXT: s_setpc_b64 s[30:31]
14+
entry:
15+
%br = call i64 @llvm.amdgcn.s.bitreplicate(i32 u0x85FE3A92)
16+
ret i64 %br
17+
}
18+
19+
; inreg argument: the checks expect %mask in s0 to feed s_bitreplicate_b64_b32
; directly, with no v_readfirstlane.
define amdgpu_cs void @test_s_bitreplicate_sgpr(i32 inreg %mask, ptr addrspace(1) %out) {
20+
; GFX11-LABEL: test_s_bitreplicate_sgpr:
21+
; GFX11: ; %bb.0: ; %entry
22+
; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
23+
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
24+
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
25+
; GFX11-NEXT: s_nop 0
26+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
27+
; GFX11-NEXT: s_endpgm
28+
entry:
29+
%br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
30+
store i64 %br, ptr addrspace(1) %out
31+
ret void
32+
}
33+
34+
; VGPR argument: the checks expect %mask in v0 to be copied to an SGPR with
; v_readfirstlane_b32 before s_bitreplicate_b64_b32 consumes it.
define i64 @test_s_bitreplicate_vgpr(i32 %mask) {
35+
; GFX11-LABEL: test_s_bitreplicate_vgpr:
36+
; GFX11: ; %bb.0: ; %entry
37+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38+
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
39+
; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
40+
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
41+
; GFX11-NEXT: s_setpc_b64 s[30:31]
42+
entry:
43+
%br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
44+
ret i64 %br
45+
}

0 commit comments

Comments
 (0)