Skip to content

Commit 7cd2974

Browse files
authored
[AMDGPU] Extend mov_dpp8 intrinsic lowering for generic types (#114296)
The int_amdgcn_mov_dpp8 intrinsic is overloaded, but instruction selection can only handle i32. To allow a corresponding builtin to be overloaded the same way as int_amdgcn_mov_dpp, the lowering needs to be able to split unsupported values.
1 parent a776bd1 commit 7cd2974

File tree

5 files changed

+174
-7
lines changed

5 files changed

+174
-7
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2461,7 +2461,7 @@ def int_amdgcn_permlanex16 :
24612461
// <sel> is a 32-bit constant whose high 8 bits must be zero which selects
24622462
// the lanes to read from.
24632463
def int_amdgcn_mov_dpp8 :
2464-
Intrinsic<[llvm_anyint_ty],
2464+
Intrinsic<[llvm_any_ty],
24652465
[LLVMMatchType<0>, llvm_i32_ty],
24662466
[IntrNoMem, IntrConvergent, IntrWillReturn,
24672467
ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>;

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5494,6 +5494,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54945494
.addImm(Src5)
54955495
.getReg(0);
54965496
}
5497+
case Intrinsic::amdgcn_mov_dpp8:
5498+
return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
54975499
default:
54985500
llvm_unreachable("unhandled lane op");
54995501
}
@@ -7529,6 +7531,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
75297531
case Intrinsic::amdgcn_permlane64:
75307532
case Intrinsic::amdgcn_set_inactive:
75317533
case Intrinsic::amdgcn_set_inactive_chain_arg:
7534+
case Intrinsic::amdgcn_mov_dpp8:
75327535
return legalizeLaneOp(Helper, MI, IntrID);
75337536
case Intrinsic::amdgcn_s_buffer_prefetch_data:
75347537
return legalizeSBufferPrefetch(Helper, MI);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6182,6 +6182,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61826182
case Intrinsic::amdgcn_readlane:
61836183
case Intrinsic::amdgcn_set_inactive:
61846184
case Intrinsic::amdgcn_set_inactive_chain_arg:
6185+
case Intrinsic::amdgcn_mov_dpp8:
61856186
Operands.push_back(Src1);
61866187
[[fallthrough]];
61876188
case Intrinsic::amdgcn_readfirstlane:
@@ -6208,7 +6209,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62086209
SDValue Src0 = N->getOperand(1);
62096210
SDValue Src1, Src2;
62106211
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6211-
IsSetInactive || IsPermLane16) {
6212+
IID == Intrinsic::amdgcn_mov_dpp8 || IsSetInactive || IsPermLane16) {
62126213
Src1 = N->getOperand(2);
62136214
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
62146215
Src2 = N->getOperand(3);
@@ -8834,6 +8835,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
88348835
case Intrinsic::amdgcn_permlane64:
88358836
case Intrinsic::amdgcn_set_inactive:
88368837
case Intrinsic::amdgcn_set_inactive_chain_arg:
8838+
case Intrinsic::amdgcn_mov_dpp8:
88378839
return lowerLaneOp(*this, Op.getNode(), DAG);
88388840
default:
88398841
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1508,12 +1508,14 @@ defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>;
15081508
defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
15091509
defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
15101510

1511-
class MovDPP8Pattern<Predicate Pred, Instruction Inst> : GCNPat <
1512-
(i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
1511+
class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat <
1512+
(vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)),
15131513
(Inst VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))> {
15141514
let OtherPredicates = [Pred];
15151515
}
15161516

1517-
def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10>;
1518-
def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11>;
1519-
def : MovDPP8Pattern<isGFX12Only, V_MOV_B32_dpp8_gfx12>;
1517+
foreach vt = Reg32Types.types in {
1518+
def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10, vt>;
1519+
def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11, vt>;
1520+
def : MovDPP8Pattern<isGFX12Only, V_MOV_B32_dpp8_gfx12, vt>;
1521+
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,166 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) {
2424
ret void
2525
}
2626

27+
; GFX10PLUS-LABEL: {{^}}dpp8_i64:
28+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
29+
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
30+
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
31+
define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) {
32+
%tmp0 = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1)
33+
store i64 %tmp0, ptr addrspace(1) %out
34+
ret void
35+
}
36+
37+
; GFX10PLUS-LABEL: {{^}}dpp8_v2i32:
38+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
39+
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
40+
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
41+
define amdgpu_ps void @dpp8_v2i32(<2 x i32> %in, ptr addrspace(1) %out) {
42+
%tmp0 = call <2 x i32> @llvm.amdgcn.mov.dpp8.v2i32(<2 x i32> %in, i32 1)
43+
store <2 x i32> %tmp0, ptr addrspace(1) %out
44+
ret void
45+
}
46+
47+
; GFX10PLUS-LABEL: {{^}}dpp8_v3i32:
48+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
49+
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
50+
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
51+
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
52+
define amdgpu_ps void @dpp8_v3i32(<3 x i32> %in, ptr addrspace(1) %out) {
53+
%tmp0 = call <3 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<3 x i32> %in, i32 1)
54+
store <3 x i32> %tmp0, ptr addrspace(1) %out
55+
ret void
56+
}
57+
58+
; GFX10PLUS-LABEL: {{^}}dpp8_v4i32:
59+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
60+
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
61+
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
62+
; GFX10PLUS-DAG: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0]
63+
; GFX10PLUS-DAG: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off
64+
define amdgpu_ps void @dpp8_v4i32(<4 x i32> %in, ptr addrspace(1) %out) {
65+
%tmp0 = call <4 x i32> @llvm.amdgcn.mov.dpp8.v4i32(<4 x i32> %in, i32 1)
66+
store <4 x i32> %tmp0, ptr addrspace(1) %out
67+
ret void
68+
}
69+
70+
; GFX10PLUS-LABEL: {{^}}dpp8_p0:
71+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
72+
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
73+
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
74+
define amdgpu_ps void @dpp8_p0(ptr %in, ptr addrspace(1) %out) {
75+
%tmp0 = call ptr @llvm.amdgcn.mov.dpp8.p0(ptr %in, i32 1)
76+
store ptr %tmp0, ptr addrspace(1) %out
77+
ret void
78+
}
79+
80+
; GFX10PLUS-LABEL: {{^}}dpp8_p3:
81+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
82+
; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off
83+
define amdgpu_ps void @dpp8_p3(ptr addrspace(3) %in, ptr addrspace(1) %out) {
84+
%tmp0 = call ptr addrspace(3) @llvm.amdgcn.mov.dpp8.p3(ptr addrspace(3) %in, i32 1)
85+
store ptr addrspace(3) %tmp0, ptr addrspace(1) %out
86+
ret void
87+
}
88+
89+
; GFX10PLUS-LABEL: {{^}}dpp8_v3p3:
90+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
91+
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
92+
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
93+
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
94+
define amdgpu_ps void @dpp8_v3p3(<3 x ptr addrspace(3)> %in, ptr addrspace(1) %out) {
95+
%tmp0 = call <3 x ptr addrspace(3)> @llvm.amdgcn.mov.dpp8.v3p3(<3 x ptr addrspace(3)> %in, i32 1)
96+
store <3 x ptr addrspace(3)> %tmp0, ptr addrspace(1) %out
97+
ret void
98+
}
99+
100+
; GFX10PLUS-LABEL: {{^}}dpp8_i16:
101+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
102+
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
103+
define amdgpu_ps void @dpp8_i16(i16 %in, ptr addrspace(1) %out) {
104+
%tmp0 = call i16 @llvm.amdgcn.mov.dpp8.i16(i16 %in, i32 1)
105+
store i16 %tmp0, ptr addrspace(1) %out
106+
ret void
107+
}
108+
109+
; GFX10PLUS-LABEL: {{^}}dpp8_v4i16:
110+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
111+
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
112+
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
113+
define amdgpu_ps void @dpp8_v4i16(<4 x i16> %in, ptr addrspace(1) %out) {
114+
%tmp0 = call <4 x i16> @llvm.amdgcn.mov.dpp8.v4i16(<4 x i16> %in, i32 1)
115+
store <4 x i16> %tmp0, ptr addrspace(1) %out
116+
ret void
117+
}
118+
119+
; GFX10PLUS-LABEL: {{^}}dpp8_v4f16:
120+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
121+
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
122+
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
123+
define amdgpu_ps void @dpp8_v4f16(<4 x half> %in, ptr addrspace(1) %out) {
124+
%tmp0 = call <4 x half> @llvm.amdgcn.mov.dpp8.v4f16(<4 x half> %in, i32 1)
125+
store <4 x half> %tmp0, ptr addrspace(1) %out
126+
ret void
127+
}
128+
129+
; GFX10PLUS-LABEL: {{^}}dpp8_float:
130+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
131+
; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off
132+
define amdgpu_ps void @dpp8_float(float %in, ptr addrspace(1) %out) {
133+
%tmp0 = call float @llvm.amdgcn.mov.dpp8.f32(float %in, i32 1)
134+
store float %tmp0, ptr addrspace(1) %out
135+
ret void
136+
}
137+
138+
; GFX10PLUS-LABEL: {{^}}dpp8_v3f32:
139+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
140+
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
141+
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
142+
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
143+
define amdgpu_ps void @dpp8_v3f32(<3 x float> %in, ptr addrspace(1) %out) {
144+
%tmp0 = call <3 x float> @llvm.amdgcn.mov.dpp8.v3f32(<3 x float> %in, i32 1)
145+
store <3 x float> %tmp0, ptr addrspace(1) %out
146+
ret void
147+
}
148+
149+
; GFX10PLUS-LABEL: {{^}}dpp8_half:
150+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
151+
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
152+
define amdgpu_ps void @dpp8_half(half %in, ptr addrspace(1) %out) {
153+
%tmp0 = call half @llvm.amdgcn.mov.dpp8.f16(half %in, i32 1)
154+
store half %tmp0, ptr addrspace(1) %out
155+
ret void
156+
}
157+
158+
; GFX10PLUS-LABEL: {{^}}dpp8_bfloat:
159+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
160+
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
161+
define amdgpu_ps void @dpp8_bfloat(bfloat %in, ptr addrspace(1) %out) {
162+
%tmp0 = call bfloat @llvm.amdgcn.mov.dpp8.bf16(bfloat %in, i32 1)
163+
store bfloat %tmp0, ptr addrspace(1) %out
164+
ret void
165+
}
166+
167+
; GFX10PLUS-LABEL: {{^}}dpp8_v4bf16:
168+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
169+
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
170+
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
171+
define amdgpu_ps void @dpp8_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
172+
%tmp0 = call <4 x bfloat> @llvm.amdgcn.mov.dpp8.v4bf16(<4 x bfloat> %in, i32 1)
173+
store <4 x bfloat> %tmp0, ptr addrspace(1) %out
174+
ret void
175+
}
176+
177+
; GFX10PLUS-LABEL: {{^}}dpp8_double:
178+
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
179+
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
180+
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
181+
define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) {
182+
%tmp0 = call double @llvm.amdgcn.mov.dpp8.f64(double %in, i32 1)
183+
store double %tmp0, ptr addrspace(1) %out
184+
ret void
185+
}
186+
27187
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0
28188

29189
attributes #0 = { nounwind readnone convergent }

0 commit comments

Comments
 (0)