Skip to content

Commit af5fd14

Browse files
committed
[AMDGPU] Extend f32 support for llvm.amdgcn.update.dpp intrinsic
This will be useful to avoid the bit-casting noise required to extend support for floating-point operations in the atomic optimizer for DPP in D156301. Reviewed By: arsenm, #amdgpu. Differential Revision: https://reviews.llvm.org/D156647
1 parent 81827f8 commit af5fd14

File tree

3 files changed

+130
-8
lines changed

3 files changed

+130
-8
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2177,7 +2177,7 @@ def int_amdgcn_mov_dpp :
21772177
// v_mov_b32 <dest> <old>
21782178
// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
21792179
def int_amdgcn_update_dpp :
2180-
Intrinsic<[llvm_anyint_ty],
2180+
Intrinsic<[llvm_any_ty],
21812181
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
21822182
llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
21832183
[IntrNoMem, IntrConvergent, IntrWillReturn,

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1203,15 +1203,18 @@ def : GCNPat <
12031203
(as_i1timm $bound_ctrl))
12041204
>;
12051205

1206-
def : GCNPat <
1207-
(i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl,
1206+
class UpdateDPPPat<ValueType vt> : GCNPat <
1207+
(vt (int_amdgcn_update_dpp vt:$old, vt:$src, timm:$dpp_ctrl,
12081208
timm:$row_mask, timm:$bank_mask,
12091209
timm:$bound_ctrl)),
12101210
(V_MOV_B32_dpp VGPR_32:$old, VGPR_32:$src, (as_i32timm $dpp_ctrl),
12111211
(as_i32timm $row_mask), (as_i32timm $bank_mask),
12121212
(as_i1timm $bound_ctrl))
12131213
>;
12141214

1215+
def : UpdateDPPPat<i32>;
1216+
def : UpdateDPPPat<f32>;
1217+
12151218
} // End OtherPredicates = [isGFX8Plus]
12161219

12171220
let OtherPredicates = [isGFX8Plus] in {

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

Lines changed: 124 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
; GFX8-NOOPT: s_nop 1
1212
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
1313
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
14-
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) #0
14+
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0
1515
store i32 %tmp0, ptr addrspace(1) %out
1616
ret void
1717
}
@@ -24,7 +24,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
2424
; GFX8-NOOPT: s_nop 1
2525
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1{{$}}
2626
define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
27-
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 1) #0
27+
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 true) #0
2828
store i32 %tmp0, ptr addrspace(1) %out
2929
ret void
3030
}
@@ -63,7 +63,7 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
6363
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
6464
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
6565
%load = load i64, ptr addrspace(1) %gep
66-
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
66+
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
6767
store i64 %tmp0, ptr addrspace(1) %gep
6868
ret void
6969
}
@@ -83,7 +83,7 @@ define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64
8383
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
8484
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
8585
%load = load i64, ptr addrspace(1) %gep
86-
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
86+
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
8787
store i64 %tmp0, ptr addrspace(1) %gep
8888
ret void
8989
}
@@ -98,14 +98,133 @@ define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64
9898
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
9999
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
100100
define amdgpu_kernel void @update_dpp64_imm_src_test(ptr addrspace(1) %out, i64 %in1) {
101-
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
101+
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 false) #0
102102
store i64 %tmp0, ptr addrspace(1) %out
103103
ret void
104104
}
105105

106+
; GCN-LABEL: {{^}}dpp_test_f32:
107+
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
108+
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
109+
; GFX8-OPT: s_mov
110+
; GFX8-OPT: s_mov
111+
; GFX8-NOOPT: s_nop 1
112+
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
113+
define amdgpu_kernel void @dpp_test_f32(ptr addrspace(1) %out, float %in1, float %in2) {
114+
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 1, i32 1, i1 false)
115+
store float %tmp0, ptr addrspace(1) %out
116+
ret void
117+
}
118+
119+
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb1:
120+
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
121+
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
122+
; GFX8-OPT: s_mov
123+
; GFX8-OPT: s_mov
124+
; GFX8-NOOPT: s_nop 1
125+
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
126+
define amdgpu_kernel void @dpp_test_f32_imm_comb1(ptr addrspace(1) %out, float %in1, float %in2) {
127+
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 0, i32 0, i32 0, i1 false)
128+
store float %tmp0, ptr addrspace(1) %out
129+
ret void
130+
}
131+
132+
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb2:
133+
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
134+
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
135+
; GFX8-OPT: s_mov
136+
; GFX8-OPT: s_mov
137+
; GFX8-NOOPT: s_nop 1
138+
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
139+
define amdgpu_kernel void @dpp_test_f32_imm_comb2(ptr addrspace(1) %out, float %in1, float %in2) {
140+
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 3, i32 3, i32 3, i1 false)
141+
store float %tmp0, ptr addrspace(1) %out
142+
ret void
143+
}
144+
145+
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb3:
146+
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
147+
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
148+
; GFX8-OPT: s_mov
149+
; GFX8-OPT: s_mov
150+
; GFX8-NOOPT: s_nop 1
151+
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
152+
define amdgpu_kernel void @dpp_test_f32_imm_comb3(ptr addrspace(1) %out, float %in1, float %in2) {
153+
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 2, i32 3, i1 true)
154+
store float %tmp0, ptr addrspace(1) %out
155+
ret void
156+
}
157+
158+
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb4:
159+
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
160+
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
161+
; GFX8-OPT: s_mov
162+
; GFX8-OPT: s_mov
163+
; GFX8-NOOPT: s_nop 1
164+
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
165+
define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %in1, float %in2) {
166+
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 4, i32 3, i32 2, i1 true)
167+
store float %tmp0, ptr addrspace(1) %out
168+
ret void
169+
}
170+
171+
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb5:
172+
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
173+
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
174+
; GFX8-OPT: s_mov
175+
; GFX8-OPT: s_mov
176+
; GFX8-NOOPT: s_nop 1
177+
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
178+
define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %in1, float %in2) {
179+
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 62, i32 61, i1 true)
180+
store float %tmp0, ptr addrspace(1) %out
181+
ret void
182+
}
183+
184+
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb6:
185+
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
186+
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
187+
; GFX8-OPT: s_mov
188+
; GFX8-OPT: s_mov
189+
; GFX8-NOOPT: s_nop 1
190+
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
191+
define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %in1, float %in2) {
192+
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 63, i32 63, i1 true)
193+
store float %tmp0, ptr addrspace(1) %out
194+
ret void
195+
}
196+
197+
198+
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb7:
199+
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
200+
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
201+
; GFX8-OPT: s_mov
202+
; GFX8-OPT: s_mov
203+
; GFX8-NOOPT: s_nop 1
204+
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
205+
define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %in1, float %in2) {
206+
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 64, i32 64, i1 true)
207+
store float %tmp0, ptr addrspace(1) %out
208+
ret void
209+
}
210+
211+
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb8:
212+
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
213+
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
214+
; GFX8-OPT: s_mov
215+
; GFX8-OPT: s_mov
216+
; GFX8-NOOPT: s_nop 1
217+
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
218+
define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %in1, float %in2) {
219+
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 63, i32 128, i1 true)
220+
store float %tmp0, ptr addrspace(1) %out
221+
ret void
222+
}
223+
106224
declare i32 @llvm.amdgcn.workitem.id.x()
107225
declare void @llvm.amdgcn.s.barrier()
108226
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
227+
declare float @llvm.amdgcn.update.dpp.f32(float, float, i32, i32, i32, i1) #0
109228
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
110229

111230
attributes #0 = { nounwind readnone convergent }

0 commit comments

Comments
 (0)