Skip to content

Commit 75e6f0b

Browse files
committed
AMDGPU: Add flag to disable promotion of uniform i16 ops
This interferes with GlobalISel's much better handling of the situation. This should really be disabled for GlobalISel. However, the fallback only re-runs the selection passes, and doesn't go back and re-run any codegen IR passes. I haven't come up with a good solution to this problem.
1 parent 2b7a2cb commit 75e6f0b

File tree

3 files changed

+61
-148
lines changed

3 files changed

+61
-148
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,12 @@ static cl::opt<bool> WidenLoads(
6060
cl::ReallyHidden,
6161
cl::init(false));
6262

63+
static cl::opt<bool> Widen16BitOps(
64+
"amdgpu-codegenprepare-widen-16-bit-ops",
65+
cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
66+
cl::ReallyHidden,
67+
cl::init(true));
68+
6369
static cl::opt<bool> UseMul24Intrin(
6470
"amdgpu-codegenprepare-mul24",
6571
cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
@@ -269,6 +275,9 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
269275
}
270276

271277
bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
278+
if (!Widen16BitOps)
279+
return false;
280+
272281
const IntegerType *IntTy = dyn_cast<IntegerType>(T);
273282
if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
274283
return true;

llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll

Lines changed: 26 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2+
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3+
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
44

55
define amdgpu_ps i32 @s_andn2_i32(i32 inreg %src0, i32 inreg %src1) {
66
; GCN-LABEL: s_andn2_i32:
@@ -196,58 +196,31 @@ define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i3
196196
}
197197

198198
define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
199-
; GFX6-LABEL: s_andn2_i16:
200-
; GFX6: ; %bb.0:
201-
; GFX6-NEXT: s_andn2_b32 s0, s2, s3
202-
; GFX6-NEXT: ; return to shader part epilog
203-
;
204-
; GFX9-LABEL: s_andn2_i16:
205-
; GFX9: ; %bb.0:
206-
; GFX9-NEXT: s_mov_b32 s0, 0xffff
207-
; GFX9-NEXT: s_and_b32 s1, s3, s0
208-
; GFX9-NEXT: s_xor_b32 s0, s1, s0
209-
; GFX9-NEXT: s_and_b32 s0, s2, s0
210-
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
211-
; GFX9-NEXT: ; return to shader part epilog
199+
; GCN-LABEL: s_andn2_i16:
200+
; GCN: ; %bb.0:
201+
; GCN-NEXT: s_andn2_b32 s0, s2, s3
202+
; GCN-NEXT: ; return to shader part epilog
212203
%not.src1 = xor i16 %src1, -1
213204
%and = and i16 %src0, %not.src1
214205
ret i16 %and
215206
}
216207

217208
define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
218-
; GFX6-LABEL: s_andn2_i16_commute:
219-
; GFX6: ; %bb.0:
220-
; GFX6-NEXT: s_andn2_b32 s0, s2, s3
221-
; GFX6-NEXT: ; return to shader part epilog
222-
;
223-
; GFX9-LABEL: s_andn2_i16_commute:
224-
; GFX9: ; %bb.0:
225-
; GFX9-NEXT: s_mov_b32 s0, 0xffff
226-
; GFX9-NEXT: s_and_b32 s1, s3, s0
227-
; GFX9-NEXT: s_xor_b32 s0, s1, s0
228-
; GFX9-NEXT: s_and_b32 s0, s0, s2
229-
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
230-
; GFX9-NEXT: ; return to shader part epilog
209+
; GCN-LABEL: s_andn2_i16_commute:
210+
; GCN: ; %bb.0:
211+
; GCN-NEXT: s_andn2_b32 s0, s2, s3
212+
; GCN-NEXT: ; return to shader part epilog
231213
%not.src1 = xor i16 %src1, -1
232214
%and = and i16 %not.src1, %src0
233215
ret i16 %and
234216
}
235217

236218
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
237-
; GFX6-LABEL: s_andn2_i16_multi_use:
238-
; GFX6: ; %bb.0:
239-
; GFX6-NEXT: s_xor_b32 s1, s3, -1
240-
; GFX6-NEXT: s_andn2_b32 s0, s2, s3
241-
; GFX6-NEXT: ; return to shader part epilog
242-
;
243-
; GFX9-LABEL: s_andn2_i16_multi_use:
244-
; GFX9: ; %bb.0:
245-
; GFX9-NEXT: s_mov_b32 s0, 0xffff
246-
; GFX9-NEXT: s_and_b32 s1, s3, s0
247-
; GFX9-NEXT: s_xor_b32 s1, s1, s0
248-
; GFX9-NEXT: s_and_b32 s0, s2, s1
249-
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
250-
; GFX9-NEXT: ; return to shader part epilog
219+
; GCN-LABEL: s_andn2_i16_multi_use:
220+
; GCN: ; %bb.0:
221+
; GCN-NEXT: s_xor_b32 s1, s3, -1
222+
; GCN-NEXT: s_andn2_b32 s0, s2, s3
223+
; GCN-NEXT: ; return to shader part epilog
251224
%not.src1 = xor i16 %src1, -1
252225
%and = and i16 %src0, %not.src1
253226
%insert.0 = insertvalue { i16, i16 } undef, i16 %and, 0
@@ -256,23 +229,11 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
256229
}
257230

258231
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
259-
; GFX6-LABEL: s_andn2_i16_multi_foldable_use:
260-
; GFX6: ; %bb.0:
261-
; GFX6-NEXT: s_andn2_b32 s0, s2, s4
262-
; GFX6-NEXT: s_andn2_b32 s1, s3, s4
263-
; GFX6-NEXT: ; return to shader part epilog
264-
;
265-
; GFX9-LABEL: s_andn2_i16_multi_foldable_use:
266-
; GFX9: ; %bb.0:
267-
; GFX9-NEXT: s_mov_b32 s1, 0xffff
268-
; GFX9-NEXT: s_and_b32 s0, s4, s1
269-
; GFX9-NEXT: s_xor_b32 s0, s0, s1
270-
; GFX9-NEXT: s_and_b32 s2, s2, s1
271-
; GFX9-NEXT: s_and_b32 s4, s0, s1
272-
; GFX9-NEXT: s_and_b32 s1, s3, s1
273-
; GFX9-NEXT: s_and_b32 s0, s2, s4
274-
; GFX9-NEXT: s_and_b32 s1, s1, s4
275-
; GFX9-NEXT: ; return to shader part epilog
232+
; GCN-LABEL: s_andn2_i16_multi_foldable_use:
233+
; GCN: ; %bb.0:
234+
; GCN-NEXT: s_andn2_b32 s0, s2, s4
235+
; GCN-NEXT: s_andn2_b32 s1, s3, s4
236+
; GCN-NEXT: ; return to shader part epilog
276237
%not.src2 = xor i16 %src2, -1
277238
%and0 = and i16 %src0, %not.src2
278239
%and1 = and i16 %src1, %not.src2
@@ -308,21 +269,12 @@ define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, i16 %src1) {
308269
}
309270

310271
define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
311-
; GFX6-LABEL: v_andn2_i16_vs:
312-
; GFX6: ; %bb.0:
313-
; GFX6-NEXT: s_xor_b32 s0, s2, -1
314-
; GFX6-NEXT: v_and_b32_e32 v0, s0, v0
315-
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
316-
; GFX6-NEXT: ; return to shader part epilog
317-
;
318-
; GFX9-LABEL: v_andn2_i16_vs:
319-
; GFX9: ; %bb.0:
320-
; GFX9-NEXT: s_mov_b32 s0, 0xffff
321-
; GFX9-NEXT: s_and_b32 s1, s2, s0
322-
; GFX9-NEXT: s_xor_b32 s0, s1, s0
323-
; GFX9-NEXT: v_and_b32_e32 v0, s0, v0
324-
; GFX9-NEXT: v_bfe_u32 v0, v0, 0, 16
325-
; GFX9-NEXT: ; return to shader part epilog
272+
; GCN-LABEL: v_andn2_i16_vs:
273+
; GCN: ; %bb.0:
274+
; GCN-NEXT: s_xor_b32 s0, s2, -1
275+
; GCN-NEXT: v_and_b32_e32 v0, s0, v0
276+
; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16
277+
; GCN-NEXT: ; return to shader part epilog
326278
%not.src1 = xor i16 %src1, -1
327279
%and = and i16 %src0, %not.src1
328280
%zext = zext i16 %and to i32

llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll

Lines changed: 26 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2+
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3+
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
44

55
define amdgpu_ps i32 @s_orn2_i32(i32 inreg %src0, i32 inreg %src1) {
66
; GCN-LABEL: s_orn2_i32:
@@ -196,58 +196,31 @@ define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32
196196
}
197197

198198
define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
199-
; GFX6-LABEL: s_orn2_i16:
200-
; GFX6: ; %bb.0:
201-
; GFX6-NEXT: s_orn2_b32 s0, s2, s3
202-
; GFX6-NEXT: ; return to shader part epilog
203-
;
204-
; GFX9-LABEL: s_orn2_i16:
205-
; GFX9: ; %bb.0:
206-
; GFX9-NEXT: s_mov_b32 s0, 0xffff
207-
; GFX9-NEXT: s_and_b32 s1, s3, s0
208-
; GFX9-NEXT: s_xor_b32 s0, s1, s0
209-
; GFX9-NEXT: s_or_b32 s0, s2, s0
210-
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
211-
; GFX9-NEXT: ; return to shader part epilog
199+
; GCN-LABEL: s_orn2_i16:
200+
; GCN: ; %bb.0:
201+
; GCN-NEXT: s_orn2_b32 s0, s2, s3
202+
; GCN-NEXT: ; return to shader part epilog
212203
%not.src1 = xor i16 %src1, -1
213204
%or = or i16 %src0, %not.src1
214205
ret i16 %or
215206
}
216207

217208
define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
218-
; GFX6-LABEL: s_orn2_i16_commute:
219-
; GFX6: ; %bb.0:
220-
; GFX6-NEXT: s_orn2_b32 s0, s2, s3
221-
; GFX6-NEXT: ; return to shader part epilog
222-
;
223-
; GFX9-LABEL: s_orn2_i16_commute:
224-
; GFX9: ; %bb.0:
225-
; GFX9-NEXT: s_mov_b32 s0, 0xffff
226-
; GFX9-NEXT: s_and_b32 s1, s3, s0
227-
; GFX9-NEXT: s_xor_b32 s0, s1, s0
228-
; GFX9-NEXT: s_or_b32 s0, s0, s2
229-
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
230-
; GFX9-NEXT: ; return to shader part epilog
209+
; GCN-LABEL: s_orn2_i16_commute:
210+
; GCN: ; %bb.0:
211+
; GCN-NEXT: s_orn2_b32 s0, s2, s3
212+
; GCN-NEXT: ; return to shader part epilog
231213
%not.src1 = xor i16 %src1, -1
232214
%or = or i16 %not.src1, %src0
233215
ret i16 %or
234216
}
235217

236218
define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
237-
; GFX6-LABEL: s_orn2_i16_multi_use:
238-
; GFX6: ; %bb.0:
239-
; GFX6-NEXT: s_xor_b32 s1, s3, -1
240-
; GFX6-NEXT: s_orn2_b32 s0, s2, s3
241-
; GFX6-NEXT: ; return to shader part epilog
242-
;
243-
; GFX9-LABEL: s_orn2_i16_multi_use:
244-
; GFX9: ; %bb.0:
245-
; GFX9-NEXT: s_mov_b32 s0, 0xffff
246-
; GFX9-NEXT: s_and_b32 s1, s3, s0
247-
; GFX9-NEXT: s_xor_b32 s1, s1, s0
248-
; GFX9-NEXT: s_or_b32 s0, s2, s1
249-
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
250-
; GFX9-NEXT: ; return to shader part epilog
219+
; GCN-LABEL: s_orn2_i16_multi_use:
220+
; GCN: ; %bb.0:
221+
; GCN-NEXT: s_xor_b32 s1, s3, -1
222+
; GCN-NEXT: s_orn2_b32 s0, s2, s3
223+
; GCN-NEXT: ; return to shader part epilog
251224
%not.src1 = xor i16 %src1, -1
252225
%or = or i16 %src0, %not.src1
253226
%insert.0 = insertvalue { i16, i16 } undef, i16 %or, 0
@@ -256,23 +229,11 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
256229
}
257230

258231
define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
259-
; GFX6-LABEL: s_orn2_i16_multi_foldable_use:
260-
; GFX6: ; %bb.0:
261-
; GFX6-NEXT: s_orn2_b32 s0, s2, s4
262-
; GFX6-NEXT: s_orn2_b32 s1, s3, s4
263-
; GFX6-NEXT: ; return to shader part epilog
264-
;
265-
; GFX9-LABEL: s_orn2_i16_multi_foldable_use:
266-
; GFX9: ; %bb.0:
267-
; GFX9-NEXT: s_mov_b32 s1, 0xffff
268-
; GFX9-NEXT: s_and_b32 s0, s4, s1
269-
; GFX9-NEXT: s_xor_b32 s0, s0, s1
270-
; GFX9-NEXT: s_and_b32 s2, s2, s1
271-
; GFX9-NEXT: s_and_b32 s4, s0, s1
272-
; GFX9-NEXT: s_and_b32 s1, s3, s1
273-
; GFX9-NEXT: s_or_b32 s0, s2, s4
274-
; GFX9-NEXT: s_or_b32 s1, s1, s4
275-
; GFX9-NEXT: ; return to shader part epilog
232+
; GCN-LABEL: s_orn2_i16_multi_foldable_use:
233+
; GCN: ; %bb.0:
234+
; GCN-NEXT: s_orn2_b32 s0, s2, s4
235+
; GCN-NEXT: s_orn2_b32 s1, s3, s4
236+
; GCN-NEXT: ; return to shader part epilog
276237
%not.src2 = xor i16 %src2, -1
277238
%or0 = or i16 %src0, %not.src2
278239
%or1 = or i16 %src1, %not.src2
@@ -308,21 +269,12 @@ define amdgpu_ps float @v_orn2_i16_sv(i16 inreg %src0, i16 %src1) {
308269
}
309270

310271
define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) {
311-
; GFX6-LABEL: v_orn2_i16_vs:
312-
; GFX6: ; %bb.0:
313-
; GFX6-NEXT: s_xor_b32 s0, s2, -1
314-
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
315-
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
316-
; GFX6-NEXT: ; return to shader part epilog
317-
;
318-
; GFX9-LABEL: v_orn2_i16_vs:
319-
; GFX9: ; %bb.0:
320-
; GFX9-NEXT: s_mov_b32 s0, 0xffff
321-
; GFX9-NEXT: s_and_b32 s1, s2, s0
322-
; GFX9-NEXT: s_xor_b32 s0, s1, s0
323-
; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
324-
; GFX9-NEXT: v_bfe_u32 v0, v0, 0, 16
325-
; GFX9-NEXT: ; return to shader part epilog
272+
; GCN-LABEL: v_orn2_i16_vs:
273+
; GCN: ; %bb.0:
274+
; GCN-NEXT: s_xor_b32 s0, s2, -1
275+
; GCN-NEXT: v_or_b32_e32 v0, s0, v0
276+
; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16
277+
; GCN-NEXT: ; return to shader part epilog
326278
%not.src1 = xor i16 %src1, -1
327279
%or = or i16 %src0, %not.src1
328280
%zext = zext i16 %or to i32

0 commit comments

Comments
 (0)