Skip to content

Commit d9333e3

Browse files
committed
Revert "AMDGPU: Drop and auto-upgrade llvm.amdgcn.ldexp to llvm.ldexp"
This reverts commit 1159c67. Accidentally pushed wrong patch
1 parent 1159c67 commit d9333e3

File tree

9 files changed

+319
-40
lines changed

9 files changed

+319
-40
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,12 @@ def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
362362
def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic<
363363
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;
364364

365+
// For int_amdgcn_ldexp_f16, only the low 16 bits of the i32 src1 operand will be used.
366+
def int_amdgcn_ldexp : DefaultAttrsIntrinsic<
367+
[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
368+
[IntrNoMem, IntrSpeculatable]
369+
>;
370+
365371
def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic<
366372
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
367373
>;

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -836,13 +836,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
836836
{F->getReturnType()});
837837
return true;
838838
}
839-
if (Name.startswith("amdgcn.ldexp")) {
840-
// Target specific intrinsic became redundant
841-
NewFn = Intrinsic::getDeclaration(
842-
F->getParent(), Intrinsic::ldexp,
843-
{F->getReturnType(), F->getArg(1)->getType()});
844-
return true;
845-
}
846839

847840
break;
848841
}

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3220,7 +3220,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
32203220
case Intrinsic::amdgcn_rsq:
32213221
case Intrinsic::amdgcn_rcp_legacy:
32223222
case Intrinsic::amdgcn_rsq_legacy:
3223-
case Intrinsic::amdgcn_rsq_clamp: {
3223+
case Intrinsic::amdgcn_rsq_clamp:
3224+
case Intrinsic::amdgcn_ldexp: {
32243225
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
32253226
SDValue Src = N->getOperand(1);
32263227
return Src.isUndef() ? Src : SDValue();

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -996,6 +996,50 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
996996

997997
break;
998998
}
999+
case Intrinsic::amdgcn_ldexp: {
1000+
// FIXME: This doesn't introduce new instructions and belongs in
1001+
// InstructionSimplify.
1002+
Type *Ty = II.getType();
1003+
Value *Op0 = II.getArgOperand(0);
1004+
Value *Op1 = II.getArgOperand(1);
1005+
1006+
// Folding undef to qnan is safe regardless of the FP mode.
1007+
if (isa<UndefValue>(Op0)) {
1008+
auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
1009+
return IC.replaceInstUsesWith(II, QNaN);
1010+
}
1011+
1012+
const APFloat *C = nullptr;
1013+
match(Op0, PatternMatch::m_APFloat(C));
1014+
1015+
// FIXME: Should flush denorms depending on FP mode, but that's ignored
1016+
// everywhere else.
1017+
//
1018+
// These cases should be safe, even with strictfp.
1019+
// ldexp(0.0, x) -> 0.0
1020+
// ldexp(-0.0, x) -> -0.0
1021+
// ldexp(inf, x) -> inf
1022+
// ldexp(-inf, x) -> -inf
1023+
if (C && (C->isZero() || C->isInfinity())) {
1024+
return IC.replaceInstUsesWith(II, Op0);
1025+
}
1026+
1027+
// With strictfp, be more careful about possibly needing to flush denormals
1028+
// or not, and snan behavior depends on ieee_mode.
1029+
if (II.isStrictFP())
1030+
break;
1031+
1032+
if (C && C->isNaN())
1033+
return IC.replaceInstUsesWith(II, ConstantFP::get(Ty, C->makeQuiet()));
1034+
1035+
// ldexp(x, 0) -> x
1036+
// ldexp(x, undef) -> x
1037+
if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
1038+
return IC.replaceInstUsesWith(II, Op0);
1039+
}
1040+
1041+
break;
1042+
}
9991043
case Intrinsic::amdgcn_fmul_legacy: {
10001044
Value *Op0 = II.getArgOperand(0);
10011045
Value *Op1 = II.getArgOperand(1);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7223,6 +7223,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
72237223

72247224
return emitRemovedIntrinsicError(DAG, DL, VT);
72257225
}
7226+
case Intrinsic::amdgcn_ldexp:
7227+
return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(1), Op.getOperand(2));
7228+
72267229
case Intrinsic::amdgcn_fract:
72277230
return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
72287231

@@ -10669,6 +10672,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
1066910672
case Intrinsic::amdgcn_div_fmas:
1067010673
case Intrinsic::amdgcn_div_fixup:
1067110674
case Intrinsic::amdgcn_fract:
10675+
case Intrinsic::amdgcn_ldexp:
1067210676
case Intrinsic::amdgcn_cvt_pkrtz:
1067310677
case Intrinsic::amdgcn_cubeid:
1067410678
case Intrinsic::amdgcn_cubema:

llvm/test/Bitcode/amdgcn-ldexp.ll

Lines changed: 0 additions & 30 deletions
This file was deleted.

llvm/test/CodeGen/AMDGPU/known-never-snan.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,7 @@ define float @v_test_known_not_snan_ldexp_input_fmed3_r_i_i_f32(float %a, i32 %b
516516
; GCN-NEXT: v_ldexp_f32 v0, v0, v1
517517
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
518518
; GCN-NEXT: s_setpc_b64 s[30:31]
519-
%known.not.snan = call float @llvm.ldexp.f32.i32(float %a, i32 %b)
519+
%known.not.snan = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b)
520520
%max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
521521
%med = call float @llvm.minnum.f32(float %max, float 4.0)
522522
ret float %med
@@ -658,7 +658,7 @@ declare float @llvm.maxnum.f32(float, float) #1
658658
declare float @llvm.copysign.f32(float, float) #1
659659
declare float @llvm.fma.f32(float, float, float) #1
660660
declare float @llvm.fmuladd.f32(float, float, float) #1
661-
declare float @llvm.ldexp.f32.i32(float, i32) #1
661+
declare float @llvm.amdgcn.ldexp.f32(float, i32) #1
662662
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
663663
declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
664664
declare float @llvm.amdgcn.frexp.mant.f32(float) #1
Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
3+
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
4+
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
5+
6+
declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b)
7+
8+
define amdgpu_kernel void @ldexp_f16(
9+
; VI-LABEL: ldexp_f16:
10+
; VI: ; %bb.0:
11+
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
12+
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
13+
; VI-NEXT: s_mov_b32 s3, 0xf000
14+
; VI-NEXT: s_mov_b32 s2, -1
15+
; VI-NEXT: s_mov_b32 s10, s2
16+
; VI-NEXT: s_mov_b32 s11, s3
17+
; VI-NEXT: s_waitcnt lgkmcnt(0)
18+
; VI-NEXT: s_mov_b32 s12, s6
19+
; VI-NEXT: s_mov_b32 s13, s7
20+
; VI-NEXT: s_mov_b32 s14, s2
21+
; VI-NEXT: s_mov_b32 s15, s3
22+
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
23+
; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0
24+
; VI-NEXT: s_mov_b32 s0, s4
25+
; VI-NEXT: s_movk_i32 s4, 0x8000
26+
; VI-NEXT: v_mov_b32_e32 v2, 0x7fff
27+
; VI-NEXT: s_mov_b32 s1, s5
28+
; VI-NEXT: s_waitcnt vmcnt(1)
29+
; VI-NEXT: v_med3_i32 v0, v0, s4, v2
30+
; VI-NEXT: s_waitcnt vmcnt(0)
31+
; VI-NEXT: v_ldexp_f16_e32 v0, v1, v0
32+
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
33+
; VI-NEXT: s_endpgm
34+
;
35+
; GFX10-LABEL: ldexp_f16:
36+
; GFX10: ; %bb.0:
37+
; GFX10-NEXT: s_clause 0x1
38+
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
39+
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
40+
; GFX10-NEXT: s_mov_b32 s2, -1
41+
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
42+
; GFX10-NEXT: s_mov_b32 s10, s2
43+
; GFX10-NEXT: s_mov_b32 s11, s3
44+
; GFX10-NEXT: s_mov_b32 s14, s2
45+
; GFX10-NEXT: s_mov_b32 s15, s3
46+
; GFX10-NEXT: s_movk_i32 s0, 0x8000
47+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
48+
; GFX10-NEXT: s_mov_b32 s12, s6
49+
; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
50+
; GFX10-NEXT: s_mov_b32 s13, s7
51+
; GFX10-NEXT: s_mov_b32 s1, s5
52+
; GFX10-NEXT: buffer_load_ushort v1, off, s[12:15], 0
53+
; GFX10-NEXT: s_waitcnt vmcnt(1)
54+
; GFX10-NEXT: v_med3_i32 v0, v0, s0, 0x7fff
55+
; GFX10-NEXT: s_mov_b32 s0, s4
56+
; GFX10-NEXT: s_waitcnt vmcnt(0)
57+
; GFX10-NEXT: v_ldexp_f16_e32 v0, v1, v0
58+
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
59+
; GFX10-NEXT: s_endpgm
60+
;
61+
; GFX11-LABEL: ldexp_f16:
62+
; GFX11: ; %bb.0:
63+
; GFX11-NEXT: s_clause 0x1
64+
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
65+
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
66+
; GFX11-NEXT: s_mov_b32 s10, -1
67+
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
68+
; GFX11-NEXT: s_mov_b32 s2, s10
69+
; GFX11-NEXT: s_mov_b32 s3, s11
70+
; GFX11-NEXT: s_mov_b32 s14, s10
71+
; GFX11-NEXT: s_mov_b32 s15, s11
72+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
73+
; GFX11-NEXT: s_mov_b32 s12, s6
74+
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
75+
; GFX11-NEXT: s_mov_b32 s13, s7
76+
; GFX11-NEXT: s_movk_i32 s0, 0x8000
77+
; GFX11-NEXT: buffer_load_u16 v1, off, s[12:15], 0
78+
; GFX11-NEXT: s_mov_b32 s8, s4
79+
; GFX11-NEXT: s_mov_b32 s9, s5
80+
; GFX11-NEXT: s_waitcnt vmcnt(1)
81+
; GFX11-NEXT: v_med3_i32 v0, v0, s0, 0x7fff
82+
; GFX11-NEXT: s_waitcnt vmcnt(0)
83+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
84+
; GFX11-NEXT: v_ldexp_f16_e32 v0, v1, v0
85+
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
86+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
87+
; GFX11-NEXT: s_endpgm
88+
ptr addrspace(1) %r,
89+
ptr addrspace(1) %a,
90+
ptr addrspace(1) %b) {
91+
%a.val = load half, ptr addrspace(1) %a
92+
%b.val = load i32, ptr addrspace(1) %b
93+
%r.val = call half @llvm.amdgcn.ldexp.f16(half %a.val, i32 %b.val)
94+
store half %r.val, ptr addrspace(1) %r
95+
ret void
96+
}
97+
98+
define amdgpu_kernel void @ldexp_f16_imm_a(
99+
; VI-LABEL: ldexp_f16_imm_a:
100+
; VI: ; %bb.0:
101+
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
102+
; VI-NEXT: s_mov_b32 s7, 0xf000
103+
; VI-NEXT: s_mov_b32 s6, -1
104+
; VI-NEXT: s_mov_b32 s10, s6
105+
; VI-NEXT: s_mov_b32 s11, s7
106+
; VI-NEXT: s_waitcnt lgkmcnt(0)
107+
; VI-NEXT: s_mov_b32 s8, s2
108+
; VI-NEXT: s_mov_b32 s9, s3
109+
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
110+
; VI-NEXT: s_mov_b32 s4, s0
111+
; VI-NEXT: s_movk_i32 s0, 0x8000
112+
; VI-NEXT: v_mov_b32_e32 v1, 0x7fff
113+
; VI-NEXT: s_mov_b32 s5, s1
114+
; VI-NEXT: s_waitcnt vmcnt(0)
115+
; VI-NEXT: v_med3_i32 v0, v0, s0, v1
116+
; VI-NEXT: v_ldexp_f16_e32 v0, 2.0, v0
117+
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
118+
; VI-NEXT: s_endpgm
119+
;
120+
; GFX10-LABEL: ldexp_f16_imm_a:
121+
; GFX10: ; %bb.0:
122+
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
123+
; GFX10-NEXT: s_mov_b32 s6, -1
124+
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
125+
; GFX10-NEXT: s_mov_b32 s10, s6
126+
; GFX10-NEXT: s_mov_b32 s11, s7
127+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
128+
; GFX10-NEXT: s_mov_b32 s8, s2
129+
; GFX10-NEXT: s_mov_b32 s9, s3
130+
; GFX10-NEXT: s_movk_i32 s2, 0x8000
131+
; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
132+
; GFX10-NEXT: s_mov_b32 s4, s0
133+
; GFX10-NEXT: s_mov_b32 s5, s1
134+
; GFX10-NEXT: s_waitcnt vmcnt(0)
135+
; GFX10-NEXT: v_med3_i32 v0, v0, s2, 0x7fff
136+
; GFX10-NEXT: v_ldexp_f16_e32 v0, 2.0, v0
137+
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
138+
; GFX10-NEXT: s_endpgm
139+
;
140+
; GFX11-LABEL: ldexp_f16_imm_a:
141+
; GFX11: ; %bb.0:
142+
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
143+
; GFX11-NEXT: s_mov_b32 s6, -1
144+
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
145+
; GFX11-NEXT: s_mov_b32 s10, s6
146+
; GFX11-NEXT: s_mov_b32 s11, s7
147+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
148+
; GFX11-NEXT: s_mov_b32 s8, s2
149+
; GFX11-NEXT: s_mov_b32 s9, s3
150+
; GFX11-NEXT: s_movk_i32 s2, 0x8000
151+
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
152+
; GFX11-NEXT: s_mov_b32 s4, s0
153+
; GFX11-NEXT: s_mov_b32 s5, s1
154+
; GFX11-NEXT: s_waitcnt vmcnt(0)
155+
; GFX11-NEXT: v_med3_i32 v0, v0, s2, 0x7fff
156+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
157+
; GFX11-NEXT: v_ldexp_f16_e32 v0, 2.0, v0
158+
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
159+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
160+
; GFX11-NEXT: s_endpgm
161+
ptr addrspace(1) %r,
162+
ptr addrspace(1) %b) {
163+
%b.val = load i32, ptr addrspace(1) %b
164+
%r.val = call half @llvm.amdgcn.ldexp.f16(half 2.0, i32 %b.val)
165+
store half %r.val, ptr addrspace(1) %r
166+
ret void
167+
}
168+
169+
define amdgpu_kernel void @ldexp_f16_imm_b(
170+
; VI-LABEL: ldexp_f16_imm_b:
171+
; VI: ; %bb.0:
172+
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
173+
; VI-NEXT: s_mov_b32 s7, 0xf000
174+
; VI-NEXT: s_mov_b32 s6, -1
175+
; VI-NEXT: s_mov_b32 s10, s6
176+
; VI-NEXT: s_mov_b32 s11, s7
177+
; VI-NEXT: s_waitcnt lgkmcnt(0)
178+
; VI-NEXT: s_mov_b32 s8, s2
179+
; VI-NEXT: s_mov_b32 s9, s3
180+
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
181+
; VI-NEXT: s_mov_b32 s4, s0
182+
; VI-NEXT: s_mov_b32 s5, s1
183+
; VI-NEXT: s_waitcnt vmcnt(0)
184+
; VI-NEXT: v_ldexp_f16_e64 v0, v0, 2
185+
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
186+
; VI-NEXT: s_endpgm
187+
;
188+
; GFX10-LABEL: ldexp_f16_imm_b:
189+
; GFX10: ; %bb.0:
190+
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
191+
; GFX10-NEXT: s_mov_b32 s6, -1
192+
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
193+
; GFX10-NEXT: s_mov_b32 s10, s6
194+
; GFX10-NEXT: s_mov_b32 s11, s7
195+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
196+
; GFX10-NEXT: s_mov_b32 s8, s2
197+
; GFX10-NEXT: s_mov_b32 s9, s3
198+
; GFX10-NEXT: s_mov_b32 s4, s0
199+
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
200+
; GFX10-NEXT: s_mov_b32 s5, s1
201+
; GFX10-NEXT: s_waitcnt vmcnt(0)
202+
; GFX10-NEXT: v_ldexp_f16_e64 v0, v0, 2
203+
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
204+
; GFX10-NEXT: s_endpgm
205+
;
206+
; GFX11-LABEL: ldexp_f16_imm_b:
207+
; GFX11: ; %bb.0:
208+
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
209+
; GFX11-NEXT: s_mov_b32 s6, -1
210+
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
211+
; GFX11-NEXT: s_mov_b32 s10, s6
212+
; GFX11-NEXT: s_mov_b32 s11, s7
213+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
214+
; GFX11-NEXT: s_mov_b32 s8, s2
215+
; GFX11-NEXT: s_mov_b32 s9, s3
216+
; GFX11-NEXT: s_mov_b32 s4, s0
217+
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
218+
; GFX11-NEXT: s_mov_b32 s5, s1
219+
; GFX11-NEXT: s_waitcnt vmcnt(0)
220+
; GFX11-NEXT: v_ldexp_f16_e64 v0, v0, 2
221+
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
222+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
223+
; GFX11-NEXT: s_endpgm
224+
ptr addrspace(1) %r,
225+
ptr addrspace(1) %a) {
226+
%a.val = load half, ptr addrspace(1) %a
227+
%r.val = call half @llvm.amdgcn.ldexp.f16(half %a.val, i32 2)
228+
store half %r.val, ptr addrspace(1) %r
229+
ret void
230+
}

0 commit comments

Comments
 (0)