Commit 2932259

[AMDGPU] MoveToVALU for pseudo scalar transcendental instructions

1 parent beceb75

2 files changed (+229 -1 lines)

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 11 additions & 1 deletion
@@ -5305,6 +5305,16 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
   case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
   case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
+  case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
+  case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_t16_e64;
+  case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
+  case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_t16_e64;
+  case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
+  case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_t16_e64;
+  case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
+  case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_t16_e64;
+  case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
+  case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_t16_e64;
   }
   llvm_unreachable(
       "Unexpected scalar opcode without corresponding vector one!");
@@ -7189,7 +7199,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
   // Use the new VALU Opcode.
   auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
                       .setMIFlags(Inst.getFlags());
-  if (isVOP3(NewOpcode)) {
+  if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
     // Intersperse VOP3 modifiers among the SALU operands.
     NewInstr->addOperand(Inst.getOperand(0));
     if (AMDGPU::getNamedOperandIdx(NewOpcode,
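The added !isVOP3(Opcode) guard matters because the V_S_* pseudo scalar instructions are themselves VOP3-encoded: unlike a plain SALU source, they already carry source-modifier operands, so the modifier-interspersing path would build a malformed operand list for them. A sketch of the two paths under that reading (the comments carry the substance; this is not the verbatim moveToVALUImpl code):

// Hedged sketch of the operand-layout decision after this change.
if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
  // SALU -> VOP3: e.g. S_CMP_LT_F32 (src0, src1) becomes
  // V_CMP_LT_F32_e64 (src0_modifiers, src0, src1_modifiers, src1, ...),
  // so zeroed modifier operands are interspersed among the SALU operands.
} else {
  // Already-VOP3 pseudo -> VOP3: e.g. V_S_SQRT_F32_e64 and V_SQRT_F32_e64
  // share the (src0_modifiers, src0, clamp, omod) layout, so the operands
  // can be copied across unchanged.
}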
Lines changed: 218 additions & 0 deletions (new test file)

@@ -0,0 +1,218 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
; RUN: llc -march=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s

define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: exp_f32
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F32_e64_]]
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
  %val = load volatile float, ptr addrspace(1) %ptr
  %res = call float @llvm.amdgcn.exp2.f32(float %val)
  store float %res, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: exp_f16
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_t16_e64_]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
  %val = load volatile half, ptr addrspace(1) %ptr
  %res = call half @llvm.amdgcn.exp2.f16(half %val)
  store half %res, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: log_f32
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_LOG_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F32_e64_]]
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
  %val = load volatile float, ptr addrspace(1) %ptr
  %res = call float @llvm.amdgcn.log.f32(float %val)
  store float %res, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: log_f16
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_t16_e64_]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
  %val = load volatile half, ptr addrspace(1) %ptr
  %res = call half @llvm.amdgcn.log.f16(half %val)
  store half %res, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: rcp_f32
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F32_e64_]]
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
  %val = load volatile float, ptr addrspace(1) %ptr
  %res = call float @llvm.amdgcn.rcp.f32(float %val)
  store float %res, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: rcp_f16
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_t16_e64_]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
  %val = load volatile half, ptr addrspace(1) %ptr
  %res = call half @llvm.amdgcn.rcp.f16(half %val)
  store half %res, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: rsq_f32
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RSQ_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F32_e64_]]
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
  %val = load volatile float, ptr addrspace(1) %ptr
  %res = call float @llvm.amdgcn.rsq.f32(float %val)
  store float %res, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: rsq_f16
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_t16_e64_]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
  %val = load volatile half, ptr addrspace(1) %ptr
  %res = call half @llvm.amdgcn.rsq.f16(half %val)
  store half %res, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: sqrt_f32
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SQRT_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F32_e64_]]
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
  %val = load volatile float, ptr addrspace(1) %ptr
  %res = call float @llvm.amdgcn.sqrt.f32(float %val)
  store float %res, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: sqrt_f16
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_t16_e64_]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
  %val = load volatile half, ptr addrspace(1) %ptr
  %res = call half @llvm.amdgcn.sqrt.f16(half %val)
  store half %res, ptr addrspace(1) %ptr
  ret void
}

declare float @llvm.amdgcn.exp2.f32(float)
declare half @llvm.amdgcn.exp2.f16(half)
declare float @llvm.amdgcn.log.f32(float)
declare half @llvm.amdgcn.log.f16(half)
declare float @llvm.amdgcn.rcp.f32(float)
declare half @llvm.amdgcn.rcp.f16(half)
declare float @llvm.amdgcn.rsq.f32(float)
declare half @llvm.amdgcn.rsq.f16(half)
declare float @llvm.amdgcn.sqrt.f32(float)
declare half @llvm.amdgcn.sqrt.f16(half)
