Skip to content

Commit f44beec

Browse files
committed
AMDGPU: Try to use private version of sincos if available
The comment was out of date; the device libs build does provide all the pointer overloads. An extremely pedantic interpretation of the spec would suggest only the flat version exists, but the overloads do exist in the implementation. https://reviews.llvm.org/D156720
1 parent 9f369a4 commit f44beec

File tree

5 files changed

+206
-245
lines changed

5 files changed

+206
-245
lines changed

llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,18 +1065,18 @@ AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
10651065
B.SetCurrentDebugLocation(DL);
10661066
}
10671067

1068-
Value *P = Alloc;
1069-
Type *PTy = Fsincos.getFunctionType()->getParamType(1);
1068+
Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1);
1069+
10701070
// The allocaInst allocates the memory in private address space. This needs
1071-
// to be bitcasted to point to the address space of cos pointer type.
1071+
// to be addrspacecasted to point to the address space of cos pointer type.
10721072
// In OpenCL 2.0 this is generic, while in 1.2 that is private.
1073-
if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
1074-
P = B.CreateAddrSpaceCast(Alloc, PTy);
1073+
Value *CastAlloc = B.CreateAddrSpaceCast(Alloc, CosPtrTy);
10751074

1076-
CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, P);
1075+
CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, CastAlloc);
10771076

10781077
// TODO: Is it worth trying to preserve the location for the cos calls for the
10791078
// load?
1079+
10801080
LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
10811081
return {SinCos, LoadCos, SinCos};
10821082
}
@@ -1100,15 +1100,19 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
11001100
Function *F = B.GetInsertBlock()->getParent();
11011101
Module *M = F->getParent();
11021102

1103-
// Merge the sin and cos.
1103+
// Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
1104+
// implementation. Prefer the private form if available.
1105+
AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
1106+
SinCosLibFuncPrivate.getLeads()[0].PtrKind =
1107+
AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS);
11041108

1105-
// for OpenCL 2.0 we have only generic implementation of sincos
1106-
// function.
1107-
// FIXME: This is not true anymore
1108-
AMDGPULibFunc SinCosLibFunc(AMDGPULibFunc::EI_SINCOS, fInfo);
1109-
SinCosLibFunc.getLeads()[0].PtrKind =
1109+
AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
1110+
SinCosLibFuncGeneric.getLeads()[0].PtrKind =
11101111
AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
1111-
FunctionCallee FSinCos = getFunction(M, SinCosLibFunc);
1112+
1113+
FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
1114+
FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
1115+
FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
11121116
if (!FSinCos)
11131117
return false;
11141118

@@ -1121,7 +1125,8 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
11211125

11221126
StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
11231127
StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
1124-
const std::string SinCosName = SinCosLibFunc.mangle();
1128+
const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
1129+
const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();
11251130

11261131
// Intersect the two sets of flags.
11271132
FastMathFlags FMF = FPOp->getFastMathFlags();
@@ -1144,7 +1149,8 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
11441149
SinCalls.push_back(XI);
11451150
else if (UCallee->getName() == CosName)
11461151
CosCalls.push_back(XI);
1147-
else if (UCallee->getName() == SinCosName)
1152+
else if (UCallee->getName() == SinCosPrivateName ||
1153+
UCallee->getName() == SinCosGenericName)
11481154
SinCosCalls.push_back(XI);
11491155
else
11501156
Handled = false;

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -106,12 +106,11 @@ define void @sincos_f32(float %x, ptr addrspace(1) nocapture writeonly %sin_out,
106106
; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
107107
; CHECK-NEXT: entry:
108108
; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
109-
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
110-
; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
111-
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
112-
; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
109+
; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]])
110+
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
111+
; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
113112
; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
114-
; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
113+
; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
115114
; CHECK-NEXT: ret void
116115
;
117116
entry:
@@ -127,12 +126,11 @@ define void @sincos_f32_value_is_same_constantfp(ptr addrspace(1) nocapture writ
127126
; CHECK-SAME: (ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
128127
; CHECK-NEXT: entry:
129128
; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
130-
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
131-
; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float 4.200000e+01, ptr [[TMP0]])
132-
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
133-
; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
129+
; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float 4.200000e+01, ptr addrspace(5) [[__SINCOS_]])
130+
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
131+
; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
134132
; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float 4.200000e+01)
135-
; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
133+
; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
136134
; CHECK-NEXT: ret void
137135
;
138136
entry:
@@ -157,12 +155,11 @@ define void @sincos_v2f32(<2 x float> %x, ptr addrspace(1) nocapture writeonly %
157155
; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
158156
; CHECK-NEXT: entry:
159157
; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
160-
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
161-
; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
162-
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
163-
; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
158+
; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
159+
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
160+
; CHECK-NEXT: store <2 x float> [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 8
164161
; CHECK-NEXT: [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> [[X]])
165-
; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
162+
; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 8
166163
; CHECK-NEXT: ret void
167164
;
168165
entry:

0 commit comments

Comments
 (0)