@@ -82,8 +82,12 @@ class AMDGPULibCalls {
82
82
// sqrt
83
83
bool fold_sqrt (FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
84
84
85
- bool insertSinCos (CallInst *Sin, CallInst *Cos, IRBuilder<> &B,
86
- const FuncInfo &FInfo);
85
+ /// Insert a call to the sincos function \p Fsincos on \p Arg. Returns (value
86
+ /// of sin, value of cos, sincos call).
87
+ std::tuple<Value *, Value *, Value *> insertSinCos (Value *Arg,
88
+ FastMathFlags FMF,
89
+ IRBuilder<> &B,
90
+ FunctionCallee Fsincos);
87
91
88
92
// sin/cos
89
93
bool fold_sincos (FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
@@ -1041,40 +1045,24 @@ bool AMDGPULibCalls::fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B,
1041
1045
return false ;
1042
1046
}
1043
1047
1044
- bool AMDGPULibCalls::insertSinCos (CallInst *Sin, CallInst *Cos, IRBuilder<> &B,
1045
- const FuncInfo &fInfo ) {
1046
- Value *Arg = Sin->getOperand (0 );
1047
- assert (Arg == Cos->getOperand (0 ));
1048
-
1048
+ std::tuple<Value *, Value *, Value *>
1049
+ AMDGPULibCalls::insertSinCos (Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
1050
+ FunctionCallee Fsincos) {
1051
+ DebugLoc DL = B.getCurrentDebugLocation ();
1049
1052
Function *F = B.GetInsertBlock ()->getParent ();
1050
- Module *M = F->getParent ();
1051
- // Merge the sin and cos.
1052
-
1053
- // for OpenCL 2.0 we have only generic implementation of sincos
1054
- // function.
1055
- // FIXME: This is not true anymore
1056
- AMDGPULibFunc nf (AMDGPULibFunc::EI_SINCOS, fInfo );
1057
- nf.getLeads ()[0 ].PtrKind =
1058
- AMDGPULibFunc::getEPtrKindFromAddrSpace (AMDGPUAS::FLAT_ADDRESS);
1059
- FunctionCallee Fsincos = getFunction (M, nf);
1060
- if (!Fsincos)
1061
- return false ;
1062
-
1063
1053
B.SetInsertPointPastAllocas (F);
1064
1054
1065
- DILocation *MergedDebugLoc =
1066
- DILocation::getMergedLocation (Sin->getDebugLoc (), Cos->getDebugLoc ());
1067
- B.SetCurrentDebugLocation (MergedDebugLoc);
1068
-
1069
- AllocaInst *Alloc = B.CreateAlloca (Sin->getType (), nullptr , " __sincos_" );
1055
+ AllocaInst *Alloc = B.CreateAlloca (Arg->getType (), nullptr , " __sincos_" );
1070
1056
1071
1057
if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
1072
1058
// If the argument is an instruction, it must dominate all uses so put our
1073
1059
// sincos call there. Otherwise, right after the allocas works well enough
1074
1060
// if it's an argument or constant.
1075
1061
1076
1062
B.SetInsertPoint (ArgInst->getParent (), ++ArgInst->getIterator ());
1077
- B.SetCurrentDebugLocation (MergedDebugLoc);
1063
+
1064
+ // SetInsertPoint unwelcomely always tries to set the debug loc.
1065
+ B.SetCurrentDebugLocation (DL);
1078
1066
}
1079
1067
1080
1068
Value *P = Alloc;
@@ -1085,25 +1073,12 @@ bool AMDGPULibCalls::insertSinCos(CallInst *Sin, CallInst *Cos, IRBuilder<> &B,
1085
1073
if (PTy->getPointerAddressSpace () != AMDGPUAS::PRIVATE_ADDRESS)
1086
1074
P = B.CreateAddrSpaceCast (Alloc, PTy);
1087
1075
1088
- // Intersect the two sets of flags.
1089
- FastMathFlags FMF = cast<FPMathOperator>(Sin)->getFastMathFlags ();
1090
- FMF &= cast<FPMathOperator>(Cos)->getFastMathFlags ();
1091
- B.setFastMathFlags (FMF);
1092
-
1093
- CallInst *Call = CreateCallEx2 (B, Fsincos, Arg, P);
1094
- LoadInst *Reload = B.CreateLoad (Alloc->getAllocatedType (), Alloc);
1095
- Reload->setDebugLoc (Cos->getDebugLoc ());
1096
-
1097
- LLVM_DEBUG (errs () << " AMDIC: fold_sincos (" << *Sin << " , " << *Cos
1098
- << " ) with " << *Call << ' \n ' );
1099
-
1100
- Sin->replaceAllUsesWith (Call);
1101
- Sin->eraseFromParent ();
1102
-
1103
- Cos->replaceAllUsesWith (Reload);
1104
- Cos->eraseFromParent ();
1076
+ CallInst *SinCos = CreateCallEx2 (B, Fsincos, Arg, P);
1105
1077
1106
- return true ;
1078
+ // TODO: Is it worth trying to preserve the location for the cos calls for the
1079
+ // load?
1080
+ LoadInst *LoadCos = B.CreateLoad (Alloc->getAllocatedType (), Alloc);
1081
+ return {SinCos, LoadCos, SinCos};
1107
1082
}
1108
1083
1109
1084
// fold sin, cos -> sincos.
@@ -1121,33 +1096,92 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
1121
1096
1122
1097
Value *CArgVal = FPOp->getOperand (0 );
1123
1098
CallInst *CI = cast<CallInst>(FPOp);
1124
- bool Changed = false ;
1125
1099
1100
+ Function *F = B.GetInsertBlock ()->getParent ();
1101
+ Module *M = F->getParent ();
1102
+
1103
+ // Merge the sin and cos.
1104
+
1105
+ // For OpenCL 2.0 we only have a generic implementation of the sincos
1106
+ // function.
1107
+ // FIXME: This is not true anymore
1108
+ AMDGPULibFunc SinCosLibFunc (AMDGPULibFunc::EI_SINCOS, fInfo );
1109
+ SinCosLibFunc.getLeads ()[0 ].PtrKind =
1110
+ AMDGPULibFunc::getEPtrKindFromAddrSpace (AMDGPUAS::FLAT_ADDRESS);
1111
+ FunctionCallee FSinCos = getFunction (M, SinCosLibFunc);
1112
+ if (!FSinCos)
1113
+ return false ;
1114
+
1115
+ SmallVector<CallInst *> SinCalls;
1116
+ SmallVector<CallInst *> CosCalls;
1117
+ SmallVector<CallInst *> SinCosCalls;
1126
1118
FuncInfo PartnerInfo (isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
1127
1119
fInfo );
1128
1120
const std::string PairName = PartnerInfo.mangle ();
1129
1121
1130
- CallInst *UI = nullptr ;
1122
+ StringRef SinName = isSin ? CI->getCalledFunction ()->getName () : PairName;
1123
+ StringRef CosName = isSin ? PairName : CI->getCalledFunction ()->getName ();
1124
+ const std::string SinCosName = SinCosLibFunc.mangle ();
1125
+
1126
+ // Intersect the fast-math flags of this call with those of every matching call.
1127
+ FastMathFlags FMF = FPOp->getFastMathFlags ();
1128
+ MDNode *FPMath = CI->getMetadata (LLVMContext::MD_fpmath);
1129
+
1130
+ SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc ()};
1131
1131
1132
- // TODO: Handle repeated uses, the generic implementation does.
1133
1132
for (User* U : CArgVal->users ()) {
1134
1133
CallInst *XI = dyn_cast<CallInst>(U);
1135
- if (!XI || XI->isNoBuiltin ())
1134
+ if (!XI || XI->getFunction () != F || XI-> isNoBuiltin ())
1136
1135
continue ;
1137
1136
1138
1137
Function *UCallee = XI->getCalledFunction ();
1139
- if (UCallee && UCallee->getName ().equals (PairName))
1140
- UI = XI;
1141
- else if (UI)
1142
- return Changed;
1138
+ if (!UCallee)
1139
+ continue ;
1140
+
1141
+ bool Handled = true ;
1142
+
1143
+ if (UCallee->getName () == SinName)
1144
+ SinCalls.push_back (XI);
1145
+ else if (UCallee->getName () == CosName)
1146
+ CosCalls.push_back (XI);
1147
+ else if (UCallee->getName () == SinCosName)
1148
+ SinCosCalls.push_back (XI);
1149
+ else
1150
+ Handled = false ;
1151
+
1152
+ if (Handled) {
1153
+ MergeDbgLocs.push_back (XI->getDebugLoc ());
1154
+ auto *OtherOp = cast<FPMathOperator>(XI);
1155
+ FMF &= OtherOp->getFastMathFlags ();
1156
+ FPMath = MDNode::getMostGenericFPMath (
1157
+ FPMath, XI->getMetadata (LLVMContext::MD_fpmath));
1158
+ }
1143
1159
}
1144
1160
1145
- if (!UI)
1146
- return Changed;
1161
+ if (SinCalls.empty () || CosCalls.empty ())
1162
+ return false ;
1163
+
1164
+ B.setFastMathFlags (FMF);
1165
+ B.setDefaultFPMathTag (FPMath);
1166
+ DILocation *DbgLoc = DILocation::getMergedLocations (MergeDbgLocs);
1167
+ B.SetCurrentDebugLocation (DbgLoc);
1168
+
1169
+ auto [Sin, Cos, SinCos] = insertSinCos (CArgVal, FMF, B, FSinCos);
1170
+
1171
+ auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
1172
+ for (CallInst *C : Calls)
1173
+ C->replaceAllUsesWith (Res);
1174
+
1175
+ // Leave the other dead instructions to avoid clobbering iterators.
1176
+ };
1147
1177
1148
- CallInst *Sin = isSin ? CI : UI;
1149
- CallInst *Cos = isSin ? UI : CI;
1150
- return insertSinCos (Sin, Cos, B, fInfo ) || Changed;
1178
+ replaceTrigInsts (SinCalls, Sin);
1179
+ replaceTrigInsts (CosCalls, Cos);
1180
+ replaceTrigInsts (SinCosCalls, SinCos);
1181
+
1182
+ // It's safe to delete the original now.
1183
+ CI->eraseFromParent ();
1184
+ return true ;
1151
1185
}
1152
1186
1153
1187
bool AMDGPULibCalls::evaluateScalarMathFunc (const FuncInfo &FInfo,
0 commit comments