Skip to content

Commit 193b4bc

Browse files
authored
Correct some cuda/sycl accuracy from 0.0 to 0.5 and add related handling. (#12213)
Several functions for "sycl" and "cuda" require "0.5" accuracy levels, which means correctly rounded results. For now the x86 host AltMathLibrary doesn't have such ability. For that accuracy level, the fpbuiltins should be replaced by equivalent IR operations or LLVM builtins.
1 parent a586269 commit 193b4bc

File tree

4 files changed

+289
-8
lines changed

4 files changed

+289
-8
lines changed

clang/test/CodeGen/fp-accuracy.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -475,7 +475,7 @@ void f1(float a, float b) {
475475
// CHECK-SPIR: attributes #[[ATTR_SYCL3]] = {{.*}}"fpbuiltin-max-error"="6.0"
476476
// CHECK-SPIR: attributes #[[ATTR_SYCL4]] = {{.*}}"fpbuiltin-max-error"="16.0"
477477
// CHECK-SPIR: attributes #[[ATTR_SYCL5]] = {{.*}}"fpbuiltin-max-error"="3.0"
478-
// CHECK-SPIR: attributes #[[ATTR_SYCL6]] = {{.*}}"fpbuiltin-max-error"="0.0"
478+
// CHECK-SPIR: attributes #[[ATTR_SYCL6]] = {{.*}}"fpbuiltin-max-error"="0.5"
479479
// CHECK-SPIR: attributes #[[ATTR_SYCL7]] = {{.*}}"fpbuiltin-max-error"="2.5"
480480
// CHECK-SPIR: attributes #[[ATTR_SYCL8]] = {{.*}}"fpbuiltin-max-error"="2.0"
481481

llvm/include/llvm/IR/FPAccuracy.def

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@
2727
//
2828
// FP_ACCURACY(<IID>, <SYCL_FLOAT>, <SYCL_DOUBLE>, <CUDA_FLOAT>, <CUDA_DOUBLE>)
2929
//
30-
FP_ACCURACY(fpbuiltin_fadd, "0.0", "0.0", "0.0", "0.0")
31-
FP_ACCURACY(fpbuiltin_fsub, "0.0", "0.0", "0.0", "0.0")
32-
FP_ACCURACY(fpbuiltin_fmul, "0.0", "0.0", "0.0", "0.0")
33-
FP_ACCURACY(fpbuiltin_fdiv, "2.5", "0.0", "2.0", "0.0")
34-
FP_ACCURACY(fpbuiltin_frem, "0.0", "0.0", "0.0", "0.0")
30+
FP_ACCURACY(fpbuiltin_fadd, "0.5", "0.5", "0.5", "0.5")
31+
FP_ACCURACY(fpbuiltin_fsub, "0.5", "0.5", "0.5", "0.5")
32+
FP_ACCURACY(fpbuiltin_fmul, "0.5", "0.5", "0.5", "0.5")
33+
FP_ACCURACY(fpbuiltin_fdiv, "2.5", "0.5", "2.0", "0.5")
34+
FP_ACCURACY(fpbuiltin_frem, "0.5", "0.5", "0.5", "0.5")
3535
FP_ACCURACY(fpbuiltin_sin, "4.0", "4.0", "2.0", "2.0")
3636
FP_ACCURACY(fpbuiltin_cos, "4.0", "4.0", "2.0", "2.0")
3737
FP_ACCURACY(fpbuiltin_tan, "5.0", "5.0", "4.0", "2.0")
@@ -55,8 +55,8 @@ FP_ACCURACY(fpbuiltin_log10, "3.0", "3.0", "2.0", "1.0")
5555
FP_ACCURACY(fpbuiltin_log1p, "2.0", "2.0", "1.0", "1.0")
5656
FP_ACCURACY(fpbuiltin_hypot, "4.0", "4.0", "3.0", "2.0")
5757
FP_ACCURACY(fpbuiltin_pow, "16.0", "16.0", "8.0", "2.0")
58-
FP_ACCURACY(fpbuiltin_ldexp, "0.0", "0.0", "0.0", "0.0")
59-
FP_ACCURACY(fpbuiltin_sqrt, "2.5", "0.0", "2.0", "0.0")
58+
FP_ACCURACY(fpbuiltin_ldexp, "0.5", "0.5", "0.5", "0.5")
59+
FP_ACCURACY(fpbuiltin_sqrt, "2.5", "0.5", "2.0", "0.5")
6060
FP_ACCURACY(fpbuiltin_rsqrt, "2.0", "2.0", "2.0", "1.0")
6161
FP_ACCURACY(fpbuiltin_erf, "16.0", "16.0", "2.0", "2.0")
6262
FP_ACCURACY(fpbuiltin_erfc, "16.0", "16.0", "4.0", "5.0")

llvm/lib/CodeGen/FPBuiltinFnSelection.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,48 @@ static bool replaceWithAltMathFunction(FPBuiltinIntrinsic &BuiltinCall,
6363
return true;
6464
}
6565

66+
/// Replace a fpbuiltin intrinsic call that requires correctly rounded
/// results (max-error 0.5 ulp) with the equivalent LLVM IR operation or
/// LLVM intrinsic, since the alternate math library cannot provide that
/// accuracy level on this target.
///
/// \param BuiltinCall the llvm.fpbuiltin.* call to replace. The call's
///        uses are rewired to the replacement value; the caller remains
///        responsible for erasing the now-dead intrinsic call.
/// \returns true, indicating the IR was changed.
static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
  IRBuilder<> IRBuilder(&BuiltinCall);
  SmallVector<Value *> Args(BuiltinCall.args());
  Value *Replacement = nullptr;
  switch (BuiltinCall.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic");
  case Intrinsic::fpbuiltin_fadd:
    Replacement = IRBuilder.CreateFAdd(Args[0], Args[1]);
    break;
  case Intrinsic::fpbuiltin_fsub:
    Replacement = IRBuilder.CreateFSub(Args[0], Args[1]);
    break;
  case Intrinsic::fpbuiltin_fmul:
    Replacement = IRBuilder.CreateFMul(Args[0], Args[1]);
    break;
  case Intrinsic::fpbuiltin_fdiv:
    Replacement = IRBuilder.CreateFDiv(Args[0], Args[1]);
    break;
  case Intrinsic::fpbuiltin_frem:
    Replacement = IRBuilder.CreateFRem(Args[0], Args[1]);
    break;
  case Intrinsic::fpbuiltin_sqrt:
    // No plain IR instruction exists for sqrt; use the llvm.sqrt intrinsic.
    Replacement =
        IRBuilder.CreateIntrinsic(BuiltinCall.getType(), Intrinsic::sqrt, Args);
    break;
  case Intrinsic::fpbuiltin_ldexp:
    // Likewise ldexp maps onto the llvm.ldexp intrinsic.
    Replacement = IRBuilder.CreateIntrinsic(BuiltinCall.getType(),
                                            Intrinsic::ldexp, Args);
    break;
  }
  BuiltinCall.replaceAllUsesWith(Replacement);
  // Carry the original call's fast-math flags over to the replacement.
  // IRBuilder may constant-fold the arithmetic to a Constant, in which
  // case there is no instruction to annotate.
  if (auto *ReplInst = dyn_cast<Instruction>(Replacement))
    ReplInst->copyFastMathFlags(&BuiltinCall);
  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
                    << BuiltinCall.getCalledFunction()->getName()
                    << "` with equivalent IR.\n");
  return true;
}
107+
66108
static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
67109
FPBuiltinIntrinsic &BuiltinCall) {
68110
LLVM_DEBUG({
@@ -83,6 +125,26 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
83125
return false;
84126
}
85127

128+
Triple T(BuiltinCall.getModule()->getTargetTriple());
129+
// Several functions for "sycl" and "cuda" require "0.5" accuracy levels,
130+
// which means correctly rounded results. For now x86 host AltMathLibrary
131+
// doesn't have such ability. For such accuracy level, the fpbuiltins
132+
// should be replaced by equivalent IR operation or llvmbuiltins.
133+
if (T.isX86() && BuiltinCall.getRequiredAccuracy().value() == 0.5) {
134+
switch (BuiltinCall.getIntrinsicID()) {
135+
case Intrinsic::fpbuiltin_fadd:
136+
case Intrinsic::fpbuiltin_fsub:
137+
case Intrinsic::fpbuiltin_fmul:
138+
case Intrinsic::fpbuiltin_fdiv:
139+
case Intrinsic::fpbuiltin_frem:
140+
case Intrinsic::fpbuiltin_sqrt:
141+
case Intrinsic::fpbuiltin_ldexp:
142+
return replaceWithLLVMIR(BuiltinCall);
143+
default:
144+
report_fatal_error("Unexpected fpbuiltin requiring 0.5 max error.");
145+
}
146+
}
147+
86148
/// Call TLI to select a function implementation to call
87149
StringRef ImplName = TLI.selectFPBuiltinImplementation(&BuiltinCall);
88150
if (ImplName.empty()) {
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
; RUN: opt -alt-math-library=svml -fpbuiltin-fn-selection -S < %s | FileCheck %s
2+
3+
; Several functions for "sycl" and "cuda" require "0.5" accuracy levels,
4+
; Test if these fpbuiltins could be replaced by equivalent IR operations
5+
; or llvm builtins.
6+
7+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
8+
target triple = "x86_64-unknown-linux-gnu"
9+
10+
; CHECK-LABEL: @svml_fadd
11+
; CHECK: %0 = fadd fast float %f1, %f2
12+
; CHECK: %1 = fadd fast <4 x float> %v4f1, %v4f2
13+
; CHECK: %2 = fadd fast <8 x float> %v8f1, %v8f2
14+
; CHECK: %3 = fadd fast <16 x float> %v16f1, %v16f2
15+
; CHECK: %4 = fadd fast double %d1, %d2
16+
; CHECK: %5 = fadd fast <2 x double> %v2d1, %v2d2
17+
; CHECK: %6 = fadd fast <4 x double> %v4d1, %v4d2
18+
; CHECK: %7 = fadd fast <8 x double> %v8d1, %v8d2
19+
define void @svml_fadd(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
20+
float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
21+
double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
22+
double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
23+
entry:
24+
%t0_0 = call fast float @llvm.fpbuiltin.fadd.f32(float %f1, float %f2) #0
25+
%t1_0 = call fast <4 x float> @llvm.fpbuiltin.fadd.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
26+
%t2_0 = call fast <8 x float> @llvm.fpbuiltin.fadd.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
27+
%t3_0 = call fast <16 x float> @llvm.fpbuiltin.fadd.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
28+
%t4_0 = call fast double @llvm.fpbuiltin.fadd.f64(double %d1, double %d2) #0
29+
%t5_0 = call fast <2 x double> @llvm.fpbuiltin.fadd.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
30+
%t6_0 = call fast <4 x double> @llvm.fpbuiltin.fadd.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
31+
%t7_0 = call fast <8 x double> @llvm.fpbuiltin.fadd.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
32+
ret void
33+
}
34+
35+
declare float @llvm.fpbuiltin.fadd.f32(float, float)
36+
declare <4 x float> @llvm.fpbuiltin.fadd.v4f32(<4 x float>, <4 x float>)
37+
declare <8 x float> @llvm.fpbuiltin.fadd.v8f32(<8 x float>, <8 x float>)
38+
declare <16 x float> @llvm.fpbuiltin.fadd.v16f32(<16 x float>, <16 x float>)
39+
declare double @llvm.fpbuiltin.fadd.f64(double, double)
40+
declare <2 x double> @llvm.fpbuiltin.fadd.v2f64(<2 x double>, <2 x double>)
41+
declare <4 x double> @llvm.fpbuiltin.fadd.v4f64(<4 x double>, <4 x double>)
42+
declare <8 x double> @llvm.fpbuiltin.fadd.v8f64(<8 x double>, <8 x double>)
43+
44+
; CHECK-LABEL: @svml_fsub
45+
; CHECK: %0 = fsub fast float %f1, %f2
46+
; CHECK: %1 = fsub fast <4 x float> %v4f1, %v4f2
47+
; CHECK: %2 = fsub fast <8 x float> %v8f1, %v8f2
48+
; CHECK: %3 = fsub fast <16 x float> %v16f1, %v16f2
49+
; CHECK: %4 = fsub fast double %d1, %d2
50+
; CHECK: %5 = fsub fast <2 x double> %v2d1, %v2d2
51+
; CHECK: %6 = fsub fast <4 x double> %v4d1, %v4d2
52+
; CHECK: %7 = fsub fast <8 x double> %v8d1, %v8d2
53+
define void @svml_fsub(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
54+
float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
55+
double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
56+
double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
57+
entry:
58+
%t0_0 = call fast float @llvm.fpbuiltin.fsub.f32(float %f1, float %f2) #0
59+
%t1_0 = call fast <4 x float> @llvm.fpbuiltin.fsub.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
60+
%t2_0 = call fast <8 x float> @llvm.fpbuiltin.fsub.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
61+
%t3_0 = call fast <16 x float> @llvm.fpbuiltin.fsub.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
62+
%t4_0 = call fast double @llvm.fpbuiltin.fsub.f64(double %d1, double %d2) #0
63+
%t5_0 = call fast <2 x double> @llvm.fpbuiltin.fsub.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
64+
%t6_0 = call fast <4 x double> @llvm.fpbuiltin.fsub.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
65+
%t7_0 = call fast <8 x double> @llvm.fpbuiltin.fsub.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
66+
ret void
67+
}
68+
69+
declare float @llvm.fpbuiltin.fsub.f32(float, float)
70+
declare <4 x float> @llvm.fpbuiltin.fsub.v4f32(<4 x float>, <4 x float>)
71+
declare <8 x float> @llvm.fpbuiltin.fsub.v8f32(<8 x float>, <8 x float>)
72+
declare <16 x float> @llvm.fpbuiltin.fsub.v16f32(<16 x float>, <16 x float>)
73+
declare double @llvm.fpbuiltin.fsub.f64(double, double)
74+
declare <2 x double> @llvm.fpbuiltin.fsub.v2f64(<2 x double>, <2 x double>)
75+
declare <4 x double> @llvm.fpbuiltin.fsub.v4f64(<4 x double>, <4 x double>)
76+
declare <8 x double> @llvm.fpbuiltin.fsub.v8f64(<8 x double>, <8 x double>)
77+
78+
; CHECK-LABEL: @svml_fmul
79+
; CHECK: %0 = fmul fast float %f1, %f2
80+
; CHECK: %1 = fmul fast <4 x float> %v4f1, %v4f2
81+
; CHECK: %2 = fmul fast <8 x float> %v8f1, %v8f2
82+
; CHECK: %3 = fmul fast <16 x float> %v16f1, %v16f2
83+
; CHECK: %4 = fmul fast double %d1, %d2
84+
; CHECK: %5 = fmul fast <2 x double> %v2d1, %v2d2
85+
; CHECK: %6 = fmul fast <4 x double> %v4d1, %v4d2
86+
; CHECK: %7 = fmul fast <8 x double> %v8d1, %v8d2
87+
define void @svml_fmul(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
88+
float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
89+
double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
90+
double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
91+
entry:
92+
%t0_0 = call fast float @llvm.fpbuiltin.fmul.f32(float %f1, float %f2) #0
93+
%t1_0 = call fast <4 x float> @llvm.fpbuiltin.fmul.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
94+
%t2_0 = call fast <8 x float> @llvm.fpbuiltin.fmul.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
95+
%t3_0 = call fast <16 x float> @llvm.fpbuiltin.fmul.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
96+
%t4_0 = call fast double @llvm.fpbuiltin.fmul.f64(double %d1, double %d2) #0
97+
%t5_0 = call fast <2 x double> @llvm.fpbuiltin.fmul.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
98+
%t6_0 = call fast <4 x double> @llvm.fpbuiltin.fmul.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
99+
%t7_0 = call fast <8 x double> @llvm.fpbuiltin.fmul.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
100+
ret void
101+
}
102+
103+
declare float @llvm.fpbuiltin.fmul.f32(float, float)
104+
declare <4 x float> @llvm.fpbuiltin.fmul.v4f32(<4 x float>, <4 x float>)
105+
declare <8 x float> @llvm.fpbuiltin.fmul.v8f32(<8 x float>, <8 x float>)
106+
declare <16 x float> @llvm.fpbuiltin.fmul.v16f32(<16 x float>, <16 x float>)
107+
declare double @llvm.fpbuiltin.fmul.f64(double, double)
108+
declare <2 x double> @llvm.fpbuiltin.fmul.v2f64(<2 x double>, <2 x double>)
109+
declare <4 x double> @llvm.fpbuiltin.fmul.v4f64(<4 x double>, <4 x double>)
110+
declare <8 x double> @llvm.fpbuiltin.fmul.v8f64(<8 x double>, <8 x double>)
111+
112+
; CHECK-LABEL: @svml_fdiv
113+
; CHECK: %0 = fdiv fast double %d1, %d2
114+
; CHECK: %1 = fdiv fast <2 x double> %v2d1, %v2d2
115+
; CHECK: %2 = fdiv fast <4 x double> %v4d1, %v4d2
116+
; CHECK: %3 = fdiv fast <8 x double> %v8d1, %v8d2
117+
define void @svml_fdiv(double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
118+
double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
119+
entry:
120+
%t0_0 = call fast double @llvm.fpbuiltin.fdiv.f64(double %d1, double %d2) #0
121+
%t1_0 = call fast <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
122+
%t2_0 = call fast <4 x double> @llvm.fpbuiltin.fdiv.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
123+
%t3_0 = call fast <8 x double> @llvm.fpbuiltin.fdiv.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
124+
ret void
125+
}
126+
127+
declare double @llvm.fpbuiltin.fdiv.f64(double, double)
128+
declare <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double>, <2 x double>)
129+
declare <4 x double> @llvm.fpbuiltin.fdiv.v4f64(<4 x double>, <4 x double>)
130+
declare <8 x double> @llvm.fpbuiltin.fdiv.v8f64(<8 x double>, <8 x double>)
131+
132+
; CHECK-LABEL: @svml_frem
133+
; CHECK: %0 = frem fast float %f1, %f2
134+
; CHECK: %1 = frem fast <4 x float> %v4f1, %v4f2
135+
; CHECK: %2 = frem fast <8 x float> %v8f1, %v8f2
136+
; CHECK: %3 = frem fast <16 x float> %v16f1, %v16f2
137+
; CHECK: %4 = frem fast double %d1, %d2
138+
; CHECK: %5 = frem fast <2 x double> %v2d1, %v2d2
139+
; CHECK: %6 = frem fast <4 x double> %v4d1, %v4d2
140+
; CHECK: %7 = frem fast <8 x double> %v8d1, %v8d2
141+
define void @svml_frem(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
142+
float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
143+
double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
144+
double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
145+
entry:
146+
%t0_0 = call fast float @llvm.fpbuiltin.frem.f32(float %f1, float %f2) #0
147+
%t1_0 = call fast <4 x float> @llvm.fpbuiltin.frem.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
148+
%t2_0 = call fast <8 x float> @llvm.fpbuiltin.frem.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
149+
%t3_0 = call fast <16 x float> @llvm.fpbuiltin.frem.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
150+
%t4_0 = call fast double @llvm.fpbuiltin.frem.f64(double %d1, double %d2) #0
151+
%t5_0 = call fast <2 x double> @llvm.fpbuiltin.frem.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
152+
%t6_0 = call fast <4 x double> @llvm.fpbuiltin.frem.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
153+
%t7_0 = call fast <8 x double> @llvm.fpbuiltin.frem.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
154+
ret void
155+
}
156+
157+
declare float @llvm.fpbuiltin.frem.f32(float, float)
158+
declare <4 x float> @llvm.fpbuiltin.frem.v4f32(<4 x float>, <4 x float>)
159+
declare <8 x float> @llvm.fpbuiltin.frem.v8f32(<8 x float>, <8 x float>)
160+
declare <16 x float> @llvm.fpbuiltin.frem.v16f32(<16 x float>, <16 x float>)
161+
declare double @llvm.fpbuiltin.frem.f64(double, double)
162+
declare <2 x double> @llvm.fpbuiltin.frem.v2f64(<2 x double>, <2 x double>)
163+
declare <4 x double> @llvm.fpbuiltin.frem.v4f64(<4 x double>, <4 x double>)
164+
declare <8 x double> @llvm.fpbuiltin.frem.v8f64(<8 x double>, <8 x double>)
165+
166+
; CHECK-LABEL: @svml_sqrt
167+
; CHECK: %0 = call double @llvm.sqrt.f64(double %d)
168+
; CHECK: %1 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %v2d)
169+
; CHECK: %2 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %v4d)
170+
; CHECK: %3 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %v8d)
171+
define void @svml_sqrt(double %d, <2 x double> %v2d, <4 x double> %v4d, <8 x double> %v8d) {
172+
entry:
173+
%t4_0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #0
174+
%t5_0 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #0
175+
%t6_0 = call <4 x double> @llvm.fpbuiltin.sqrt.v4f64(<4 x double> %v4d) #0
176+
%t7_0 = call <8 x double> @llvm.fpbuiltin.sqrt.v8f64(<8 x double> %v8d) #0
177+
ret void
178+
}
179+
180+
declare double @llvm.fpbuiltin.sqrt.f64(double)
181+
declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>)
182+
declare <4 x double> @llvm.fpbuiltin.sqrt.v4f64(<4 x double>)
183+
declare <8 x double> @llvm.fpbuiltin.sqrt.v8f64(<8 x double>)
184+
185+
; CHECK-LABEL: @svml_ldexp
186+
; CHECK: %0 = call fast float @llvm.ldexp.f32.i32(float %f1, i32 %f2)
187+
; CHECK: %1 = call fast <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %v4f1, <4 x i32> %v4f2)
188+
; CHECK: %2 = call fast <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %v8f1, <8 x i32> %v8f2)
189+
; CHECK: %3 = call fast <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> %v16f1, <16 x i32> %v16f2)
190+
; CHECK: %4 = call fast double @llvm.ldexp.f64.i32(double %d1, i32 %d2)
191+
; CHECK: %5 = call fast <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %v2d1, <2 x i32> %v2d2)
192+
; CHECK: %6 = call fast <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %v4d1, <4 x i32> %v4d2)
193+
; CHECK: %7 = call fast <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> %v8d1, <8 x i32> %v8d2)
194+
define void @svml_ldexp(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
195+
i32 %f2, <4 x i32> %v4f2, <8 x i32> %v8f2, <16 x i32> %v16f2,
196+
double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
197+
i32 %d2, <2 x i32> %v2d2, <4 x i32> %v4d2, <8 x i32> %v8d2) {
198+
entry:
199+
%t0_0 = call fast float @llvm.fpbuiltin.ldexp.f32.i32(float %f1, i32 %f2) #0
200+
%t1_0 = call fast <4 x float> @llvm.fpbuiltin.ldexp.v4f32.v4i32(<4 x float> %v4f1, <4 x i32> %v4f2) #0
201+
%t2_0 = call fast <8 x float> @llvm.fpbuiltin.ldexp.v8f32.v8i32(<8 x float> %v8f1, <8 x i32> %v8f2) #0
202+
%t3_0 = call fast <16 x float> @llvm.fpbuiltin.ldexp.v16f32.v16i32(<16 x float> %v16f1, <16 x i32> %v16f2) #0
203+
%t4_0 = call fast double @llvm.fpbuiltin.ldexp.f64.i32(double %d1, i32 %d2) #0
204+
%t5_0 = call fast <2 x double> @llvm.fpbuiltin.ldexp.v2f64.v2i32(<2 x double> %v2d1, <2 x i32> %v2d2) #0
205+
%t6_0 = call fast <4 x double> @llvm.fpbuiltin.ldexp.v4f64.v4i32(<4 x double> %v4d1, <4 x i32> %v4d2) #0
206+
%t7_0 = call fast <8 x double> @llvm.fpbuiltin.ldexp.v8f64.v8i32(<8 x double> %v8d1, <8 x i32> %v8d2) #0
207+
ret void
208+
}
209+
210+
declare float @llvm.fpbuiltin.ldexp.f32.i32(float, i32)
211+
declare <4 x float> @llvm.fpbuiltin.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
212+
declare <8 x float> @llvm.fpbuiltin.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>)
213+
declare <16 x float> @llvm.fpbuiltin.ldexp.v16f32.v16i32(<16 x float>, <16 x i32>)
214+
declare double @llvm.fpbuiltin.ldexp.f64.i32(double, i32)
215+
declare <2 x double> @llvm.fpbuiltin.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>)
216+
declare <4 x double> @llvm.fpbuiltin.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>)
217+
declare <8 x double> @llvm.fpbuiltin.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>)
218+
219+
attributes #0 = { "fpbuiltin-max-error"="0.5" }

0 commit comments

Comments
 (0)