Adding more vector calls for -fveclib=AMDLIBM #109662
Conversation
Change-Id: Iba8d1ebd0cbad09d9f6ebc4215cc4467239420b5
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-llvm-analysis

Author: Rohit Aggarwal (rohitaggarwal007)

Changes: Hi, AMD has its own implementation of vector calls. New vector calls are introduced in the library.

Patch is 32.00 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/109662.diff

4 Files Affected:
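For context on what the mappings below enable: each TLI_DEFINE_VECFUNC entry pairs a scalar libm function or LLVM intrinsic with an AMD LIBM vector variant at a fixed vectorization factor. A minimal sketch of source the loop vectorizer can now target, assuming clang is invoked with -O3 -ffast-math -fveclib=AMDLIBM (the function name is illustrative, not from the patch):

```cpp
// Sketch only: with -fveclib=AMDLIBM, the loop vectorizer is expected to
// rewrite these scalar calls to AMD LIBM vector entry points, e.g.
// log10 -> amd_vrd2_log10 at VF=2 per the new mappings in this patch.
#include <cmath>

void compute_log10(const double *in, double *out, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = std::log10(in[i]); // scalar call that can be widened
}
```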
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 532a3ca334b1ae..06024131c5bdac 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -1317,14 +1317,17 @@ TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd2_log2", FIXED(2), NOMASK, "_ZGV_LLV
TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd4_log2", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd8_log2", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("log10", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("log10f", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("log10f", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("log10f", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log10_finite", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
@@ -1339,6 +1342,12 @@ TLI_DEFINE_VECFUNC("erf", "amd_vrd8_erf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("exp10", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("exp10f", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__exp10_finite", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__exp10f_finite", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("llvm.exp10.f64", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp10.f32", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
TLI_DEFINE_VECFUNC("expm1", "amd_vrd2_expm1", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("expm1f", "amd_vrs4_expm1f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
@@ -1369,10 +1378,19 @@ TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LL
TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("__asin_finite", "amd_vrd8_asin", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
TLI_DEFINE_VECFUNC("acosf", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("acosf", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("acosf", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
@@ -1410,6 +1428,12 @@ TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs16_tanhf", FIXED(16), NOMASK, "_ZGV_
TLI_DEFINE_VECFUNC("cbrt", "amd_vrd2_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("cbrtf", "amd_vrs4_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sincos", "amd_vrd4_sincos", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl8l8")
+TLI_DEFINE_VECFUNC("sincos", "amd_vrd8_sincos", FIXED(8), NOMASK, "_ZGV_LLVM_N8vl8l8")
+
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs4_sincosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4l4")
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs8_sincosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8vl4l4")
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs16_sincosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16vl4l4")
#else
#error "Must choose which vector library functions are to be defined."
#endif
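A note on the sincos entries in this hunk: the 'l8l8' / 'l4l4' suffixes in the VFABI strings mark the two output pointers as linear, with a per-lane stride equal to the element size in bytes (8 for double, 4 for float). A hedged sketch of the signature pair being mapped; the vector prototype is inferred from the mangling, not copied from AOCL-LibM headers:

```cpp
// Shapes inferred from _ZGV_LLVM_N4vl8l8; the amd_vrd4_sincos prototype is
// an assumption for illustration, not taken from AOCL-LibM headers.
#include <immintrin.h>

extern "C" {
// GNU scalar sincos (glibc extension): sin(x) -> *s, cos(x) -> *c.
void sincos(double x, double *s, double *c);
// Presumed 4-wide variant: lane i of x yields s[i] and c[i]
// (linear pointers advancing 8 bytes per lane, per 'l8l8').
void amd_vrd4_sincos(__m256d x, double *s, double *c);
}
```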
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
index 54bb9352f3c89c..9899eded738086 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
@@ -7,12 +7,10 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-declare float @__expf_finite(float) #0
-
+define void @exp_f32(ptr nocapture %varray) {
; CHECK-LABEL: @exp_f32
; CHECK: <4 x float> @amd_vrs4_expf
; CHECK: ret
-define void @exp_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -25,23 +23,16 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__exp_finite(double) #0
-
+define void @exp_f64(ptr nocapture %varray) {
; CHECK-LABEL: @exp_f64
; CHECK: <4 x double> @amd_vrd4_exp
; CHECK: ret
-define void @exp_f64(ptr nocapture %varray) {
entry:
br label %for.body
@@ -54,25 +45,16 @@ for.body: ; preds = %for.body, %entry
store double %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !11
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!11 = distinct !{!11, !12, !13}
-!12 = !{!"llvm.loop.vectorize.width", i32 4}
-!13 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-
-
-declare float @__logf_finite(float) #0
-
+define void @log_f32(ptr nocapture %varray) {
; CHECK-LABEL: @log_f32
; CHECK: <4 x float> @amd_vrs4_logf
; CHECK: ret
-define void @log_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -85,23 +67,16 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!21 = distinct !{!21, !22, !23}
-!22 = !{!"llvm.loop.vectorize.width", i32 4}
-!23 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__log_finite(double) #0
-
+define void @log_f64(ptr nocapture %varray) {
; CHECK-LABEL: @log_f64
; CHECK: <4 x double> @amd_vrd4_log
; CHECK: ret
-define void @log_f64(ptr nocapture %varray) {
entry:
br label %for.body
@@ -114,23 +89,16 @@ for.body: ; preds = %for.body, %entry
store double %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!31 = distinct !{!31, !32, !33}
-!32 = !{!"llvm.loop.vectorize.width", i32 4}
-!33 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare float @__powf_finite(float, float) #0
-
+define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) {
; CHECK-LABEL: @pow_f32
; CHECK: <4 x float> @amd_vrs4_powf
; CHECK: ret
-define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) {
entry:
br label %for.body
@@ -145,23 +113,16 @@ for.body: ; preds = %for.body, %entry
store float %tmp2, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !41
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!41 = distinct !{!41, !42, !43}
-!42 = !{!"llvm.loop.vectorize.width", i32 4}
-!43 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__pow_finite(double, double) #0
-
+define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) {
; CHECK-LABEL: @pow_f64
; CHECK: <4 x double> @amd_vrd4_pow
; CHECK: ret
-define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) {
entry:
br label %for.body
@@ -176,18 +137,12 @@ for.body: ; preds = %for.body, %entry
store double %tmp2, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !51
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!51 = distinct !{!51, !52, !53}
-!52 = !{!"llvm.loop.vectorize.width", i32 4}
-!53 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__exp2f_finite(float) #0
-
define void @exp2f_finite(ptr nocapture %varray) {
; CHECK-LABEL: @exp2f_finite(
; CHECK: call <4 x float> @amd_vrs4_exp2f(<4 x float> %{{.*}})
@@ -205,18 +160,12 @@ for.body:
store float %call, ptr %arrayidx, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !61
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end:
ret void
}
-!61 = distinct !{!61, !62, !63}
-!62 = !{!"llvm.loop.vectorize.width", i32 4}
-!63 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare double @__exp2_finite(double) #0
-
define void @exp2_finite(ptr nocapture %varray) {
; CHECK-LABEL: @exp2_finite(
; CHECK: call <4 x double> @amd_vrd4_exp2(<4 x double> {{.*}})
@@ -234,22 +183,16 @@ for.body:
store double %call, ptr %arrayidx, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !71
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end:
ret void
}
-!71 = distinct !{!71, !72, !73}
-!72 = !{!"llvm.loop.vectorize.width", i32 4}
-!73 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__log2f_finite(float) #0
-
+define void @log2_f32(ptr nocapture %varray) {
; CHECK-LABEL: @log2_f32
; CHECK: <4 x float> @amd_vrs4_log2f
; CHECK: ret
-define void @log2_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -262,23 +205,16 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!81 = distinct !{!21, !22, !23}
-!82 = !{!"llvm.loop.vectorize.width", i32 4}
-!83 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__log2_finite(double) #0
-
+define void @log2_f64(ptr nocapture %varray) {
; CHECK-LABEL: @log2_f64
; CHECK: <4 x double> @amd_vrd4_log2
; CHECK: ret
-define void @log2_f64(ptr nocapture %varray) {
entry:
br label %for.body
@@ -291,22 +227,16 @@ for.body: ; preds = %for.body, %entry
store double %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!91 = distinct !{!31, !32, !33}
-!92 = !{!"llvm.loop.vectorize.width", i32 4}
-!93 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__log10f_finite(float) #0
-
+define void @log10_f32(ptr nocapture %varray) {
; CHECK-LABEL: @log10_f32
; CHECK: <4 x float> @amd_vrs4_log10f
; CHECK: ret
-define void @log10_f32(ptr nocapture %varray) {
entry:
br label %for.body
@@ -319,14 +249,173 @@ for.body: ; preds = %for.body, %entry
store float %call, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @log10_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_finite(
+; CHECK: call <2 x double> @amd_vrd2_log10(<2 x double> {{.*}})
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @__log10_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret void
+}
+
+define void @exp10_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_finite(
+; CHECK: call <2 x double> @amd_vrd2_exp10(<2 x double> {{.*}})
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @__exp10_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret void
+}
+
+define void @exp10_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f32
+; CHECK: <4 x float> @amd_vrs4_exp10f
+; CHECK: ret
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__exp10f_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+ store float %call, ptr %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
for.end: ; preds = %for.body
ret void
}
-!101 = distinct !{!21, !22, !23}
-!102 = !{!"llvm.loop.vectorize.width", i32 4}
-!103 = !{!"llvm.loop.vectorize.enable", i1 true}
+define void @asin_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @asin_finite(
+; CHECK: call <8 x double> @amd_vrd8_asin(<8 x double> {{.*}})
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @__asin_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !7
+
+for.end:
+ ret void
+}
+define void @asinf_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @asinf_finite
+; CHECK: <4 x float> @amd_vrs4_asinf
+; CHECK: ret
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__asinf_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+ store float %call, ptr %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @acosf_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @acosf_finite
+; CHECK: <4 x float> @amd_vrs4_acosf
+; CHECK: ret
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__acosf_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+ store float %call, ptr %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!4 = distinct !{!4, !5, !6}
+!5 = !{!"llvm.loop.vectorize.width", i32 4}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!7 = distinct !{!7, !8, !9}
+!8 = !{!"llvm.loop.vectorize.width", i32 8}
+!9 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+declare float @__expf_finite(float) #0
+declare double @__exp_finite(double) #0
+declare double @__log_finite(double) #0
+declare float @__logf_finite(float) #0
+declare float @__powf_finite(float, float) #0
+declare double @__pow_finite(double, double) #0
+declare float @__exp2f_finite(float) #0
+declare double @__exp2_finite(double) #0
+declare float @__log2f_finite(float) #0
+declare double @__log2_finite(double) #0
+declare float @__log10f_finite(float) #0
+declare double @__log10_finite(double) #0
+declare double @__exp10_finite(double) #0
+declare float @__exp10f_finite(float) #0
+declare double @__asin_finite(double) #0
+declare float @__asinf_finite(float) #0
+declare float @__acosf_finite(float) #0
\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
index 4acc7fe7eaccf6..f6c91de15279e4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
@@ -1444,6 +1444,59 @@ for.end:
ret void
}
+define void @log10_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_f64(
+; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log10(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.log10.v4f64(<4 x double> [[TMP4:%.*]])
+; C...
[truncated]
@florianhumblot @alexey-bataev @RKSimon @phoebewang @farzonl
define void @exp10_f64(ptr nocapture %varray) {
; CHECK-LABEL: @exp10_f64(
; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp10(<2 x double> [[TMP4:%.*]])
; CHECK-VF4-NOT: call <4 x double> @amd_vrd4_exp10(<4 x double> [[TMP4:%.*]])
Check for @llvm.exp10.v4f64 instead (etc.)? That's what we do in the rest of the file.
@RKSimon Please check the above comment
Are we just missing exp10 handling in isTriviallyVectorizable?
Yes, the exp10 case is not handled in bool llvm::isTriviallyVectorizable(Intrinsic::ID ID).
Any luck with adding exp10 to isTriviallyVectorizable?
define void @exp10_f64_intrinsic(ptr nocapture %varray) {
; CHECK-LABEL: @exp10_f64_intrinsic(
; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp10(<2 x double> [[TMP4:%.*]])
Missing VF2 entries (below)
I do not fully understand the comment. VF2 is present. Which entries are you referring to? Please point them out.
Sorry, I added the comment to the wrong test - I meant that every test should have VF2/4/8/16 test checks.
define void @exp10_f32(ptr nocapture %varray) {
; CHECK-LABEL: @exp10_f32(
; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp10f(<4 x float> [[TMP4:%.*]])
; CHECK-VF8-NOT: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_exp10f(<8 x float> [[TMP4:%.*]])
It's OK if you want to keep the CHECK-VF*-NOT line as well, but I'd expect the CHECK-VF* line (llvm.exp10.v8f32?) etc. as well.
I will start by adding support for exp10 in isTriviallyVectorizable.
Now exp10 is generated as llvm.exp10.v8f32 for VF=8, and correspondingly for the other vectorization factors.
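For reference, a sketch of the kind of change being discussed, assuming the switch-based structure of llvm/lib/Analysis/VectorUtils.cpp (the case list is abbreviated; this is not the exact committed diff):

```cpp
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Intrinsics listed here are treated as trivially vectorizable, so the
// vectorizer can widen llvm.exp10.f64 to llvm.exp10.v4f64 and the TLI
// mappings can then pick an AMD LIBM variant where one exists.
bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::exp:
  case Intrinsic::exp2:
  case Intrinsic::exp10: // newly handled, per the discussion above
  case Intrinsic::log:
  case Intrinsic::log10:
  // ... (many more cases in the real function)
    return true;
  default:
    return false;
  }
}
```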
ret void
}
define void @log10_f32(ptr nocapture %varray) { |
(pedantic) we're typically ordering the tests as f64/f32/f64_intrinsic/f32_intrinsic sets
Yes, it helps to cover the cases. Is there some other convention that should be followed?
mainly just consistency in grouping/ordering/naming to make it easier for people to find related tests
Change-Id: Iba8d1ebd0cbad09d9f6ebc4215cc4467239420b5
…ct into amdlibm_fork Change-Id: Icbb57343057fafa4ecbb4c92c226d714660e4407
LGTM
@nikic Please review the pending changes
Gentle Reminder...
@rohitaggarwal007 Please can you rebase and then I will commit this
Sure, thanks
Change-Id: Id610cc3ec94ae1efaba511869ad4437d9a95b23e
…ct into amdlibm_fork Change-Id: I8dba5d48e1c0e58903c6c64e7fc0af0b62eba7c9
@RKSimon, I have rebased the code.
Thanks @RKSimon
AMD has its own implementation of vector calls. New vector calls are introduced in the library for exp10, log10, sincos and finite asin/acos. Please refer to [https://github.com/amd/aocl-libm-ose]. --------- Co-authored-by: Rohit Aggarwal <[email protected]>
Hi,
AMD has its own implementation of vector calls. New vector calls are introduced in the library.
Please refer to [https://github.com/amd/aocl-libm-ose]