Adding more vector calls for -fveclib=AMDLIBM #109662


Merged: 9 commits into llvm:main from amdlibm_fork on Oct 29, 2024

Conversation

rohitaggarwal007
Contributor

Hi,

AMD has its own implementation of vector calls. New vector calls (exp10, log10, sincos, and the finite variants of asin/acos) are introduced in the library.
Please refer to https://github.com/amd/aocl-libm-ose

Change-Id: Iba8d1ebd0cbad09d9f6ebc4215cc4467239420b5
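
For context, here is a minimal sketch of how these mappings are exercised. It is illustrative only: the file name and loop below are not part of the patch, and AOCL-LibM is assumed to be linked into the final binary.

// minimal_log10.cpp (hypothetical example, not part of this patch).
// Built with:
//   clang++ -O2 -ffast-math -fveclib=AMDLIBM -c minimal_log10.cpp
// the loop vectorizer can use the TLI mappings added here to replace the
// scalar log10 calls with calls to amd_vrd2_log10 (or a wider variant).
#include <cmath>

void vlog10(double *x, int n) {
  for (int i = 0; i < n; ++i)
    x[i] = std::log10(x[i]);
}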
llvmbot added the llvm:analysis (includes value tracking, cost tables and constant folding) and llvm:transforms labels on Sep 23, 2024
@llvmbot
Member

llvmbot commented Sep 23, 2024

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-llvm-analysis

Author: Rohit Aggarwal (rohitaggarwal007)

Changes

Hi,

AMD has its own implementation of vector calls. New vector calls (exp10, log10, sincos, and the finite variants of asin/acos) are introduced in the library.
Please refer to https://github.com/amd/aocl-libm-ose


Patch is 32.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109662.diff

4 Files Affected:

  • (modified) llvm/include/llvm/Analysis/VecFuncs.def (+24)
  • (modified) llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll (+182-93)
  • (modified) llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll (+212)
  • (modified) llvm/test/Transforms/Util/add-TLI-mappings.ll (+25-2)
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 532a3ca334b1ae..06024131c5bdac 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -1317,14 +1317,17 @@ TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd2_log2", FIXED(2), NOMASK, "_ZGV_LLV
 TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd4_log2", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd8_log2", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 
+TLI_DEFINE_VECFUNC("log10", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("log10f", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
 TLI_DEFINE_VECFUNC("log10f", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 TLI_DEFINE_VECFUNC("log10f", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 
+TLI_DEFINE_VECFUNC("__log10_finite", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
 TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "amd_vrd2_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
 TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
@@ -1339,6 +1342,12 @@ TLI_DEFINE_VECFUNC("erf", "amd_vrd8_erf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 TLI_DEFINE_VECFUNC("exp10", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("exp10f", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 
+TLI_DEFINE_VECFUNC("__exp10_finite", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__exp10f_finite", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("llvm.exp10.f64", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp10.f32", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
 TLI_DEFINE_VECFUNC("expm1", "amd_vrd2_expm1", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("expm1f", "amd_vrs4_expm1f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 
@@ -1369,10 +1378,19 @@ TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LL
 TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
 
+TLI_DEFINE_VECFUNC("__asin_finite", "amd_vrd8_asin", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__asinf_finite", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
 TLI_DEFINE_VECFUNC("acosf", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("acosf", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 TLI_DEFINE_VECFUNC("acosf", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
 
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__acosf_finite", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
 TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs16_acosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
 TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
@@ -1410,6 +1428,12 @@ TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs16_tanhf", FIXED(16), NOMASK, "_ZGV_
 TLI_DEFINE_VECFUNC("cbrt", "amd_vrd2_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("cbrtf", "amd_vrs4_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 
+TLI_DEFINE_VECFUNC("sincos", "amd_vrd4_sincos", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl8l8")
+TLI_DEFINE_VECFUNC("sincos", "amd_vrd8_sincos", FIXED(8), NOMASK, "_ZGV_LLVM_N8vl8l8")
+
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs4_sincosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4l4")
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs8_sincosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8vl4l4")
+TLI_DEFINE_VECFUNC("sincosf", "amd_vrs16_sincosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16vl4l4")
 #else
 #error "Must choose which vector library functions are to be defined."
 #endif
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
index 54bb9352f3c89c..9899eded738086 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
@@ -7,12 +7,10 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-declare float @__expf_finite(float) #0
-
+define void @exp_f32(ptr nocapture %varray) {
 ; CHECK-LABEL: @exp_f32
 ; CHECK: <4 x float> @amd_vrs4_expf
 ; CHECK: ret
-define void @exp_f32(ptr nocapture %varray) {
 entry:
   br label %for.body
 
@@ -25,23 +23,16 @@ for.body:                                         ; preds = %for.body, %entry
   store float %call, ptr %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
 
 for.end:                                          ; preds = %for.body
   ret void
 }
 
-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__exp_finite(double) #0
-
+define void @exp_f64(ptr nocapture %varray) {
 ; CHECK-LABEL: @exp_f64
 ; CHECK: <4 x double> @amd_vrd4_exp
 ; CHECK: ret
-define void @exp_f64(ptr nocapture %varray) {
 entry:
   br label %for.body
 
@@ -54,25 +45,16 @@ for.body:                                         ; preds = %for.body, %entry
   store double %call, ptr %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !11
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
 
 for.end:                                          ; preds = %for.body
   ret void
 }
 
-!11 = distinct !{!11, !12, !13}
-!12 = !{!"llvm.loop.vectorize.width", i32 4}
-!13 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-
-
-declare float @__logf_finite(float) #0
-
+define void @log_f32(ptr nocapture %varray) {
 ; CHECK-LABEL: @log_f32
 ; CHECK: <4 x float> @amd_vrs4_logf
 ; CHECK: ret
-define void @log_f32(ptr nocapture %varray) {
 entry:
   br label %for.body
 
@@ -85,23 +67,16 @@ for.body:                                         ; preds = %for.body, %entry
   store float %call, ptr %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
 
 for.end:                                          ; preds = %for.body
   ret void
 }
 
-!21 = distinct !{!21, !22, !23}
-!22 = !{!"llvm.loop.vectorize.width", i32 4}
-!23 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__log_finite(double) #0
-
+define void @log_f64(ptr nocapture %varray) {
 ; CHECK-LABEL: @log_f64
 ; CHECK: <4 x double> @amd_vrd4_log
 ; CHECK: ret
-define void @log_f64(ptr nocapture %varray) {
 entry:
   br label %for.body
 
@@ -114,23 +89,16 @@ for.body:                                         ; preds = %for.body, %entry
   store double %call, ptr %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
 
 for.end:                                          ; preds = %for.body
   ret void
 }
 
-!31 = distinct !{!31, !32, !33}
-!32 = !{!"llvm.loop.vectorize.width", i32 4}
-!33 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare float @__powf_finite(float, float) #0
-
+define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f32
 ; CHECK: <4 x float> @amd_vrs4_powf
 ; CHECK: ret
-define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) {
 entry:
   br label %for.body
 
@@ -145,23 +113,16 @@ for.body:                                         ; preds = %for.body, %entry
   store float %tmp2, ptr %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !41
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
 
 for.end:                                          ; preds = %for.body
   ret void
 }
 
-!41 = distinct !{!41, !42, !43}
-!42 = !{!"llvm.loop.vectorize.width", i32 4}
-!43 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__pow_finite(double, double) #0
-
+define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f64
 ; CHECK: <4 x double> @amd_vrd4_pow
 ; CHECK: ret
-define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) {
 entry:
   br label %for.body
 
@@ -176,18 +137,12 @@ for.body:                                         ; preds = %for.body, %entry
   store double %tmp2, ptr %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !51
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
 
 for.end:                                          ; preds = %for.body
   ret void
 }
 
-!51 = distinct !{!51, !52, !53}
-!52 = !{!"llvm.loop.vectorize.width", i32 4}
-!53 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__exp2f_finite(float) #0
-
 define void @exp2f_finite(ptr nocapture %varray) {
 ; CHECK-LABEL: @exp2f_finite(
 ; CHECK:    call <4 x float> @amd_vrs4_exp2f(<4 x float> %{{.*}})
@@ -205,18 +160,12 @@ for.body:
   store float %call, ptr %arrayidx, align 4
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv.next, 1000
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !61
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
 
 for.end:
   ret void
 }
 
-!61 = distinct !{!61, !62, !63}
-!62 = !{!"llvm.loop.vectorize.width", i32 4}
-!63 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare double @__exp2_finite(double) #0
-
 define void @exp2_finite(ptr nocapture %varray) {
 ; CHECK-LABEL: @exp2_finite(
 ; CHECK:    call <4 x double> @amd_vrd4_exp2(<4 x double> {{.*}})
@@ -234,22 +183,16 @@ for.body:
   store double %call, ptr %arrayidx, align 4
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv.next, 1000
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !71
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
 
 for.end:
   ret void
 }
 
-!71 = distinct !{!71, !72, !73}
-!72 = !{!"llvm.loop.vectorize.width", i32 4}
-!73 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__log2f_finite(float) #0
-
+define void @log2_f32(ptr nocapture %varray) {
 ; CHECK-LABEL: @log2_f32
 ; CHECK: <4 x float> @amd_vrs4_log2f
 ; CHECK: ret
-define void @log2_f32(ptr nocapture %varray) {
 entry:
   br label %for.body
 
@@ -262,23 +205,16 @@ for.body:                                         ; preds = %for.body, %entry
   store float %call, ptr %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
 
 for.end:                                          ; preds = %for.body
   ret void
 }
 
-!81 = distinct !{!21, !22, !23}
-!82 = !{!"llvm.loop.vectorize.width", i32 4}
-!83 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-
-declare double @__log2_finite(double) #0
-
+define void @log2_f64(ptr nocapture %varray) {
 ; CHECK-LABEL: @log2_f64
 ; CHECK: <4 x double> @amd_vrd4_log2
 ; CHECK: ret
-define void @log2_f64(ptr nocapture %varray) {
 entry:
   br label %for.body
 
@@ -291,22 +227,16 @@ for.body:                                         ; preds = %for.body, %entry
   store double %call, ptr %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
 
 for.end:                                          ; preds = %for.body
   ret void
 }
 
-!91 = distinct !{!31, !32, !33}
-!92 = !{!"llvm.loop.vectorize.width", i32 4}
-!93 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-declare float @__log10f_finite(float) #0
-
+define void @log10_f32(ptr nocapture %varray) {
 ; CHECK-LABEL: @log10_f32
 ; CHECK: <4 x float> @amd_vrs4_log10f
 ; CHECK: ret
-define void @log10_f32(ptr nocapture %varray) {
 entry:
   br label %for.body
 
@@ -319,14 +249,173 @@ for.body:                                         ; preds = %for.body, %entry
   store float %call, ptr %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @log10_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_finite(
+; CHECK:    call <2 x double> @amd_vrd2_log10(<2 x double> {{.*}})
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @__log10_finite(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+define void @exp10_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_finite(
+; CHECK:    call <2 x double> @amd_vrd2_exp10(<2 x double> {{.*}})
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @__exp10_finite(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+define void @exp10_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @exp10_f32
+; CHECK: <4 x float> @amd_vrs4_exp10f
+; CHECK: ret
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call fast float @__exp10f_finite(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+  store float %call, ptr %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
 
 for.end:                                          ; preds = %for.body
   ret void
 }
 
-!101 = distinct !{!21, !22, !23}
-!102 = !{!"llvm.loop.vectorize.width", i32 4}
-!103 = !{!"llvm.loop.vectorize.enable", i1 true}
+define void @asin_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @asin_finite(
+; CHECK:    call <8 x double> @amd_vrd8_asin(<8 x double> {{.*}})
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
 
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @__asin_finite(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !7
+
+for.end:
+  ret void
+}
 
+define void @asinf_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @asinf_finite
+; CHECK: <4 x float> @amd_vrs4_asinf
+; CHECK: ret
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call fast float @__asinf_finite(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+  store float %call, ptr %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @acosf_finite(ptr nocapture %varray) {
+; CHECK-LABEL: @acosf_finite
+; CHECK: <4 x float> @amd_vrs4_acosf
+; CHECK: ret
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call fast float @__acosf_finite(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv
+  store float %call, ptr %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!4 = distinct !{!4, !5, !6}
+!5 = !{!"llvm.loop.vectorize.width", i32 4}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!7 = distinct !{!7, !8, !9}
+!8 = !{!"llvm.loop.vectorize.width", i32 8}
+!9 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+declare float @__expf_finite(float) #0
+declare double @__exp_finite(double) #0
+declare double @__log_finite(double) #0
+declare float @__logf_finite(float) #0
+declare float @__powf_finite(float, float) #0
+declare double @__pow_finite(double, double) #0
+declare float @__exp2f_finite(float) #0
+declare double @__exp2_finite(double) #0
+declare float @__log2f_finite(float) #0
+declare double @__log2_finite(double) #0
+declare float @__log10f_finite(float) #0
+declare double @__log10_finite(double) #0
+declare double @__exp10_finite(double) #0
+declare float @__exp10f_finite(float) #0
+declare double @__asin_finite(double) #0
+declare float @__asinf_finite(float) #0
+declare float @__acosf_finite(float) #0
\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
index 4acc7fe7eaccf6..f6c91de15279e4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
@@ -1444,6 +1444,59 @@ for.end:
   ret void
 }
 
+define void @log10_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_f64(
+; CHECK-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_log10(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4:    [[TMP5:%.*]] = call <4 x double> @llvm.log10.v4f64(<4 x double> [[TMP4:%.*]])
+; C...
[truncated]
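
One detail worth noting from the VecFuncs.def hunk above: the sincos entries use mangled names such as _ZGV_LLVM_N4vl8l8, where v marks the vector argument and l8 appears to mark a linear pointer argument stepping by the element size in bytes per lane (l4 for float). Below is a hedged sketch of the scalar pattern these mappings target; the function names are illustrative, not part of the patch.

// sincos is a GNU extension, so declare it explicitly.
extern "C" void sincos(double x, double *sinOut, double *cosOut);

// Each iteration reads a[i] (the 'v' argument) and writes through pointers
// that advance by sizeof(double) == 8 bytes per lane (the 'l8' arguments),
// matching the _ZGV_LLVM_N4vl8l8 shape used for amd_vrd4_sincos.
void sin_cos_loop(const double *a, double *s, double *c, int n) {
  for (int i = 0; i < n; ++i)
    sincos(a[i], &s[i], &c[i]);
}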

@rohitaggarwal007
Contributor Author

@florianhumblot @alexey-bataev @RKSimon @phoebewang @farzonl
Please review the pull request.

RKSimon self-requested a review September 24, 2024 07:17
define void @exp10_f64(ptr nocapture %varray) {
; CHECK-LABEL: @exp10_f64(
; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp10(<2 x double> [[TMP4:%.*]])
; CHECK-VF4-NOT: call <4 x double> @amd_vrd4_exp10(<4 x double> [[TMP4:%.*]])
Collaborator

Check for @llvm.exp10.v4f64 instead (etc.)? That's what we do in the rest of the file.

Contributor Author

rohitaggarwal007 Sep 24, 2024

There is no such intrinsic emitted for exp10. I tried, but it does not emit llvm.exp10.v4f64; instead, it generates a call to exp10.
[screenshot of the generated IR]

Contributor Author

@RKSimon Please check the above comment

Collaborator

Are we just missing exp10 handling in isTriviallyVectorizable?

Contributor Author

Yes, the case for exp10 is not handled in bool llvm::isTriviallyVectorizable(Intrinsic::ID ID).
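
For reference, a hedged sketch of the kind of change being discussed. The real switch in llvm/lib/Analysis/VectorUtils.cpp covers many more intrinsics; only a few neighboring cases are shown here.

// Abbreviated sketch of isTriviallyVectorizable (most cases elided).
// Adding Intrinsic::exp10 allows the vectorizer to form llvm.exp10.v4f64,
// llvm.exp10.v8f32, etc., instead of falling back to scalar exp10 calls.
bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
  switch (ID) {
  // ... many existing cases elided ...
  case Intrinsic::exp:
  case Intrinsic::exp2:
  case Intrinsic::exp10: // newly handled
  case Intrinsic::log:
  case Intrinsic::log2:
  case Intrinsic::log10:
    return true;
  default:
    return false;
  }
}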

Collaborator

RKSimon left a comment

Any luck with adding exp10 to isTriviallyVectorizable?


define void @exp10_f64_intrinsic(ptr nocapture %varray) {
; CHECK-LABEL: @exp10_f64_intrinsic(
; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_exp10(<2 x double> [[TMP4:%.*]])
Collaborator

RKSimon Oct 9, 2024

Missing VF2 entries (below)

Contributor Author

I do not fully understand the comment. VF2 is present. Which entries are you referring to here? Please point them out.

Collaborator

Sorry, I added the comment to the wrong test - I meant that every test should have VF2/4/8/16 test checks.

define void @exp10_f32(ptr nocapture %varray) {
; CHECK-LABEL: @exp10_f32(
; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp10f(<4 x float> [[TMP4:%.*]])
; CHECK-VF8-NOT: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_exp10f(<8 x float> [[TMP4:%.*]])
Collaborator

It's OK if you want to keep the CHECK-VF*-NOT line as well, but I'd expect the CHECK-VF* line (llvm.exp10.v8f32?) etc. as well.

Contributor Author

I will start by adding support for exp10 in isTriviallyVectorizable.

Contributor Author

Now exp10 is generated as llvm.exp10.v8f32 for VF=8, and correspondingly for the other vector widths.

ret void
}


define void @log10_f32(ptr nocapture %varray) {
Collaborator

(pedantic) we're typically ordering the tests as f64/f32/f64_intrinsic/f32_intrinsic sets

Contributor Author

Yes, that helps cover the cases. Is there some other convention that should be followed?

Collaborator

Mainly just consistency in grouping/ordering/naming, to make it easier for people to find related tests.

Rohit Aggarwal added 2 commits October 17, 2024 23:47
Change-Id: Iba8d1ebd0cbad09d9f6ebc4215cc4467239420b5
…ct into amdlibm_fork

Change-Id: Icbb57343057fafa4ecbb4c92c226d714660e4407
Collaborator

RKSimon left a comment

LGTM

@rohitaggarwal007
Contributor Author

@nikic Please review the pending changes

@rohitaggarwal007
Contributor Author

@nikic Please review the pending changes

Gentle Reminder...

@RKSimon
Collaborator

RKSimon commented Oct 28, 2024

@rohitaggarwal007 Please can you rebase and then I will commit this

@rohitaggarwal007
Contributor Author

@rohitaggarwal007 Please can you rebase and then I will commit this

Sure, Thanks

@rohitaggarwal007
Contributor Author

@RKSimon, I have rebased the code.

RKSimon merged commit dfb60bb into llvm:main Oct 29, 2024
8 checks passed
rohitaggarwal007 deleted the amdlibm_fork branch October 29, 2024 11:49
@rohitaggarwal007
Contributor Author

Thanks @RKSimon

NoumanAmir657 pushed a commit to NoumanAmir657/llvm-project that referenced this pull request Nov 4, 2024
AMD has its own implementation of vector calls.
New vector calls are introduced in the library for exp10, log10, sincos and the finite variants of asin/acos.
Please refer to https://github.com/amd/aocl-libm-ose

---------

Co-authored-by: Rohit Aggarwal <[email protected]>
Labels
llvm:analysis, llvm:transforms
4 participants