swiftlang
diff --git a/‎clang/include/clang/Basic/BuiltinsVEVL.gen.def
Lines changed: 622 additions & 0 deletions b/‎clang/include/clang/Basic/BuiltinsVEVL.gen.def
Lines changed: 622 additions & 0 deletions
diff --git a/‎clang/lib/Headers/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎clang/lib/Headers/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎clang/lib/Headers/velintrin.h
Lines changed: 36 additions & 0 deletions b/‎clang/lib/Headers/velintrin.h
Lines changed: 36 additions & 0 deletions
diff --git a/‎clang/lib/Headers/velintrin_approx.h
Lines changed: 120 additions & 0 deletions b/‎clang/lib/Headers/velintrin_approx.h
Lines changed: 120 additions & 0 deletions
@@ -147,6 +147,7 @@ set(files
   xtestintrin.h
   velintrin.h
   velintrin_gen.h
+  velintrin_approx.h
   )
 
 set(cuda_wrapper_files
 
@@ -32,7 +32,43 @@ typedef bool __vm512 __attribute__((ext_vector_type(512)));
 #endif
 #endif
 
+enum VShuffleCodes {
+  VE_VSHUFFLE_YUYU = 0,
+  VE_VSHUFFLE_YUYL = 1,
+  VE_VSHUFFLE_YUZU = 2,
+  VE_VSHUFFLE_YUZL = 3,
+  VE_VSHUFFLE_YLYU = 4,
+  VE_VSHUFFLE_YLYL = 5,
+  VE_VSHUFFLE_YLZU = 6,
+  VE_VSHUFFLE_YLZL = 7,
+  VE_VSHUFFLE_ZUYU = 8,
+  VE_VSHUFFLE_ZUYL = 9,
+  VE_VSHUFFLE_ZUZU = 10,
+  VE_VSHUFFLE_ZUZL = 11,
+  VE_VSHUFFLE_ZLYU = 12,
+  VE_VSHUFFLE_ZLYL = 13,
+  VE_VSHUFFLE_ZLZU = 14,
+  VE_VSHUFFLE_ZLZL = 15,
+};
+
 // Use generated intrinsic name definitions
 #include <velintrin_gen.h>
 
+// Use helper functions
+#include <velintrin_approx.h>
+
+// pack
+
+#define _vel_pack_f32p __builtin_ve_vl_pack_f32p
+#define _vel_pack_f32a __builtin_ve_vl_pack_f32a
+
+static inline unsigned long int _vel_pack_i32(unsigned int a, unsigned int b) {
+  return (((unsigned long int)a) << 32) | b;
+}
+
+#define _vel_extract_vm512u(vm) __builtin_ve_vl_extract_vm512u(vm)
+#define _vel_extract_vm512l(vm) __builtin_ve_vl_extract_vm512l(vm)
+#define _vel_insert_vm512u(vm512, vm) __builtin_ve_vl_insert_vm512u(vm512, vm)
+#define _vel_insert_vm512l(vm512, vm) __builtin_ve_vl_insert_vm512l(vm512, vm)
+
 #endif
@@ -0,0 +1,120 @@
+/*===---- velintrin_approx.h - VEL intrinsics helper for VE ----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __VEL_INTRIN_APPROX_H__
+#define __VEL_INTRIN_APPROX_H__
+
+static inline __vr _vel_approx_vfdivs_vvvl(__vr v0, __vr v1, int l) {
+  float s0;
+  __vr v2, v3, v4, v5;
+  v5 = _vel_vrcps_vvl(v1, l);
+  s0 = 1.0;
+  v4 = _vel_vfnmsbs_vsvvl(s0, v1, v5, l);
+  v3 = _vel_vfmads_vvvvl(v5, v5, v4, l);
+  v2 = _vel_vfmuls_vvvl(v0, v3, l);
+  v4 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
+  v2 = _vel_vfmads_vvvvl(v2, v5, v4, l);
+  v0 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
+  v0 = _vel_vfmads_vvvvl(v2, v3, v0, l);
+  return v0;
+}
+
+static inline __vr _vel_approx_pvfdiv_vvvl(__vr v0, __vr v1, int l) {
+  float s0;
+  __vr v2, v3, v4, v5;
+  v5 = _vel_pvrcp_vvl(v1, l);
+  s0 = 1.0;
+  v4 = _vel_pvfnmsb_vsvvl(s0, v1, v5, l);
+  v3 = _vel_pvfmad_vvvvl(v5, v5, v4, l);
+  v2 = _vel_pvfmul_vvvl(v0, v3, l);
+  v4 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
+  v2 = _vel_pvfmad_vvvvl(v2, v5, v4, l);
+  v0 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
+  v0 = _vel_pvfmad_vvvvl(v2, v3, v0, l);
+  return v0;
+}
+
+static inline __vr _vel_approx_vfdivs_vsvl(float s0, __vr v0, int l) {
+  float s1;
+  __vr v1, v2, v3, v4;
+  v4 = _vel_vrcps_vvl(v0, l);
+  s1 = 1.0;
+  v2 = _vel_vfnmsbs_vsvvl(s1, v0, v4, l);
+  v2 = _vel_vfmads_vvvvl(v4, v4, v2, l);
+  v1 = _vel_vfmuls_vsvl(s0, v2, l);
+  v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
+  v1 = _vel_vfmads_vvvvl(v1, v4, v3, l);
+  v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
+  v0 = _vel_vfmads_vvvvl(v1, v2, v3, l);
+  return v0;
+}
+
+static inline __vr _vel_approx_vfdivs_vvsl(__vr v0, float s0, int l) {
+  float s1;
+  __vr v1, v2;
+  s1 = 1.0f / s0;
+  v1 = _vel_vfmuls_vsvl(s1, v0, l);
+  v2 = _vel_vfnmsbs_vvsvl(v0, s0, v1, l);
+  v0 = _vel_vfmads_vvsvl(v1, s1, v2, l);
+  return v0;
+}
+
+static inline __vr _vel_approx_vfdivd_vsvl(double s0, __vr v0, int l) {
+  __vr v1, v2, v3;
+  v2 = _vel_vrcpd_vvl(v0, l);
+  double s1 = 1.0;
+  v3 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
+  v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
+  v1 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
+  v1 = _vel_vfmadd_vvvvl(v2, v2, v1, l);
+  v1 = _vel_vaddul_vsvl(1, v1, l);
+  v3 = _vel_vfnmsbd_vsvvl(s1, v0, v1, l);
+  v3 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
+  v1 = _vel_vfmuld_vsvl(s0, v3, l);
+  v0 = _vel_vfnmsbd_vsvvl(s0, v1, v0, l);
+  v0 = _vel_vfmadd_vvvvl(v1, v3, v0, l);
+  return v0;
+}
+
+static inline __vr _vel_approx_vfsqrtd_vvl(__vr v0, int l) {
+  double s0, s1;
+  __vr v1, v2, v3;
+  v2 = _vel_vrsqrtdnex_vvl(v0, l);
+  v1 = _vel_vfmuld_vvvl(v0, v2, l);
+  s0 = 1.0;
+  s1 = 0.5;
+  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
+  v3 = _vel_vfmuld_vsvl(s1, v3, l);
+  v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
+  v1 = _vel_vfmuld_vvvl(v0, v2, l);
+  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
+  v3 = _vel_vfmuld_vsvl(s1, v3, l);
+  v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
+  return v0;
+}
+
+static inline __vr _vel_approx_vfsqrts_vvl(__vr v0, int l) {
+  float s0, s1;
+  __vr v1, v2, v3;
+  v0 = _vel_vcvtds_vvl(v0, l);
+  v2 = _vel_vrsqrtdnex_vvl(v0, l);
+  v1 = _vel_vfmuld_vvvl(v0, v2, l);
+  s0 = 1.0;
+  s1 = 0.5;
+  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
+  v3 = _vel_vfmuld_vsvl(s1, v3, l);
+  v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
+  v1 = _vel_vfmuld_vvvl(v0, v2, l);
+  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
+  v3 = _vel_vfmuld_vsvl(s1, v3, l);
+  v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
+  v0 = _vel_vcvtsd_vvl(v0, l);
+  return v0;
+}
+
+#endif
Original file line number	Diff line number	Diff line change
`@@ -147,6 +147,7 @@ set(files`
`147`	`147`	`xtestintrin.h`
`148`	`148`	`velintrin.h`
`149`	`149`	`velintrin_gen.h`
	`150`	`+ velintrin_approx.h`
`150`	`151`	`)`
`151`	`152`
`152`	`153`	`set(cuda_wrapper_files`