Commit c2f62ab
[Clang][VE] Add the rest of intrinsics to clang

Add the remaining intrinsics to clang, except those that use vector mask registers.

Reviewed By: simoll
Differential Revision: https://reviews.llvm.org/D121586

1 parent 3833b4b, commit c2f62ab
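For orientation: every VEL intrinsic takes an explicit vector length as its last argument, and clang lowers the `_vel_*` names onto `__builtin_ve_vl_*` builtins. A minimal usage sketch, assuming the `_vel_vldu_vssl`, `_vel_vfmuls_vvvl`, and `_vel_vstu_vssl` intrinsics from the VEL set this series exposes (`mul_f32` and its arguments are illustrative, not part of the commit):

#include <velintrin.h>

// Elementwise float multiply on the VE vector unit. Sketch only:
// assumes n <= 256 (the maximum VE vector length); 4 is the byte
// stride between consecutive elements.
void mul_f32(float *dst, float *a, float *b, int n) {
  __vr va = _vel_vldu_vssl(4, a, n);     // load n floats from a
  __vr vb = _vel_vldu_vssl(4, b, n);     // load n floats from b
  __vr vc = _vel_vfmuls_vvvl(va, vb, n); // single-precision multiply
  _vel_vstu_vssl(vc, 4, dst, n);         // store n results to dst
}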

File tree: 6 files changed (+6362, -1 lines)


clang/include/clang/Basic/BuiltinsVEVL.gen.def (622 additions, 0 deletions)

Large diffs are not rendered by default.
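This generated file is a list of BUILTIN records, one per intrinsic, pulled into clang's builtin tables. The record shape below is the standard clang .def convention; the builtin name is real, but the exact prototype string is an assumption, not a line copied from the file:

// Illustrative entry shape for BuiltinsVEVL.gen.def:
// BUILTIN(name, prototype-encoding, attributes)
BUILTIN(__builtin_ve_vl_vfmuls_vvvl, "V256dV256dV256di", "n")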

clang/lib/Headers/CMakeLists.txt (1 addition, 0 deletions)

@@ -147,6 +147,7 @@ set(files
   xtestintrin.h
   velintrin.h
   velintrin_gen.h
+  velintrin_approx.h
   )
 
 set(cuda_wrapper_files

clang/lib/Headers/velintrin.h (36 additions, 0 deletions)

@@ -32,7 +32,43 @@ typedef bool __vm512 __attribute__((ext_vector_type(512)));
 #endif
 #endif
 
+enum VShuffleCodes {
+  VE_VSHUFFLE_YUYU = 0,
+  VE_VSHUFFLE_YUYL = 1,
+  VE_VSHUFFLE_YUZU = 2,
+  VE_VSHUFFLE_YUZL = 3,
+  VE_VSHUFFLE_YLYU = 4,
+  VE_VSHUFFLE_YLYL = 5,
+  VE_VSHUFFLE_YLZU = 6,
+  VE_VSHUFFLE_YLZL = 7,
+  VE_VSHUFFLE_ZUYU = 8,
+  VE_VSHUFFLE_ZUYL = 9,
+  VE_VSHUFFLE_ZUZU = 10,
+  VE_VSHUFFLE_ZUZL = 11,
+  VE_VSHUFFLE_ZLYU = 12,
+  VE_VSHUFFLE_ZLYL = 13,
+  VE_VSHUFFLE_ZLZU = 14,
+  VE_VSHUFFLE_ZLZL = 15,
+};
+
 // Use generated intrinsic name definitions
 #include <velintrin_gen.h>
 
+// Use helper functions
+#include <velintrin_approx.h>
+
+// pack
+
+#define _vel_pack_f32p __builtin_ve_vl_pack_f32p
+#define _vel_pack_f32a __builtin_ve_vl_pack_f32a
+
+static inline unsigned long int _vel_pack_i32(unsigned int a, unsigned int b) {
+  return (((unsigned long int)a) << 32) | b;
+}
+
+#define _vel_extract_vm512u(vm) __builtin_ve_vl_extract_vm512u(vm)
+#define _vel_extract_vm512l(vm) __builtin_ve_vl_extract_vm512l(vm)
+#define _vel_insert_vm512u(vm512, vm) __builtin_ve_vl_insert_vm512u(vm512, vm)
+#define _vel_insert_vm512l(vm512, vm) __builtin_ve_vl_insert_vm512l(vm512, vm)
+
 #endif
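The `VShuffleCodes` values name the 16 VSHF operand encodings (each code picks the upper or lower 32-bit half of the Y and Z inputs), and `_vel_pack_i32` is plain C, so its behavior follows directly from the definition above. A short sketch, assuming the generated `_vel_vshf_vvvsl` intrinsic from velintrin_gen.h (the constants are illustrative):

#include <velintrin.h>

// Build one 64-bit word from two 32-bit halves, then shuffle the
// 32-bit halves of vy/vz using a named code instead of a bare immediate.
static __vr demo_shuffle(__vr vy, __vr vz, int vl) {
  unsigned long int w = _vel_pack_i32(0xAABBCCDDu, 0x11223344u);
  (void)w;  // w == 0xAABBCCDD11223344
  return _vel_vshf_vvvsl(vy, vz, VE_VSHUFFLE_YUZL, vl);
}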

clang/lib/Headers/velintrin_approx.h (new file, 120 additions, 0 deletions)

/*===---- velintrin_approx.h - VEL intrinsics helper for VE ----------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __VEL_INTRIN_APPROX_H__
#define __VEL_INTRIN_APPROX_H__

static inline __vr _vel_approx_vfdivs_vvvl(__vr v0, __vr v1, int l) {
  float s0;
  __vr v2, v3, v4, v5;
  v5 = _vel_vrcps_vvl(v1, l);
  s0 = 1.0;
  v4 = _vel_vfnmsbs_vsvvl(s0, v1, v5, l);
  v3 = _vel_vfmads_vvvvl(v5, v5, v4, l);
  v2 = _vel_vfmuls_vvvl(v0, v3, l);
  v4 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
  v2 = _vel_vfmads_vvvvl(v2, v5, v4, l);
  v0 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
  v0 = _vel_vfmads_vvvvl(v2, v3, v0, l);
  return v0;
}

static inline __vr _vel_approx_pvfdiv_vvvl(__vr v0, __vr v1, int l) {
  float s0;
  __vr v2, v3, v4, v5;
  v5 = _vel_pvrcp_vvl(v1, l);
  s0 = 1.0;
  v4 = _vel_pvfnmsb_vsvvl(s0, v1, v5, l);
  v3 = _vel_pvfmad_vvvvl(v5, v5, v4, l);
  v2 = _vel_pvfmul_vvvl(v0, v3, l);
  v4 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
  v2 = _vel_pvfmad_vvvvl(v2, v5, v4, l);
  v0 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
  v0 = _vel_pvfmad_vvvvl(v2, v3, v0, l);
  return v0;
}

static inline __vr _vel_approx_vfdivs_vsvl(float s0, __vr v0, int l) {
  float s1;
  __vr v1, v2, v3, v4;
  v4 = _vel_vrcps_vvl(v0, l);
  s1 = 1.0;
  v2 = _vel_vfnmsbs_vsvvl(s1, v0, v4, l);
  v2 = _vel_vfmads_vvvvl(v4, v4, v2, l);
  v1 = _vel_vfmuls_vsvl(s0, v2, l);
  v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
  v1 = _vel_vfmads_vvvvl(v1, v4, v3, l);
  v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
  v0 = _vel_vfmads_vvvvl(v1, v2, v3, l);
  return v0;
}

static inline __vr _vel_approx_vfdivs_vvsl(__vr v0, float s0, int l) {
  float s1;
  __vr v1, v2;
  s1 = 1.0f / s0;
  v1 = _vel_vfmuls_vsvl(s1, v0, l);
  v2 = _vel_vfnmsbs_vvsvl(v0, s0, v1, l);
  v0 = _vel_vfmads_vvsvl(v1, s1, v2, l);
  return v0;
}

static inline __vr _vel_approx_vfdivd_vsvl(double s0, __vr v0, int l) {
  __vr v1, v2, v3;
  v2 = _vel_vrcpd_vvl(v0, l);
  double s1 = 1.0;
  v3 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
  v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
  v1 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
  v1 = _vel_vfmadd_vvvvl(v2, v2, v1, l);
  v1 = _vel_vaddul_vsvl(1, v1, l);
  v3 = _vel_vfnmsbd_vsvvl(s1, v0, v1, l);
  v3 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
  v1 = _vel_vfmuld_vsvl(s0, v3, l);
  v0 = _vel_vfnmsbd_vsvvl(s0, v1, v0, l);
  v0 = _vel_vfmadd_vvvvl(v1, v3, v0, l);
  return v0;
}

static inline __vr _vel_approx_vfsqrtd_vvl(__vr v0, int l) {
  double s0, s1;
  __vr v1, v2, v3;
  v2 = _vel_vrsqrtdnex_vvl(v0, l);
  v1 = _vel_vfmuld_vvvl(v0, v2, l);
  s0 = 1.0;
  s1 = 0.5;
  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
  v3 = _vel_vfmuld_vsvl(s1, v3, l);
  v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
  v1 = _vel_vfmuld_vvvl(v0, v2, l);
  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
  v3 = _vel_vfmuld_vsvl(s1, v3, l);
  v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
  return v0;
}

static inline __vr _vel_approx_vfsqrts_vvl(__vr v0, int l) {
  float s0, s1;
  __vr v1, v2, v3;
  v0 = _vel_vcvtds_vvl(v0, l);
  v2 = _vel_vrsqrtdnex_vvl(v0, l);
  v1 = _vel_vfmuld_vvvl(v0, v2, l);
  s0 = 1.0;
  s1 = 0.5;
  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
  v3 = _vel_vfmuld_vsvl(s1, v3, l);
  v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
  v1 = _vel_vfmuld_vvvl(v0, v2, l);
  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
  v3 = _vel_vfmuld_vsvl(s1, v3, l);
  v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
  v0 = _vel_vcvtsd_vvl(v0, l);
  return v0;
}

#endif
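These helpers all follow the same pattern: start from the hardware reciprocal (vrcps/vrcpd) or reciprocal-square-root (vrsqrtdnex) estimate and sharpen it with Newton-Raphson steps built from fused multiply intrinsics. A scalar model of the division recurrence, not part of the commit, just to make the dataflow readable:

// Scalar analogue of _vel_approx_vfdivs_vvvl: refine a rough
// reciprocal r of d, then correct the quotient q = n*r twice.
static double approx_div(double n, double d, double r) {
  double e = 1.0 - d * r;   // residual of the estimate (vfnmsbs step)
  r = r + r * e;            // refined reciprocal       (vfmads step)
  double q = n * r;         // initial quotient         (vfmuls step)
  double t = n - q * d;     // quotient residual        (vfnmsbs step)
  q = q + r * t;            // first correction
  t = n - q * d;            // remaining residual
  return q + r * t;         // second correction
}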
