Skip to content

Commit 55448ab

Browse files
[AArch64] Adding Neon Polynomial vadd Intrinsics
This patch adds the following intrinsics: vadd_p8, vadd_p16, vadd_p64, vaddq_p8, vaddq_p16, vaddq_p64, and vaddq_p128.

Reviewed By: t.p.northover, DavidSpickett, ctetreau

Differential Revision: https://reviews.llvm.org/D96825
1 parent 27566e9 commit 55448ab

File tree

3 files changed

+110
-0
lines changed

3 files changed

+110
-0
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1147,6 +1147,8 @@ def SM4E : SInst<"vsm4e", "...", "QUi">;
11471147
def SM4EKEY : SInst<"vsm4ekey", "...", "QUi">;
11481148
}
11491149

1150+
def VADDP : WInst<"vadd", "...", "PcPsPlQPcQPsQPlQPk">;
1151+
11501152
////////////////////////////////////////////////////////////////////////////////
11511153
// Float -> Int conversions with explicit rounding mode
11521154

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5371,7 +5371,10 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
53715371
NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
53725372
NEONMAP1(vabs_v, arm_neon_vabs, 0),
53735373
NEONMAP1(vabsq_v, arm_neon_vabs, 0),
5374+
NEONMAP0(vadd_v),
53745375
NEONMAP0(vaddhn_v),
5376+
NEONMAP0(vaddq_p128),
5377+
NEONMAP0(vaddq_v),
53755378
NEONMAP1(vaesdq_v, arm_neon_aesd, 0),
53765379
NEONMAP1(vaeseq_v, arm_neon_aese, 0),
53775380
NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0),
@@ -5665,7 +5668,10 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
56655668
NEONMAP0(splatq_laneq_v),
56665669
NEONMAP1(vabs_v, aarch64_neon_abs, 0),
56675670
NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
5671+
NEONMAP0(vadd_v),
56685672
NEONMAP0(vaddhn_v),
5673+
NEONMAP0(vaddq_p128),
5674+
NEONMAP0(vaddq_v),
56695675
NEONMAP1(vaesdq_v, aarch64_crypto_aesd, 0),
56705676
NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0),
56715677
NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0),
@@ -6302,6 +6308,14 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
63026308
if (VTy->getElementType()->isFloatingPointTy())
63036309
return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
63046310
return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
6311+
case NEON::BI__builtin_neon_vadd_v:
6312+
case NEON::BI__builtin_neon_vaddq_v: {
6313+
llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, Quad ? 16 : 8);
6314+
Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
6315+
Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
6316+
Ops[0] = Builder.CreateXor(Ops[0], Ops[1]);
6317+
return Builder.CreateBitCast(Ops[0], Ty);
6318+
}
63056319
case NEON::BI__builtin_neon_vaddhn_v: {
63066320
llvm::FixedVectorType *SrcTy =
63076321
llvm::FixedVectorType::getExtendedElementVectorType(VTy);
@@ -9543,6 +9557,15 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
95439557
case NEON::BI__builtin_neon_vabsh_f16:
95449558
Ops.push_back(EmitScalarExpr(E->getArg(0)));
95459559
return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs");
9560+
case NEON::BI__builtin_neon_vaddq_p128: {
9561+
llvm::Type *Ty = GetNeonType(this, NeonTypeFlags::Poly128);
9562+
Ops.push_back(EmitScalarExpr(E->getArg(1)));
9563+
Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
9564+
Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
9565+
Ops[0] = Builder.CreateXor(Ops[0], Ops[1]);
9566+
llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
9567+
return Builder.CreateBitCast(Ops[0], Int128Ty);
9568+
}
95469569
case NEON::BI__builtin_neon_vldrq_p128: {
95479570
llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
95489571
llvm::Type *Int128PTy = llvm::PointerType::get(Int128Ty, 0);

clang/test/CodeGen/aarch64-poly-add.c

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg \
// RUN: | FileCheck %s

#include <arm_neon.h>

// 8x8-bit polynomial add: addition in GF(2) per lane, so the CHECK lines
// below expect a single bitwise xor with no intrinsic call left after
// mem2reg (CHECK lines are autogenerated; do not edit by hand).
// CHECK-LABEL: @test_vadd_p8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[A:%.*]], [[B:%.*]]
// CHECK-NEXT: ret <8 x i8> [[TMP0]]
//
poly8x8_t test_vadd_p8(poly8x8_t a, poly8x8_t b) {
  return vadd_p8 (a, b);
}

// 4x16-bit polynomial add: the CHECK lines expect both operands bitcast to
// <8 x i8>, xor'ed, then bitcast back to <4 x i16> (autogenerated checks).
// CHECK-LABEL: @test_vadd_p16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[TMP3]]
//
poly16x4_t test_vadd_p16(poly16x4_t a, poly16x4_t b) {
  return vadd_p16 (a, b);
}

// 1x64-bit polynomial add: same lowering pattern — bitcast to <8 x i8>,
// xor, bitcast back to <1 x i64> (autogenerated checks).
// CHECK-LABEL: @test_vadd_p64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
// CHECK-NEXT: ret <1 x i64> [[TMP3]]
//
poly64x1_t test_vadd_p64(poly64x1_t a, poly64x1_t b) {
  return vadd_p64(a, b);
}

// Quad-register 16x8-bit polynomial add: a direct <16 x i8> xor, no
// bitcasts needed since the element type already matches (autogenerated
// checks).
// CHECK-LABEL: @test_vaddq_p8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[A:%.*]], [[B:%.*]]
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
poly8x16_t test_vaddq_p8(poly8x16_t a, poly8x16_t b){
  return vaddq_p8(a, b);
}

// Quad-register 8x16-bit polynomial add: bitcast both sides to <16 x i8>,
// xor, bitcast back to <8 x i16> (autogenerated checks).
// CHECK-LABEL: @test_vaddq_p16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[TMP3]]
//
poly16x8_t test_vaddq_p16(poly16x8_t a, poly16x8_t b){
  return vaddq_p16(a, b);
}

// Quad-register 2x64-bit polynomial add: bitcast both sides to <16 x i8>,
// xor, bitcast back to <2 x i64> (autogenerated checks).
// CHECK-LABEL: @test_vaddq_p64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[TMP3]]
//
poly64x2_t test_vaddq_p64(poly64x2_t a, poly64x2_t b){
  return vaddq_p64(a, b);
}

// 128-bit scalar polynomial add: the i128 operands are bitcast to
// <16 x i8>, xor'ed, and bitcast back to i128 (autogenerated checks).
// CHECK-LABEL: @test_vaddq_p128(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
// CHECK-NEXT: ret i128 [[TMP3]]
//
poly128_t test_vaddq_p128 (poly128_t a, poly128_t b){
  return vaddq_p128(a, b);
}

0 commit comments

Comments
 (0)