Skip to content

Commit c511cc0

Browse files
[AArch64] Implement NEON vscale intrinsics (llvm#100347)
This patch implements following intrinsics: ``` float16x4_t vscale_f16(float16x4_t vn, int16x4_t vm) float16x8_t vscaleq_f16(float16x8_t vn, int16x8_t vm) float32x2_t vscale_f32(float32x2_t vn, int32x2_t vm) float32x4_t vscaleq_f32(float32x4_t vn, int32x4_t vm) float64x2_t vscaleq_f64(float64x2_t vn, int64x2_t vm) ``` as defined in ARM-software/acle#323 Co-authored-by: Hassnaa Hamdi <[email protected]>
1 parent 9c48a04 commit c511cc0

File tree

7 files changed

+154
-1
lines changed

7 files changed

+154
-1
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2126,3 +2126,9 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
21262126
def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
21272127
def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
21282128
}
2129+
2130+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
2131+
// fscale
2132+
def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">;
2133+
def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">;
2134+
}

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13573,6 +13573,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1357313573
Int = Intrinsic::aarch64_neon_famax;
1357413574
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "famax");
1357513575
}
13576+
case NEON::BI__builtin_neon_vscale_f16:
13577+
case NEON::BI__builtin_neon_vscaleq_f16:
13578+
case NEON::BI__builtin_neon_vscale_f32:
13579+
case NEON::BI__builtin_neon_vscaleq_f32:
13580+
case NEON::BI__builtin_neon_vscaleq_f64: {
13581+
Int = Intrinsic::aarch64_neon_fp8_fscale;
13582+
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fscale");
13583+
}
1357613584
}
1357713585
}
1357813586

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
2+
#include <arm_neon.h>
3+
4+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -O3 -emit-llvm -o - %s | FileCheck %s
5+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -S -O3 -o /dev/null %s
6+
7+
// CHECK-LABEL: define dso_local <4 x half> @test_vscale_f16(
8+
// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
9+
// CHECK-NEXT: entry:
10+
// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> [[VN]], <4 x i16> [[VM]])
11+
// CHECK-NEXT: ret <4 x half> [[FSCALE2_I]]
12+
//
13+
float16x4_t test_vscale_f16(float16x4_t vn, int16x4_t vm) {
14+
return vscale_f16(vn, vm);
15+
}
16+
17+
// CHECK-LABEL: define dso_local <8 x half> @test_vscaleq_f16(
18+
// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
19+
// CHECK-NEXT: entry:
20+
// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> [[VN]], <8 x i16> [[VM]])
21+
// CHECK-NEXT: ret <8 x half> [[FSCALE2_I]]
22+
//
23+
float16x8_t test_vscaleq_f16(float16x8_t vn, int16x8_t vm) {
24+
return vscaleq_f16(vn, vm);
25+
26+
}
27+
28+
// CHECK-LABEL: define dso_local <2 x float> @test_vscale_f32(
29+
// CHECK-SAME: <2 x float> noundef [[VN:%.*]], <2 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
30+
// CHECK-NEXT: entry:
31+
// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> [[VN]], <2 x i32> [[VM]])
32+
// CHECK-NEXT: ret <2 x float> [[FSCALE2_I]]
33+
//
34+
float32x2_t test_vscale_f32(float32x2_t vn, int32x2_t vm) {
35+
return vscale_f32(vn, vm);
36+
37+
}
38+
39+
// CHECK-LABEL: define dso_local <4 x float> @test_vscaleq_f32(
40+
// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
41+
// CHECK-NEXT: entry:
42+
// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> [[VN]], <4 x i32> [[VM]])
43+
// CHECK-NEXT: ret <4 x float> [[FSCALE2_I]]
44+
//
45+
float32x4_t test_vscaleq_f32(float32x4_t vn, int32x4_t vm) {
46+
return vscaleq_f32(vn, vm);
47+
48+
}
49+
50+
// CHECK-LABEL: define dso_local <2 x double> @test_vscale_f64(
51+
// CHECK-SAME: <2 x double> noundef [[VN:%.*]], <2 x i64> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
52+
// CHECK-NEXT: entry:
53+
// CHECK-NEXT: [[FSCALE2_I:%.*]] = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> [[VN]], <2 x i64> [[VM]])
54+
// CHECK-NEXT: ret <2 x double> [[FSCALE2_I]]
55+
//
56+
float64x2_t test_vscale_f64(float64x2_t vn, int64x2_t vm) {
57+
return vscaleq_f64(vn, vm);
58+
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,13 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
563563
def int_aarch64_neon_vcmla_rot90 : AdvSIMD_3VectorArg_Intrinsic;
564564
def int_aarch64_neon_vcmla_rot180 : AdvSIMD_3VectorArg_Intrinsic;
565565
def int_aarch64_neon_vcmla_rot270 : AdvSIMD_3VectorArg_Intrinsic;
566+
567+
// FP8 fscale
568+
def int_aarch64_neon_fp8_fscale : DefaultAttrsIntrinsic<
569+
[llvm_anyvector_ty],
570+
[LLVMMatchType<0>,
571+
LLVMVectorOfBitcastsToInt<0>],
572+
[IntrNoMem]>;
566573
}
567574

568575
let TargetPrefix = "aarch64" in {

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6243,6 +6243,26 @@ multiclass SIMDThreeSameVectorDOT4<string asm> {
62436243
V128, v4f32, v16i8, null_frag>;
62446244
}
62456245

6246+
let mayRaiseFPException = 1, Uses = [FPCR] in
6247+
multiclass SIMDThreeVectorFscale<bit U, bit S, bits<3> opc,
6248+
string asm, SDPatternOperator OpNode> {
6249+
def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
6250+
asm, ".4h",
6251+
[(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4i16 V64:$Rm)))]>;
6252+
def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
6253+
asm, ".8h",
6254+
[(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8i16 V128:$Rm)))]>;
6255+
def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
6256+
asm, ".2s",
6257+
[(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2i32 V64:$Rm)))]>;
6258+
def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
6259+
asm, ".4s",
6260+
[(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4i32 V128:$Rm)))]>;
6261+
def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
6262+
asm, ".2d",
6263+
[(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2i64 V128:$Rm)))]>;
6264+
}
6265+
62466266
//----------------------------------------------------------------------------
62476267
// AdvSIMD two register vector instructions.
62486268
//----------------------------------------------------------------------------

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10136,7 +10136,7 @@ let Uses = [FPMR, FPCR], Predicates = [HasFP8] in {
1013610136
defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
1013710137
defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
1013810138
defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
10139-
defm FSCALE : SIMDThreeSameVectorFP<0b1, 0b1, 0b111, "fscale", null_frag>;
10139+
defm FSCALE : SIMDThreeVectorFscale<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
1014010140
} // End let Predicates = [HasFP8]
1014110141

1014210142
// fminimum(abs(a), abs(b)) -> famin(a, b)
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8 < %s | FileCheck %s
3+
4+
5+
define <4 x half> @test_fscale_f16(<4 x half> %vn, <4 x i16> %vm) {
6+
; CHECK-LABEL: test_fscale_f16:
7+
; CHECK: // %bb.0:
8+
; CHECK-NEXT: fscale v0.4h, v0.4h, v1.4h
9+
; CHECK-NEXT: ret
10+
%res = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> %vn, <4 x i16> %vm)
11+
ret <4 x half> %res
12+
}
13+
14+
define <8 x half> @test_fscaleq_f16(<8 x half> %vn, <8 x i16> %vm) {
15+
; CHECK-LABEL: test_fscaleq_f16:
16+
; CHECK: // %bb.0:
17+
; CHECK-NEXT: fscale v0.8h, v0.8h, v1.8h
18+
; CHECK-NEXT: ret
19+
%res = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> %vn, <8 x i16> %vm)
20+
ret <8 x half> %res
21+
}
22+
23+
define <2 x float> @test_fscale_f32(<2 x float> %vn, <2 x i32> %vm) {
24+
; CHECK-LABEL: test_fscale_f32:
25+
; CHECK: // %bb.0:
26+
; CHECK-NEXT: fscale v0.2s, v0.2s, v1.2s
27+
; CHECK-NEXT: ret
28+
%res = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> %vn, <2 x i32> %vm)
29+
ret <2 x float> %res
30+
}
31+
32+
define <4 x float> @test_fscaleq_f32(<4 x float> %vn, <4 x i32> %vm) {
33+
; CHECK-LABEL: test_fscaleq_f32:
34+
; CHECK: // %bb.0:
35+
; CHECK-NEXT: fscale v0.4s, v0.4s, v1.4s
36+
; CHECK-NEXT: ret
37+
%res = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> %vn, <4 x i32> %vm)
38+
ret <4 x float> %res
39+
}
40+
41+
define <2 x double> @test_fscaleq_f64(<2 x double> %vn, <2 x i64> %vm) {
42+
; CHECK-LABEL: test_fscaleq_f64:
43+
; CHECK: // %bb.0:
44+
; CHECK-NEXT: fscale v0.2d, v0.2d, v1.2d
45+
; CHECK-NEXT: ret
46+
%res = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> %vn, <2 x i64> %vm)
47+
ret <2 x double> %res
48+
}
49+
50+
declare <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half>, <4 x i16>)
51+
declare <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half>, <8 x i16>)
52+
declare <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float>, <2 x i32>)
53+
declare <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float>, <4 x i32>)
54+
declare <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double>, <2 x i64>)

0 commit comments

Comments
 (0)