Skip to content

Commit deaf0a7

Browse files
committed
[AArch64] Improve bfcvtn2 and remove aarch64_neon_bfcvt intrinsics
This started out as an attempt to combine bf16 fpround into BFCVTN2 instructions, but ended up removing the aarch64.neon.bfcvt intrinsics in favour of generating fpround instructions directly. This simplifies the patterns and can lead to other optimizations. The BFCVTN2 instruction is adjusted to make sure its types are more valid, and a bfcvtn2 is now generated in more places. The old intrinsics are auto-upgraded to fptrunc instructions too.
1 parent 83c1d00 commit deaf0a7

File tree

10 files changed

+190
-165
lines changed

10 files changed

+190
-165
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -259,11 +259,6 @@ def OP_VCVT_F32_BF16_LO
259259
def OP_VCVT_F32_BF16_HI
260260
: Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>;
261261

262-
def OP_VCVT_BF16_F32_LO_A64
263-
: Op<(call "__a64_vcvtq_low_bf16", $p0)>;
264-
def OP_VCVT_BF16_F32_A64
265-
: Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>;
266-
267262
def OP_VCVT_BF16_F32_A32
268263
: Op<(call "__a32_vcvt_bf16", $p0)>;
269264

@@ -2061,10 +2056,9 @@ let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard =
20612056
}
20622057

20632058
let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
2064-
def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
2065-
def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
2059+
def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">;
20662060
def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
2067-
def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>;
2061+
def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">;
20682062

20692063
def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>;
20702064
def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>;

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7319,7 +7319,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
73197319
};
73207320

73217321
static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
7322-
NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
73237322
NEONMAP0(splat_lane_v),
73247323
NEONMAP0(splat_laneq_v),
73257324
NEONMAP0(splatq_lane_v),
@@ -7419,7 +7418,8 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
74197418
NEONMAP0(vcvtq_f16_s16),
74207419
NEONMAP0(vcvtq_f16_u16),
74217420
NEONMAP0(vcvtq_f32_v),
7422-
NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
7421+
NEONMAP0(vcvtq_high_bf16_f32),
7422+
NEONMAP0(vcvtq_low_bf16_f32),
74237423
NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
74247424
NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
74257425
NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
@@ -7628,7 +7628,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
76287628
NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
76297629
NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
76307630
NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
7631-
NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
7631+
NEONMAP0(vcvth_bf16_f32),
76327632
NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
76337633
NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
76347634
NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
@@ -12095,6 +12095,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1209512095
return ConstantInt::get(Builder.getInt32Ty(), 0);
1209612096
}
1209712097

12098+
if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
12099+
return Builder.CreateFPTrunc(
12100+
Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
12101+
Builder.getFloatTy()),
12102+
Builder.getBFloatTy());
12103+
1209812104
// Handle MSVC intrinsics before argument evaluation to prevent double
1209912105
// evaluation.
1210012106
if (std::optional<MSVCIntrin> MsvcIntId =
@@ -12820,6 +12826,35 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1282012826
return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
1282112827
"vgetq_lane");
1282212828
}
12829+
case NEON::BI__builtin_neon_vcvt_bf16_f32: {
12830+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12831+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12832+
return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
12833+
}
12834+
case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
12835+
SmallVector<int, 16> ConcatMask(8);
12836+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
12837+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12838+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12839+
llvm::Value *Trunc =
12840+
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
12841+
return Builder.CreateShuffleVector(
12842+
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
12843+
}
12844+
case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
12845+
SmallVector<int, 16> ConcatMask(8);
12846+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
12847+
SmallVector<int, 16> LoMask(4);
12848+
std::iota(LoMask.begin(), LoMask.end(), 0);
12849+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12850+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12851+
llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8);
12852+
llvm::Value *Inactive = Builder.CreateShuffleVector(
12853+
Builder.CreateBitCast(Ops[0], V8BF16), LoMask);
12854+
llvm::Value *Trunc =
12855+
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16);
12856+
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
12857+
}
1282312858

1282412859
case clang::AArch64::BI_InterlockedAdd:
1282512860
case clang::AArch64::BI_InterlockedAdd64: {

clang/test/CodeGen/arm-bf16-convert-intrinsics.c

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -223,10 +223,8 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
223223
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
224224
// CHECK-A64-NEXT: entry:
225225
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
226-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
227-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
228-
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
229-
// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
226+
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
227+
// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP1]]
230228
//
231229
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
232230
// CHECK-A32-HARDFP-NEXT: entry:
@@ -263,9 +261,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
263261
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
264262
// CHECK-A64-NEXT: entry:
265263
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
266-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
267-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
268-
// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
264+
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
265+
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
266+
// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]]
269267
//
270268
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
271269
// CHECK-A32-HARDFP-NEXT: entry:
@@ -323,9 +321,10 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
323321
// CHECK-A64-NEXT: entry:
324322
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
325323
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
326-
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
327-
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
328-
// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
324+
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
325+
// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
326+
// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
327+
// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
329328
//
330329
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
331330
// CHECK-A32-HARDFP-NEXT: entry:
@@ -404,8 +403,8 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
404403

405404
// CHECK-A64-LABEL: @test_vcvth_bf16_f32(
406405
// CHECK-A64-NEXT: entry:
407-
// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
408-
// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]]
406+
// CHECK-A64-NEXT: [[TMP0:%.*]] = fptrunc float [[A:%.*]] to bfloat
407+
// CHECK-A64-NEXT: ret bfloat [[TMP0]]
409408
//
410409
// CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
411410
// CHECK-A32-HARDFP-NEXT: entry:

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -538,17 +538,6 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
538538
def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
539539
def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;
540540

541-
542-
// v8.6-A Bfloat Intrinsics
543-
def int_aarch64_neon_bfcvt
544-
: DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
545-
def int_aarch64_neon_bfcvtn
546-
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
547-
def int_aarch64_neon_bfcvtn2
548-
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty],
549-
[llvm_v8bf16_ty, llvm_v4f32_ty],
550-
[IntrNoMem]>;
551-
552541
// v8.2-A FP16 Fused Multiply-Add Long
553542
def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
554543
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 61 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include "llvm/Support/Regex.h"
4646
#include "llvm/TargetParser/Triple.h"
4747
#include <cstring>
48+
#include <numeric>
4849

4950
using namespace llvm;
5051

@@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
828829
return true;
829830
}
830831
}
832+
833+
// Changed in 20.0: bfcvt/bfcvtn/bfcvtn2 have been replaced with fptrunc.
834+
if (Name.starts_with("bfcvt")) {
835+
NewFn = nullptr;
836+
return true;
837+
}
838+
831839
return false; // No other 'aarch64.neon.*'.
832840
}
833841
if (Name.consume_front("sve.")) {
@@ -4064,31 +4072,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
40644072

40654073
static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI,
40664074
Function *F, IRBuilder<> &Builder) {
4067-
Intrinsic::ID NewID =
4068-
StringSwitch<Intrinsic::ID>(Name)
4069-
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
4070-
.Case("sve.fcvtnt.bf16f32", Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
4071-
.Default(Intrinsic::not_intrinsic);
4072-
if (NewID == Intrinsic::not_intrinsic)
4073-
llvm_unreachable("Unhandled Intrinsic!");
4074-
4075-
SmallVector<Value *, 3> Args(CI->args());
4076-
4077-
// The original intrinsics incorrectly used a predicate based on the smallest
4078-
// element type rather than the largest.
4079-
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
4080-
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
4081-
4082-
if (Args[1]->getType() != BadPredTy)
4083-
llvm_unreachable("Unexpected predicate type!");
4084-
4085-
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
4086-
BadPredTy, Args[1]);
4087-
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
4088-
GoodPredTy, Args[1]);
4089-
4090-
return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
4091-
CI->getName());
4075+
if (Name.starts_with("neon.bfcvt")) {
4076+
if (Name.starts_with("neon.bfcvtn2")) {
4077+
SmallVector<int, 32> LoMask(4);
4078+
std::iota(LoMask.begin(), LoMask.end(), 0);
4079+
SmallVector<int, 32> ConcatMask(8);
4080+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
4081+
Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask);
4082+
Value *Trunc =
4083+
Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType());
4084+
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
4085+
} else if (Name.starts_with("neon.bfcvtn")) {
4086+
SmallVector<int, 32> ConcatMask(8);
4087+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
4088+
Type *V4BF16 =
4089+
FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4);
4090+
Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16);
4091+
dbgs() << "Trunc: " << *Trunc << "\n";
4092+
return Builder.CreateShuffleVector(
4093+
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
4094+
} else {
4095+
return Builder.CreateFPTrunc(CI->getOperand(0),
4096+
Type::getBFloatTy(F->getContext()));
4097+
}
4098+
} else if (Name.starts_with("sve.fcvt")) {
4099+
Intrinsic::ID NewID =
4100+
StringSwitch<Intrinsic::ID>(Name)
4101+
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
4102+
.Case("sve.fcvtnt.bf16f32",
4103+
Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
4104+
.Default(Intrinsic::not_intrinsic);
4105+
if (NewID == Intrinsic::not_intrinsic)
4106+
llvm_unreachable("Unhandled Intrinsic!");
4107+
4108+
SmallVector<Value *, 3> Args(CI->args());
4109+
4110+
// The original intrinsics incorrectly used a predicate based on the
4111+
// smallest element type rather than the largest.
4112+
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
4113+
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
4114+
4115+
if (Args[1]->getType() != BadPredTy)
4116+
llvm_unreachable("Unexpected predicate type!");
4117+
4118+
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
4119+
BadPredTy, Args[1]);
4120+
Args[1] = Builder.CreateIntrinsic(
4121+
Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]);
4122+
4123+
return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
4124+
CI->getName());
4125+
}
4126+
4127+
llvm_unreachable("Unhandled Intrinsic!");
40924128
}
40934129

40944130
static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9045,22 +9045,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
90459045

90469046
let mayRaiseFPException = 1, Uses = [FPCR] in
90479047
class SIMD_BFCVTN
9048-
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
9048+
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64,
90499049
"bfcvtn", ".4h", ".4s",
9050-
[(set (v8bf16 V128:$Rd),
9051-
(int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
9050+
[(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>;
90529051

90539052
let mayRaiseFPException = 1, Uses = [FPCR] in
90549053
class SIMD_BFCVTN2
90559054
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
9056-
"bfcvtn2", ".8h", ".4s",
9057-
[(set (v8bf16 V128:$dst),
9058-
(int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
9055+
"bfcvtn2", ".8h", ".4s", []>;
90599056

90609057
let mayRaiseFPException = 1, Uses = [FPCR] in
90619058
class BF16ToSinglePrecision<string asm>
90629059
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
9063-
[(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
9060+
[(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>,
90649061
Sched<[WriteFCvt]> {
90659062
bits<5> Rd;
90669063
bits<5> Rn;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1446,8 +1446,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
14461446
def BFCVTN : SIMD_BFCVTN;
14471447
def BFCVTN2 : SIMD_BFCVTN2;
14481448

1449-
def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))),
1450-
(EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>;
1449+
def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))),
1450+
(BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>;
14511451

14521452
// Vector-scalar BFDOT:
14531453
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
@@ -1469,8 +1469,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
14691469

14701470
let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
14711471
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
1472-
// Round FP32 to BF16.
1473-
def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
14741472
}
14751473

14761474
// ARMv8.6A AArch64 matrix multiplication
@@ -10415,9 +10413,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst>
1041510413
let Predicates = [HasBF16] in
1041610414
def : Pat<(InOp (v8bf16 V128:$Rn)),
1041710415
(v8bf16 (BFCVTN2
10418-
(v8bf16 (BFCVTN
10419-
(v4f32 (OutInst
10420-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
10416+
(INSERT_SUBREG (IMPLICIT_DEF),
10417+
(v4bf16 (BFCVTN
10418+
(v4f32 (OutInst
10419+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
10420+
dsub),
1042110421
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;
1042210422

1042310423
let Predicates = [HasNoBF16] in
@@ -10452,10 +10452,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst
1045210452
let Predicates = [HasBF16] in
1045310453
def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
1045410454
(v8bf16 (BFCVTN2
10455-
(v8bf16 (BFCVTN
10456-
(v4f32 (OutInst
10457-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
10458-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
10455+
(INSERT_SUBREG (IMPLICIT_DEF),
10456+
(v4bf16 (BFCVTN
10457+
(v4f32 (OutInst
10458+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
10459+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
10460+
dsub),
1045910461
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
1046010462
(v4f32 (SHLLv8i16 V128:$Rm))))))>;
1046110463

llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 -mattr=+neon -mattr=+bf16 | FileCheck %s
22

3+
; This test acts to test the old neon.bfcvt intrinsics, which are now
4+
; autoupgraded to fptrunc operations.
5+
36
declare bfloat @llvm.aarch64.neon.bfcvt(float)
47
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>)
58
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>)

0 commit comments

Comments
 (0)