Skip to content

Commit ff5b628

Browse files
committed
[AArch64] Improve bfcvtn2 and remove aarch64_neon_bfcvt intrinsics
This started out as an attempt to combine bf16 fpround into BFCVTN2 instructions, but ended up removing the aarch64.neon.bfcvt intrinsics in favour of generating fpround instructions directly. This simplifies the patterns and can lead to other optimizations. The BFCVTN2 instruction is adjusted to make sure the types are more valid, and a bfcvtn2 is now generated in more places. The old intrinsics are auto-upgraded to fptrunc instructions too.
1 parent edc0235 commit ff5b628

File tree

10 files changed

+190
-165
lines changed

10 files changed

+190
-165
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -259,11 +259,6 @@ def OP_VCVT_F32_BF16_LO
259259
def OP_VCVT_F32_BF16_HI
260260
: Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>;
261261

262-
def OP_VCVT_BF16_F32_LO_A64
263-
: Op<(call "__a64_vcvtq_low_bf16", $p0)>;
264-
def OP_VCVT_BF16_F32_A64
265-
: Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>;
266-
267262
def OP_VCVT_BF16_F32_A32
268263
: Op<(call "__a32_vcvt_bf16", $p0)>;
269264

@@ -2061,10 +2056,9 @@ let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard =
20612056
}
20622057

20632058
let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
2064-
def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
2065-
def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
2059+
def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">;
20662060
def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
2067-
def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>;
2061+
def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">;
20682062

20692063
def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>;
20702064
def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>;

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7307,7 +7307,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
73077307
};
73087308

73097309
static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
7310-
NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
73117310
NEONMAP0(splat_lane_v),
73127311
NEONMAP0(splat_laneq_v),
73137312
NEONMAP0(splatq_lane_v),
@@ -7407,7 +7406,8 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
74077406
NEONMAP0(vcvtq_f16_s16),
74087407
NEONMAP0(vcvtq_f16_u16),
74097408
NEONMAP0(vcvtq_f32_v),
7410-
NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
7409+
NEONMAP0(vcvtq_high_bf16_f32),
7410+
NEONMAP0(vcvtq_low_bf16_f32),
74117411
NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
74127412
NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
74137413
NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
@@ -7616,7 +7616,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
76167616
NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
76177617
NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
76187618
NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
7619-
NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
7619+
NEONMAP0(vcvth_bf16_f32),
76207620
NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
76217621
NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
76227622
NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
@@ -12083,6 +12083,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1208312083
return ConstantInt::get(Builder.getInt32Ty(), 0);
1208412084
}
1208512085

12086+
if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
12087+
return Builder.CreateFPTrunc(
12088+
Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
12089+
Builder.getFloatTy()),
12090+
Builder.getBFloatTy());
12091+
1208612092
// Handle MSVC intrinsics before argument evaluation to prevent double
1208712093
// evaluation.
1208812094
if (std::optional<MSVCIntrin> MsvcIntId =
@@ -12808,6 +12814,35 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1280812814
return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
1280912815
"vgetq_lane");
1281012816
}
12817+
case NEON::BI__builtin_neon_vcvt_bf16_f32: {
12818+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12819+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12820+
return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
12821+
}
12822+
case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
12823+
SmallVector<int, 16> ConcatMask(8);
12824+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
12825+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12826+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12827+
llvm::Value *Trunc =
12828+
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
12829+
return Builder.CreateShuffleVector(
12830+
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
12831+
}
12832+
case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
12833+
SmallVector<int, 16> ConcatMask(8);
12834+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
12835+
SmallVector<int, 16> LoMask(4);
12836+
std::iota(LoMask.begin(), LoMask.end(), 0);
12837+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12838+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12839+
llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8);
12840+
llvm::Value *Inactive = Builder.CreateShuffleVector(
12841+
Builder.CreateBitCast(Ops[0], V8BF16), LoMask);
12842+
llvm::Value *Trunc =
12843+
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16);
12844+
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
12845+
}
1281112846

1281212847
case clang::AArch64::BI_InterlockedAdd:
1281312848
case clang::AArch64::BI_InterlockedAdd64: {

clang/test/CodeGen/arm-bf16-convert-intrinsics.c

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -223,10 +223,8 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
223223
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
224224
// CHECK-A64-NEXT: entry:
225225
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
226-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
227-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
228-
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
229-
// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
226+
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
227+
// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP1]]
230228
//
231229
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
232230
// CHECK-A32-HARDFP-NEXT: entry:
@@ -263,9 +261,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
263261
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
264262
// CHECK-A64-NEXT: entry:
265263
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
266-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
267-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
268-
// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
264+
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
265+
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
266+
// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]]
269267
//
270268
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
271269
// CHECK-A32-HARDFP-NEXT: entry:
@@ -323,9 +321,10 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
323321
// CHECK-A64-NEXT: entry:
324322
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
325323
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
326-
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
327-
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
328-
// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
324+
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
325+
// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
326+
// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
327+
// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
329328
//
330329
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
331330
// CHECK-A32-HARDFP-NEXT: entry:
@@ -404,8 +403,8 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
404403

405404
// CHECK-A64-LABEL: @test_vcvth_bf16_f32(
406405
// CHECK-A64-NEXT: entry:
407-
// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
408-
// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]]
406+
// CHECK-A64-NEXT: [[TMP0:%.*]] = fptrunc float [[A:%.*]] to bfloat
407+
// CHECK-A64-NEXT: ret bfloat [[TMP0]]
409408
//
410409
// CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
411410
// CHECK-A32-HARDFP-NEXT: entry:

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -538,17 +538,6 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
538538
def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
539539
def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;
540540

541-
542-
// v8.6-A Bfloat Intrinsics
543-
def int_aarch64_neon_bfcvt
544-
: DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
545-
def int_aarch64_neon_bfcvtn
546-
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
547-
def int_aarch64_neon_bfcvtn2
548-
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty],
549-
[llvm_v8bf16_ty, llvm_v4f32_ty],
550-
[IntrNoMem]>;
551-
552541
// v8.2-A FP16 Fused Multiply-Add Long
553542
def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
554543
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 61 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include "llvm/Support/Regex.h"
4646
#include "llvm/TargetParser/Triple.h"
4747
#include <cstring>
48+
#include <numeric>
4849

4950
using namespace llvm;
5051

@@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
828829
return true;
829830
}
830831
}
832+
833+
// Changed in 20.0: bfcvt/bfcvtn/bfcvtn2 have been replaced with fptrunc.
834+
if (Name.starts_with("bfcvt")) {
835+
NewFn = nullptr;
836+
return true;
837+
}
838+
831839
return false; // No other 'aarch64.neon.*'.
832840
}
833841
if (Name.consume_front("sve.")) {
@@ -4064,31 +4072,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
40644072

40654073
static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI,
40664074
Function *F, IRBuilder<> &Builder) {
4067-
Intrinsic::ID NewID =
4068-
StringSwitch<Intrinsic::ID>(Name)
4069-
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
4070-
.Case("sve.fcvtnt.bf16f32", Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
4071-
.Default(Intrinsic::not_intrinsic);
4072-
if (NewID == Intrinsic::not_intrinsic)
4073-
llvm_unreachable("Unhandled Intrinsic!");
4074-
4075-
SmallVector<Value *, 3> Args(CI->args());
4076-
4077-
// The original intrinsics incorrectly used a predicate based on the smallest
4078-
// element type rather than the largest.
4079-
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
4080-
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
4081-
4082-
if (Args[1]->getType() != BadPredTy)
4083-
llvm_unreachable("Unexpected predicate type!");
4084-
4085-
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
4086-
BadPredTy, Args[1]);
4087-
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
4088-
GoodPredTy, Args[1]);
4089-
4090-
return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
4091-
CI->getName());
4075+
if (Name.starts_with("neon.bfcvt")) {
4076+
if (Name.starts_with("neon.bfcvtn2")) {
4077+
SmallVector<int, 32> LoMask(4);
4078+
std::iota(LoMask.begin(), LoMask.end(), 0);
4079+
SmallVector<int, 32> ConcatMask(8);
4080+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
4081+
Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask);
4082+
Value *Trunc =
4083+
Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType());
4084+
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
4085+
} else if (Name.starts_with("neon.bfcvtn")) {
4086+
SmallVector<int, 32> ConcatMask(8);
4087+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
4088+
Type *V4BF16 =
4089+
FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4);
4090+
Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16);
4091+
dbgs() << "Trunc: " << *Trunc << "\n";
4092+
return Builder.CreateShuffleVector(
4093+
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
4094+
} else {
4095+
return Builder.CreateFPTrunc(CI->getOperand(0),
4096+
Type::getBFloatTy(F->getContext()));
4097+
}
4098+
} else if (Name.starts_with("sve.fcvt")) {
4099+
Intrinsic::ID NewID =
4100+
StringSwitch<Intrinsic::ID>(Name)
4101+
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
4102+
.Case("sve.fcvtnt.bf16f32",
4103+
Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
4104+
.Default(Intrinsic::not_intrinsic);
4105+
if (NewID == Intrinsic::not_intrinsic)
4106+
llvm_unreachable("Unhandled Intrinsic!");
4107+
4108+
SmallVector<Value *, 3> Args(CI->args());
4109+
4110+
// The original intrinsics incorrectly used a predicate based on the
4111+
// smallest element type rather than the largest.
4112+
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
4113+
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
4114+
4115+
if (Args[1]->getType() != BadPredTy)
4116+
llvm_unreachable("Unexpected predicate type!");
4117+
4118+
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
4119+
BadPredTy, Args[1]);
4120+
Args[1] = Builder.CreateIntrinsic(
4121+
Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]);
4122+
4123+
return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
4124+
CI->getName());
4125+
}
4126+
4127+
llvm_unreachable("Unhandled Intrinsic!");
40924128
}
40934129

40944130
static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9053,22 +9053,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
90539053

90549054
let mayRaiseFPException = 1, Uses = [FPCR] in
90559055
class SIMD_BFCVTN
9056-
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
9056+
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64,
90579057
"bfcvtn", ".4h", ".4s",
9058-
[(set (v8bf16 V128:$Rd),
9059-
(int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
9058+
[(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>;
90609059

90619060
let mayRaiseFPException = 1, Uses = [FPCR] in
90629061
class SIMD_BFCVTN2
90639062
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
9064-
"bfcvtn2", ".8h", ".4s",
9065-
[(set (v8bf16 V128:$dst),
9066-
(int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
9063+
"bfcvtn2", ".8h", ".4s", []>;
90679064

90689065
let mayRaiseFPException = 1, Uses = [FPCR] in
90699066
class BF16ToSinglePrecision<string asm>
90709067
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
9071-
[(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
9068+
[(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>,
90729069
Sched<[WriteFCvt]> {
90739070
bits<5> Rd;
90749071
bits<5> Rn;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,8 +1454,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
14541454
def BFCVTN : SIMD_BFCVTN;
14551455
def BFCVTN2 : SIMD_BFCVTN2;
14561456

1457-
def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))),
1458-
(EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>;
1457+
def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))),
1458+
(BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>;
14591459

14601460
// Vector-scalar BFDOT:
14611461
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
@@ -1477,8 +1477,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
14771477

14781478
let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
14791479
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
1480-
// Round FP32 to BF16.
1481-
def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
14821480
}
14831481

14841482
// ARMv8.6A AArch64 matrix multiplication
@@ -10410,9 +10408,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst>
1041010408
let Predicates = [HasBF16] in
1041110409
def : Pat<(InOp (v8bf16 V128:$Rn)),
1041210410
(v8bf16 (BFCVTN2
10413-
(v8bf16 (BFCVTN
10414-
(v4f32 (OutInst
10415-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
10411+
(INSERT_SUBREG (IMPLICIT_DEF),
10412+
(v4bf16 (BFCVTN
10413+
(v4f32 (OutInst
10414+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
10415+
dsub),
1041610416
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;
1041710417

1041810418
let Predicates = [HasNoBF16] in
@@ -10447,10 +10447,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst
1044710447
let Predicates = [HasBF16] in
1044810448
def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
1044910449
(v8bf16 (BFCVTN2
10450-
(v8bf16 (BFCVTN
10451-
(v4f32 (OutInst
10452-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
10453-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
10450+
(INSERT_SUBREG (IMPLICIT_DEF),
10451+
(v4bf16 (BFCVTN
10452+
(v4f32 (OutInst
10453+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
10454+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
10455+
dsub),
1045410456
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
1045510457
(v4f32 (SHLLv8i16 V128:$Rm))))))>;
1045610458

llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 -mattr=+neon -mattr=+bf16 | FileCheck %s
22

3+
; This test acts to test the old neon.bfcvt intrinsics, which are now
4+
; autoupgraded to fptrunc operations.
5+
36
declare bfloat @llvm.aarch64.neon.bfcvt(float)
47
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>)
58
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>)

0 commit comments

Comments
 (0)