Skip to content

Commit b08d2dd

Browse files
committed
[ARM,MVE] Add ACLE intrinsics for VCVT.F32.F16 family.
Summary:
These instructions make a vector of `<4 x float>` by widening every other lane of a vector of `<8 x half>`.

I wondered about representing these using standard IR, along the lines of a shufflevector to extract elements of the input into a `<4 x half>` followed by an `fpext` to turn that into `<4 x float>`. But it looks as if that would take a lot of work in isel lowering to make it match any pattern I could sensibly write in TableGen, and also I haven't been able to think of any other case where that pattern might be generated in IR, so there wouldn't be any extra code generation win from doing it that way.

Therefore, I've just used another target-specific intrinsic. We can always change it to the other way later if anyone thinks of a good reason.

(In order to put the intrinsic definition near similar things in `IntrinsicsARM.td`, I've also lifted the definition of the `MVEMXPredicated` multiclass higher up the file, without changing it.)

Reviewers: MarkMurrayARM, dmgreen, miyuki, ostannard

Reviewed By: miyuki

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D75254
1 parent 69441e5 commit b08d2dd

File tree

5 files changed

+152
-12
lines changed

5 files changed

+152
-12
lines changed

clang/include/clang/Basic/arm_mve.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,15 @@ foreach half = [ "b", "t" ] in {
453453
VecOf<f16>, (args VecOf<f16>:$inactive, Vector:$a, PredOf<f32>:$pred),
454454
(IRInt<"vcvt_narrow_predicated"> $inactive, $a, halfconst, $pred)>;
455455
} // params = [f32], pnt = PNT_None
456+
457+
let params = [f16], pnt = PNT_None in {
458+
def vcvt#half#q_f32: Intrinsic<VecOf<f32>, (args Vector:$a),
459+
(IRInt<"vcvt_widen"> $a, halfconst)>;
460+
defm vcvt#half#q: IntrinsicMX<
461+
VecOf<f32>, (args Vector:$a, PredOf<f32>:$pred),
462+
(IRInt<"vcvt_widen_predicated"> $inactive, $a, halfconst, $pred),
463+
1, "_f32">;
464+
} // params = [f16], pnt = PNT_None
456465
} // loop over half = "b", "t"
457466

458467
multiclass float_int_conversions<Type FScalar, Type IScalar, IRBuilderBase ftoi, IRBuilderBase itof> {

clang/test/CodeGen/arm-mve-intrinsics/vcvt.c

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -697,3 +697,71 @@ uint32x4_t test_vcvtq_x_n_u32_f32(float32x4_t a, mve_pred16_t p)
697697
{
698698
return vcvtq_x_n_u32_f32(a, 32, p);
699699
}
700+
701+
// CHECK-LABEL: @test_vcvtbq_f32_f16(
702+
// CHECK-NEXT: entry:
703+
// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> [[A:%.*]], i32 0)
704+
// CHECK-NEXT: ret <4 x float> [[TMP0]]
705+
//
706+
float32x4_t test_vcvtbq_f32_f16(float16x8_t a)
707+
{
708+
return vcvtbq_f32_f16(a);
709+
}
710+
711+
// CHECK-LABEL: @test_vcvttq_f32_f16(
712+
// CHECK-NEXT: entry:
713+
// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> [[A:%.*]], i32 1)
714+
// CHECK-NEXT: ret <4 x float> [[TMP0]]
715+
//
716+
float32x4_t test_vcvttq_f32_f16(float16x8_t a)
717+
{
718+
return vcvttq_f32_f16(a);
719+
}
720+
721+
// CHECK-LABEL: @test_vcvtbq_m_f32_f16(
722+
// CHECK-NEXT: entry:
723+
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
724+
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
725+
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> [[INACTIVE:%.*]], <8 x half> [[A:%.*]], i32 0, <4 x i1> [[TMP1]])
726+
// CHECK-NEXT: ret <4 x float> [[TMP2]]
727+
//
728+
float32x4_t test_vcvtbq_m_f32_f16(float32x4_t inactive, float16x8_t a, mve_pred16_t p)
729+
{
730+
return vcvtbq_m_f32_f16(inactive, a, p);
731+
}
732+
733+
// CHECK-LABEL: @test_vcvttq_m_f32_f16(
734+
// CHECK-NEXT: entry:
735+
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
736+
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
737+
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> [[INACTIVE:%.*]], <8 x half> [[A:%.*]], i32 1, <4 x i1> [[TMP1]])
738+
// CHECK-NEXT: ret <4 x float> [[TMP2]]
739+
//
740+
float32x4_t test_vcvttq_m_f32_f16(float32x4_t inactive, float16x8_t a, mve_pred16_t p)
741+
{
742+
return vcvttq_m_f32_f16(inactive, a, p);
743+
}
744+
745+
// CHECK-LABEL: @test_vcvtbq_x_f32_f16(
746+
// CHECK-NEXT: entry:
747+
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
748+
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
749+
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> undef, <8 x half> [[A:%.*]], i32 0, <4 x i1> [[TMP1]])
750+
// CHECK-NEXT: ret <4 x float> [[TMP2]]
751+
//
752+
float32x4_t test_vcvtbq_x_f32_f16(float16x8_t a, mve_pred16_t p)
753+
{
754+
return vcvtbq_x_f32_f16(a, p);
755+
}
756+
757+
// CHECK-LABEL: @test_vcvttq_x_f32_f16(
758+
// CHECK-NEXT: entry:
759+
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
760+
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
761+
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> undef, <8 x half> [[A:%.*]], i32 1, <4 x i1> [[TMP1]])
762+
// CHECK-NEXT: ret <4 x float> [[TMP2]]
763+
//
764+
float32x4_t test_vcvttq_x_f32_f16(float16x8_t a, mve_pred16_t p)
765+
{
766+
return vcvttq_x_f32_f16(a, p);
767+
}

llvm/include/llvm/IR/IntrinsicsARM.td

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -911,8 +911,22 @@ multiclass MVEPredicatedM<list<LLVMType> rets, list<LLVMType> params,
911911
LLVMMatchType<0>, rets[0])], props>;
912912
}
913913

914+
// Intrinsic with a predicated and a non-predicated case. The predicated case
915+
// has two additional parameters: inactive (the value for inactive lanes, can
916+
// be undef) and predicate.
917+
multiclass MVEMXPredicated<list<LLVMType> rets, list<LLVMType> flags,
918+
list<LLVMType> params, LLVMType inactive,
919+
LLVMType predicate,
920+
list<IntrinsicProperty> props = [IntrNoMem]> {
921+
def "": Intrinsic<rets, flags # params, props>;
922+
def _predicated: Intrinsic<rets, flags # [inactive] # params # [predicate],
923+
props>;
924+
}
925+
914926
defm int_arm_mve_vcvt_narrow: MVEPredicated<[llvm_v8f16_ty],
915927
[llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty>;
928+
defm int_arm_mve_vcvt_widen: MVEMXPredicated<[llvm_v4f32_ty], [],
929+
[llvm_v8f16_ty, llvm_i32_ty], llvm_v4f32_ty, llvm_v4i1_ty>;
916930

917931
defm int_arm_mve_vldr_gather_base: MVEPredicated<
918932
[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
@@ -1044,18 +1058,6 @@ def int_arm_mve_vmull_poly: Intrinsic<
10441058
[llvm_anyvector_ty],
10451059
[llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>;
10461060

1047-
// Intrinsic with a predicated and a non-predicated case. The predicated case
1048-
// has two additional parameters: inactive (the value for inactive lanes, can
1049-
// be undef) and predicate.
1050-
multiclass MVEMXPredicated<list<LLVMType> rets, list<LLVMType> flags,
1051-
list<LLVMType> params, LLVMType inactive,
1052-
LLVMType predicate,
1053-
list<IntrinsicProperty> props = [IntrNoMem]> {
1054-
def "": Intrinsic<rets, flags # params, props>;
1055-
def _predicated: Intrinsic<rets, flags # [inactive] # params # [predicate],
1056-
props>;
1057-
}
1058-
10591061
// The first two parameters are compile-time constants:
10601062
// * Halving: 0 means halving (vhcaddq), 1 means non-halving (vcaddq)
10611063
// instruction. Note: the flag is inverted to match the corresponding

llvm/lib/Target/ARM/ARMInstrMVE.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4515,6 +4515,17 @@ multiclass MVE_VCVT_f2h_m<string iname, int half> {
45154515

45164516
multiclass MVE_VCVT_h2f_m<string iname, int half> {
45174517
def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half, (ins), vpred_r, "">;
4518+
defvar Inst = !cast<Instruction>(NAME);
4519+
4520+
let Predicates = [HasMVEFloat] in {
4521+
def : Pat<(v4f32 (int_arm_mve_vcvt_widen (v8f16 MQPR:$Qm), (i32 half))),
4522+
(v4f32 (Inst (v8f16 MQPR:$Qm)))>;
4523+
def : Pat<(v4f32 (int_arm_mve_vcvt_widen_predicated
4524+
(v4f32 MQPR:$inactive), (v8f16 MQPR:$Qm), (i32 half),
4525+
(v4i1 VCCR:$mask))),
4526+
(v4f32 (Inst (v8f16 MQPR:$Qm), ARMVCCThen,
4527+
(v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>;
4528+
}
45184529
}
45194530

45204531
defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>;

llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
66

77
declare <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half>, <4 x float>, i32)
88
declare <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half>, <4 x float>, i32, <4 x i1>)
9+
declare <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half>, i32)
10+
declare <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float>, <8 x half>, i32, <4 x i1>)
911

1012
declare <8 x half> @llvm.arm.mve.vcvt.fix.v8f16.v8i16(i32, <8 x i16>, i32)
1113
declare <4 x float> @llvm.arm.mve.vcvt.fix.v4f32.v4i32(i32, <4 x i32>, i32)
@@ -367,3 +369,51 @@ entry:
367369
%2 = call <4 x i32> @llvm.arm.mve.vcvt.fix.predicated.v4i32.v4f32.v4i1(i32 1, <4 x i32> undef, <4 x float> %a, i32 32, <4 x i1> %1)
368370
ret <4 x i32> %2
369371
}
372+
373+
define arm_aapcs_vfpcc <4 x float> @test_vcvtbq_f32_f16(<8 x half> %a) {
374+
; CHECK-LABEL: test_vcvtbq_f32_f16:
375+
; CHECK: @ %bb.0: @ %entry
376+
; CHECK-NEXT: vcvtb.f32.f16 q0, q0
377+
; CHECK-NEXT: bx lr
378+
entry:
379+
%0 = tail call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> %a, i32 0)
380+
ret <4 x float> %0
381+
}
382+
383+
define arm_aapcs_vfpcc <4 x float> @test_vcvttq_f32_f16(<8 x half> %a) {
384+
; CHECK-LABEL: test_vcvttq_f32_f16:
385+
; CHECK: @ %bb.0: @ %entry
386+
; CHECK-NEXT: vcvtt.f32.f16 q0, q0
387+
; CHECK-NEXT: bx lr
388+
entry:
389+
%0 = tail call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> %a, i32 1)
390+
ret <4 x float> %0
391+
}
392+
393+
define arm_aapcs_vfpcc <4 x float> @test_vcvtbq_m_f32_f16(<4 x float> %inactive, <8 x half> %a, i16 zeroext %p) {
394+
; CHECK-LABEL: test_vcvtbq_m_f32_f16:
395+
; CHECK: @ %bb.0: @ %entry
396+
; CHECK-NEXT: vmsr p0, r0
397+
; CHECK-NEXT: vpst
398+
; CHECK-NEXT: vcvtbt.f32.f16 q0, q1
399+
; CHECK-NEXT: bx lr
400+
entry:
401+
%0 = zext i16 %p to i32
402+
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
403+
%2 = tail call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> %inactive, <8 x half> %a, i32 0, <4 x i1> %1)
404+
ret <4 x float> %2
405+
}
406+
407+
define arm_aapcs_vfpcc <4 x float> @test_vcvttq_m_f32_f16(<4 x float> %inactive, <8 x half> %a, i16 zeroext %p) {
408+
; CHECK-LABEL: test_vcvttq_m_f32_f16:
409+
; CHECK: @ %bb.0: @ %entry
410+
; CHECK-NEXT: vmsr p0, r0
411+
; CHECK-NEXT: vpst
412+
; CHECK-NEXT: vcvttt.f32.f16 q0, q1
413+
; CHECK-NEXT: bx lr
414+
entry:
415+
%0 = zext i16 %p to i32
416+
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
417+
%2 = tail call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> %inactive, <8 x half> %a, i32 1, <4 x i1> %1)
418+
ret <4 x float> %2
419+
}

0 commit comments

Comments
 (0)