
[AArch64][GlobalISel] Lower scalarizing G_UNMERGE_VALUES to G_EXTRACT_VECTOR_ELT #75662

Closed
llvm/lib/Target/AArch64/AArch64Combine.td (10 additions, 1 deletion)
@@ -134,6 +134,14 @@ def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn,
form_duplane,
shuf_to_ins]>;

// Turn G_UNMERGE_VALUES -> G_EXTRACT_VECTOR_ELT's
def vector_unmerge_lowering : GICombineRule <
(defs root:$root),
(match (wip_match_opcode G_UNMERGE_VALUES):$root,
[{ return matchScalarizeVectorUnmerge(*${root}, MRI); }]),
(apply [{ applyScalarizeVectorUnmerge(*${root}, MRI, B); }])
>;

def adjust_icmp_imm_matchdata :
GIDefMatchData<"std::pair<uint64_t, CmpInst::Predicate>">;
def adjust_icmp_imm : GICombineRule <
@@ -251,7 +259,8 @@ def AArch64PostLegalizerLowering
icmp_lowering, build_vector_lowering,
lower_vector_fcmp, form_truncstore,
vector_sext_inreg_to_shift,
unmerge_ext_to_unmerge, lower_mull]> {
unmerge_ext_to_unmerge, lower_mull,
vector_unmerge_lowering]> {
}

// Post-legalization combines which are primarily optimizations.
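
As a sketch of what the new rule does (illustrative MIR, not taken from the patch): a G_UNMERGE_VALUES that splits a fixed-length vector into one scalar per lane is rewritten as one G_EXTRACT_VECTOR_ELT per lane, with the lane indices materialized as i64 G_CONSTANTs, e.g. for <2 x s32>:

; before: fully scalarizing unmerge of a fixed-length vector
%lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %v(<2 x s32>)

; after: one extract per lane; later combines and instruction selection can
; then pick lane-indexed moves, or plain subregister copies for lane 0
%c0:_(s64) = G_CONSTANT i64 0
%c1:_(s64) = G_CONSTANT i64 1
%lo:_(s32) = G_EXTRACT_VECTOR_ELT %v(<2 x s32>), %c0(s64)
%hi:_(s32) = G_EXTRACT_VECTOR_ELT %v(<2 x s32>), %c1(s64)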
llvm/lib/Target/AArch64/AArch64InstrInfo.td (6 additions, 6 deletions)
@@ -6480,23 +6480,23 @@ def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))),
// f32 bitcast(vector_extract(v4i32 src, lane)) -> EXTRACT_SUBREG(INSvi32lane(-, 0, src, lane))
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))),
(EXTRACT_SUBREG (INSvi32lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), ssub)>;
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, 0)))),
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, (i64 0))))),
(EXTRACT_SUBREG V128:$src, ssub)>;
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, imm:$Immd)))),
(EXTRACT_SUBREG (INSvi64lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), dsub)>;
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, 0)))),
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, (i64 0))))),
(EXTRACT_SUBREG V128:$src, dsub)>;

// Floating point vector extractions are codegen'd as either a sequence of
// subregister extractions, or a MOV (aka DUP here) if
// the lane number is anything other than zero.
def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
def : Pat<(f64 (vector_extract (v2f64 V128:$Rn), (i64 0))),
(f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
def : Pat<(f32 (vector_extract (v4f32 V128:$Rn), (i64 0))),
(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
def : Pat<(f16 (vector_extract (v8f16 V128:$Rn), (i64 0))),
(f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
def : Pat<(vector_extract (v8bf16 V128:$Rn), 0),
def : Pat<(bf16 (vector_extract (v8bf16 V128:$Rn), (i64 0))),
(bf16 (EXTRACT_SUBREG V128:$Rn, hsub))>;


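
The lane-zero patterns matter here because extracting lane 0 needs no instruction: the s/d/h subregister of a NEON register already holds that lane. The explicit result types and (i64 0) index types added above are, presumably, what lets these patterns keep matching the typed G_EXTRACT_VECTOR_ELTs the new combine produces, since the GlobalISel pattern importer is stricter about untyped pattern operands than SelectionDAG. A minimal IR example (illustrative, not from the patch) that exercises the updated f32 pattern:

define float @extract_lane0_bitcast(<4 x i32> %v) {
  %e = extractelement <4 x i32> %v, i64 0
  %f = bitcast i32 %e to float
  ; lane 0 of q0 is already visible as s0, so this should lower to a plain ret
  ret float %f
}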
llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp (22 additions, 0 deletions)
@@ -769,6 +769,28 @@ void applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
MI.eraseFromParent();
}

// Match a G_UNMERGE_VALUES that breaks a fixed-length vector into one scalar
// per lane, so it can be lowered to per-lane G_EXTRACT_VECTOR_ELTs.
bool matchScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI) {
assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
GUnmerge *Unmerge = cast<GUnmerge>(&MI);
Register Src1Reg = Unmerge->getSourceReg();
const LLT SrcTy = MRI.getType(Src1Reg);
return SrcTy.isVector() && !SrcTy.isScalable() &&
MI.getNumOperands() == (unsigned)SrcTy.getNumElements() + 1;
}

void applyScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) {
assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
Register Src1Reg = MI.getOperand(MI.getNumOperands() - 1).getReg();
const LLT SrcTy = MRI.getType(Src1Reg);
assert((SrcTy.isVector() && !SrcTy.isScalable()) &&
"Expected a fixed length vector");

// Emit one G_EXTRACT_VECTOR_ELT (with a constant lane index) per lane.
for (unsigned I = 0; I < SrcTy.getNumElements(); ++I)
  B.buildExtractVectorElementConstant(MI.getOperand(I).getReg(), Src1Reg, I);
MI.eraseFromParent();
}

bool matchBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI) {
assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
auto Splat = getAArch64VectorSplat(MI, MRI);
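
Note that the match only fires when the unmerge is fully scalarizing, i.e. it produces exactly one scalar def per lane of a fixed-length source vector (NumOperands == NumElements + 1). Two illustrative MIR cases (not from the patch):

; fires: four scalar defs for a four-element fixed-length vector
%a:_(s16), %b:_(s16), %c:_(s16), %d:_(s16) = G_UNMERGE_VALUES %v(<4 x s16>)

; does not fire: two sub-vector defs for a four-element vector
%lo:_(<2 x s16>), %hi:_(<2 x s16>) = G_UNMERGE_VALUES %v(<4 x s16>)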
llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll (76 additions, 100 deletions)
@@ -516,20 +516,17 @@ define i8 @sminv_v4i8(<4 x i8> %a) {
; CHECK-GI-LABEL: sminv_v4i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov h2, v0.h[2]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: mov h3, v0.h[3]
; CHECK-GI-NEXT: umov w8, v0.h[0]
; CHECK-GI-NEXT: umov w9, v0.h[1]
; CHECK-GI-NEXT: umov w10, v0.h[2]
; CHECK-GI-NEXT: umov w12, v0.h[3]
; CHECK-GI-NEXT: sxtb w11, w8
; CHECK-GI-NEXT: cmp w11, w9, sxtb
; CHECK-GI-NEXT: sxtb w11, w10
; CHECK-GI-NEXT: csel w8, w8, w9, lt
; CHECK-GI-NEXT: cmp w11, w12, sxtb
; CHECK-GI-NEXT: sxtb w9, w8
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: fmov w11, s2
; CHECK-GI-NEXT: cmp w9, w10, sxtb
; CHECK-GI-NEXT: sxtb w9, w11
; CHECK-GI-NEXT: csel w8, w8, w10, lt
; CHECK-GI-NEXT: fmov w10, s3
; CHECK-GI-NEXT: cmp w9, w10, sxtb
; CHECK-GI-NEXT: sxtb w9, w8
; CHECK-GI-NEXT: csel w10, w11, w10, lt
; CHECK-GI-NEXT: csel w10, w10, w12, lt
; CHECK-GI-NEXT: cmp w9, w10, sxtb
; CHECK-GI-NEXT: csel w0, w8, w10, lt
; CHECK-GI-NEXT: ret
@@ -611,19 +608,16 @@ define i16 @sminv_v3i16(<3 x i16> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov h2, v0.h[2]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: sxth w8, w8
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: fmov w11, s2
; CHECK-GI-NEXT: smov w8, v0.h[0]
; CHECK-GI-NEXT: umov w9, v0.h[0]
; CHECK-GI-NEXT: umov w10, v0.h[1]
; CHECK-GI-NEXT: smov w11, v0.h[2]
; CHECK-GI-NEXT: umov w13, v0.h[2]
; CHECK-GI-NEXT: fmov w12, s1
; CHECK-GI-NEXT: cmp w8, w10, sxth
; CHECK-GI-NEXT: sxth w8, w11
; CHECK-GI-NEXT: fmov w10, s2
; CHECK-GI-NEXT: csel w9, w9, w12, lt
; CHECK-GI-NEXT: cmp w8, w9, sxth
; CHECK-GI-NEXT: csel w0, w9, w10, gt
; CHECK-GI-NEXT: cmp w8, w12, sxth
; CHECK-GI-NEXT: csel w8, w9, w10, lt
; CHECK-GI-NEXT: cmp w11, w8, sxth
; CHECK-GI-NEXT: csel w0, w8, w13, gt
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.smin.v3i16(<3 x i16> %a)
@@ -887,20 +881,17 @@ define i8 @smaxv_v4i8(<4 x i8> %a) {
; CHECK-GI-LABEL: smaxv_v4i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov h2, v0.h[2]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: mov h3, v0.h[3]
; CHECK-GI-NEXT: sxtb w9, w8
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: fmov w11, s2
; CHECK-GI-NEXT: cmp w9, w10, sxtb
; CHECK-GI-NEXT: sxtb w9, w11
; CHECK-GI-NEXT: csel w8, w8, w10, gt
; CHECK-GI-NEXT: fmov w10, s3
; CHECK-GI-NEXT: cmp w9, w10, sxtb
; CHECK-GI-NEXT: umov w8, v0.h[0]
; CHECK-GI-NEXT: umov w9, v0.h[1]
; CHECK-GI-NEXT: umov w10, v0.h[2]
; CHECK-GI-NEXT: umov w12, v0.h[3]
; CHECK-GI-NEXT: sxtb w11, w8
; CHECK-GI-NEXT: cmp w11, w9, sxtb
; CHECK-GI-NEXT: sxtb w11, w10
; CHECK-GI-NEXT: csel w8, w8, w9, gt
; CHECK-GI-NEXT: cmp w11, w12, sxtb
; CHECK-GI-NEXT: sxtb w9, w8
; CHECK-GI-NEXT: csel w10, w11, w10, gt
; CHECK-GI-NEXT: csel w10, w10, w12, gt
; CHECK-GI-NEXT: cmp w9, w10, sxtb
; CHECK-GI-NEXT: csel w0, w8, w10, gt
; CHECK-GI-NEXT: ret
@@ -982,19 +973,16 @@ define i16 @smaxv_v3i16(<3 x i16> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov h2, v0.h[2]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: sxth w8, w8
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: fmov w11, s2
; CHECK-GI-NEXT: smov w8, v0.h[0]
; CHECK-GI-NEXT: umov w9, v0.h[0]
; CHECK-GI-NEXT: umov w10, v0.h[1]
; CHECK-GI-NEXT: smov w11, v0.h[2]
; CHECK-GI-NEXT: umov w13, v0.h[2]
; CHECK-GI-NEXT: fmov w12, s1
; CHECK-GI-NEXT: cmp w8, w10, sxth
; CHECK-GI-NEXT: sxth w8, w11
; CHECK-GI-NEXT: fmov w10, s2
; CHECK-GI-NEXT: csel w9, w9, w12, gt
; CHECK-GI-NEXT: cmp w8, w9, sxth
; CHECK-GI-NEXT: csel w0, w9, w10, lt
; CHECK-GI-NEXT: cmp w8, w12, sxth
; CHECK-GI-NEXT: csel w8, w9, w10, gt
; CHECK-GI-NEXT: cmp w11, w8, sxth
; CHECK-GI-NEXT: csel w0, w8, w13, lt
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.smax.v3i16(<3 x i16> %a)
@@ -1256,19 +1244,16 @@ define i8 @uminv_v4i8(<4 x i8> %a) {
; CHECK-GI-LABEL: uminv_v4i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov h2, v0.h[2]
; CHECK-GI-NEXT: mov h3, v0.h[3]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: fmov w11, s2
; CHECK-GI-NEXT: fmov w12, s3
; CHECK-GI-NEXT: and w9, w8, #0xff
; CHECK-GI-NEXT: cmp w9, w10, uxtb
; CHECK-GI-NEXT: and w9, w11, #0xff
; CHECK-GI-NEXT: csel w8, w8, w10, lo
; CHECK-GI-NEXT: cmp w9, w12, uxtb
; CHECK-GI-NEXT: csel w9, w11, w12, lo
; CHECK-GI-NEXT: umov w8, v0.h[0]
; CHECK-GI-NEXT: umov w9, v0.h[1]
; CHECK-GI-NEXT: umov w10, v0.h[2]
; CHECK-GI-NEXT: umov w11, v0.h[3]
; CHECK-GI-NEXT: and w12, w8, #0xff
; CHECK-GI-NEXT: cmp w12, w9, uxtb
; CHECK-GI-NEXT: and w12, w10, #0xff
; CHECK-GI-NEXT: csel w8, w8, w9, lo
; CHECK-GI-NEXT: cmp w12, w11, uxtb
; CHECK-GI-NEXT: csel w9, w10, w11, lo
; CHECK-GI-NEXT: and w10, w8, #0xff
; CHECK-GI-NEXT: cmp w10, w9, uxtb
; CHECK-GI-NEXT: csel w0, w8, w9, lo
@@ -1351,19 +1336,16 @@ define i16 @uminv_v3i16(<3 x i16> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov h2, v0.h[2]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: uxth w8, w8
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: fmov w11, s2
; CHECK-GI-NEXT: umov w8, v0.h[0]
; CHECK-GI-NEXT: umov w9, v0.h[0]
; CHECK-GI-NEXT: umov w10, v0.h[1]
; CHECK-GI-NEXT: umov w11, v0.h[2]
; CHECK-GI-NEXT: umov w13, v0.h[2]
; CHECK-GI-NEXT: fmov w12, s1
; CHECK-GI-NEXT: cmp w8, w10, uxth
; CHECK-GI-NEXT: uxth w8, w11
; CHECK-GI-NEXT: fmov w10, s2
; CHECK-GI-NEXT: csel w9, w9, w12, lo
; CHECK-GI-NEXT: cmp w8, w9, uxth
; CHECK-GI-NEXT: csel w0, w9, w10, hi
; CHECK-GI-NEXT: cmp w8, w12, uxth
; CHECK-GI-NEXT: csel w8, w9, w10, lo
; CHECK-GI-NEXT: cmp w11, w8, uxth
; CHECK-GI-NEXT: csel w0, w8, w13, hi
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.umin.v3i16(<3 x i16> %a)
@@ -1625,19 +1607,16 @@ define i8 @umaxv_v4i8(<4 x i8> %a) {
; CHECK-GI-LABEL: umaxv_v4i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov h2, v0.h[2]
; CHECK-GI-NEXT: mov h3, v0.h[3]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: fmov w11, s2
; CHECK-GI-NEXT: fmov w12, s3
; CHECK-GI-NEXT: and w9, w8, #0xff
; CHECK-GI-NEXT: cmp w9, w10, uxtb
; CHECK-GI-NEXT: and w9, w11, #0xff
; CHECK-GI-NEXT: csel w8, w8, w10, hi
; CHECK-GI-NEXT: cmp w9, w12, uxtb
; CHECK-GI-NEXT: csel w9, w11, w12, hi
; CHECK-GI-NEXT: umov w8, v0.h[0]
; CHECK-GI-NEXT: umov w9, v0.h[1]
; CHECK-GI-NEXT: umov w10, v0.h[2]
; CHECK-GI-NEXT: umov w11, v0.h[3]
; CHECK-GI-NEXT: and w12, w8, #0xff
; CHECK-GI-NEXT: cmp w12, w9, uxtb
; CHECK-GI-NEXT: and w12, w10, #0xff
; CHECK-GI-NEXT: csel w8, w8, w9, hi
; CHECK-GI-NEXT: cmp w12, w11, uxtb
; CHECK-GI-NEXT: csel w9, w10, w11, hi
; CHECK-GI-NEXT: and w10, w8, #0xff
; CHECK-GI-NEXT: cmp w10, w9, uxtb
; CHECK-GI-NEXT: csel w0, w8, w9, hi
@@ -1719,19 +1698,16 @@ define i16 @umaxv_v3i16(<3 x i16> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov h2, v0.h[2]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: uxth w8, w8
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: fmov w11, s2
; CHECK-GI-NEXT: umov w8, v0.h[0]
; CHECK-GI-NEXT: umov w9, v0.h[0]
; CHECK-GI-NEXT: umov w10, v0.h[1]
; CHECK-GI-NEXT: umov w11, v0.h[2]
; CHECK-GI-NEXT: umov w13, v0.h[2]
; CHECK-GI-NEXT: fmov w12, s1
; CHECK-GI-NEXT: cmp w8, w10, uxth
; CHECK-GI-NEXT: uxth w8, w11
; CHECK-GI-NEXT: fmov w10, s2
; CHECK-GI-NEXT: csel w9, w9, w12, hi
; CHECK-GI-NEXT: cmp w8, w9, uxth
; CHECK-GI-NEXT: csel w0, w9, w10, lo
; CHECK-GI-NEXT: cmp w8, w12, uxth
; CHECK-GI-NEXT: csel w8, w9, w10, hi
; CHECK-GI-NEXT: cmp w11, w8, uxth
; CHECK-GI-NEXT: csel w0, w8, w13, lo
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.umax.v3i16(<3 x i16> %a)