Skip to content

[DAGCombiner] Add support for scalarising extracts of a vector setcc #117566

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits on Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 27 additions & 16 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22749,16 +22749,22 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,

/// Transform a vector binary operation into a scalar binary operation by moving
/// the math/logic after an extract element of a vector.
static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
const SDLoc &DL, bool LegalOperations) {
static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG,
const SDLoc &DL, bool LegalTypes) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Vec = ExtElt->getOperand(0);
SDValue Index = ExtElt->getOperand(1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (!IndexC || (!TLI.isBinOp(Opc) && Opc != ISD::SETCC) || !Vec.hasOneUse() ||
if (!IndexC || !Vec.hasOneUse() || (Opc != ISD::SETCC && !TLI.isBinOp(Opc)) ||

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

auto *IndexC = dyn_cast<ConstantSDNode>(Index);
if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
unsigned Opc = Vec.getOpcode();
if (!IndexC || !Vec.hasOneUse() || (!TLI.isBinOp(Opc) && Opc != ISD::SETCC) ||
Vec->getNumValues() != 1)
return SDValue();

EVT ResVT = ExtElt->getValueType(0);
if (Opc == ISD::SETCC &&
(ResVT != Vec.getValueType().getVectorElementType() || LegalTypes))
return SDValue();

// Targets may want to avoid this to prevent an expensive register transfer.
if (!TLI.shouldScalarizeBinop(Vec))
return SDValue();
Expand All @@ -22769,19 +22775,24 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
SDValue Op0 = Vec.getOperand(0);
SDValue Op1 = Vec.getOperand(1);
APInt SplatVal;
if (isAnyConstantBuildVector(Op0, true) ||
ISD::isConstantSplatVector(Op0.getNode(), SplatVal) ||
isAnyConstantBuildVector(Op1, true) ||
ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
// extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
// extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
EVT VT = ExtElt->getValueType(0);
SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
}
if (!isAnyConstantBuildVector(Op0, true) &&
!ISD::isConstantSplatVector(Op0.getNode(), SplatVal) &&
!isAnyConstantBuildVector(Op1, true) &&
!ISD::isConstantSplatVector(Op1.getNode(), SplatVal))
return SDValue();

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the combine occurs after type legalisation it's possible for OpVT to be a non-legal type despite Op0 being type legal.

Perhaps it's worth passing in LegalTypes (instead of the original LegalOperations) to be checked as part of the Opc == ISD::SETCC early exit?

The stock binop case works because it assumes the operation is independent of the element type, which is not true for comparisons.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done!

return SDValue();
// extractelt (op X, C), IndexC --> op (extractelt X, IndexC), C'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
EVT OpVT = Op0->getValueType(0).getVectorElementType();
EVT OpVT = Op0.getValueType().getVectorElementType();

// extractelt (op C, X), IndexC --> op C', (extractelt X, IndexC)
if (Opc == ISD::SETCC) {
EVT OpVT = Op0.getValueType().getVectorElementType();
Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op0, Index);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No else after return

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op1, Index);
return DAG.getSetCC(DL, ResVT, Op0, Op1,
cast<CondCodeSDNode>(Vec->getOperand(2))->get());
}
Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op0, Index);
Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op1, Index);
return DAG.getNode(Opc, DL, ResVT, Op0, Op1);
}

// Given a ISD::EXTRACT_VECTOR_ELT, which is a glorified bit sequence extract,
Expand Down Expand Up @@ -23014,7 +23025,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
}
}

if (SDValue BO = scalarizeExtractedBinop(N, DAG, DL, LegalOperations))
if (SDValue BO = scalarizeExtractedBinOp(N, DAG, DL, LegalTypes))
return BO;

if (VecVT.isScalableVector())
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -1348,6 +1348,10 @@ class AArch64TargetLowering : public TargetLowering {
unsigned getMinimumJumpTableEntries() const override;

bool softPromoteHalfType() const override { return true; }

bool shouldScalarizeBinop(SDValue VecOp) const override {
return VecOp.getOpcode() == ISD::SETCC;
}
};

namespace AArch64 {
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2093,7 +2093,7 @@ bool RISCVTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {

// Assume target opcodes can't be scalarized.
// TODO - do we have any exceptions?
if (Opc >= ISD::BUILTIN_OP_END)
if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
return false;

// If the vector op is not supported, try to convert to scalar.
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ bool WebAssemblyTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {

// Assume target opcodes can't be scalarized.
// TODO - do we have any exceptions?
if (Opc >= ISD::BUILTIN_OP_END)
if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
return false;

// If the vector op is not supported, try to convert to scalar.
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3306,7 +3306,7 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {

// Assume target opcodes can't be scalarized.
// TODO - do we have any exceptions?
if (Opc >= ISD::BUILTIN_OP_END)
if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
return false;

// If the vector op is not supported, try to convert to scalar.
Expand Down
46 changes: 24 additions & 22 deletions llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x 16 x ptr>, i32 immarg, <vscale x 16 x i1>)

define fastcc i8 @allocno_reload_assign() {
define fastcc i8 @allocno_reload_assign(ptr %p) {
; CHECK-LABEL: allocno_reload_assign:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, xzr
Expand All @@ -14,8 +14,8 @@ define fastcc i8 @allocno_reload_assign() {
; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s
; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h
; CHECK-NEXT: uzp1 p0.b, p0.b, p0.b
; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
; CHECK-NEXT: uzp1 p8.b, p0.b, p0.b
; CHECK-NEXT: mov z0.b, p8/z, #1 // =0x1
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: mov z0.b, #0 // =0x0
; CHECK-NEXT: uunpklo z1.h, z0.b
Expand All @@ -30,34 +30,35 @@ define fastcc i8 @allocno_reload_assign() {
; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: punpklo p2.h, p1.b
; CHECK-NEXT: punpkhi p3.h, p1.b
; CHECK-NEXT: punpkhi p4.h, p1.b
; CHECK-NEXT: uunpklo z0.d, z2.s
; CHECK-NEXT: uunpkhi z1.d, z2.s
; CHECK-NEXT: punpklo p5.h, p0.b
; CHECK-NEXT: punpklo p6.h, p0.b
; CHECK-NEXT: uunpklo z2.d, z3.s
; CHECK-NEXT: uunpkhi z3.d, z3.s
; CHECK-NEXT: punpkhi p7.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: uunpklo z4.d, z5.s
; CHECK-NEXT: uunpkhi z5.d, z5.s
; CHECK-NEXT: uunpklo z6.d, z7.s
; CHECK-NEXT: uunpkhi z7.d, z7.s
; CHECK-NEXT: punpklo p0.h, p2.b
; CHECK-NEXT: punpkhi p1.h, p2.b
; CHECK-NEXT: punpklo p2.h, p3.b
; CHECK-NEXT: punpkhi p3.h, p3.b
; CHECK-NEXT: punpklo p4.h, p5.b
; CHECK-NEXT: punpkhi p5.h, p5.b
; CHECK-NEXT: punpklo p6.h, p7.b
; CHECK-NEXT: punpkhi p7.h, p7.b
; CHECK-NEXT: punpklo p1.h, p2.b
; CHECK-NEXT: punpkhi p2.h, p2.b
; CHECK-NEXT: punpklo p3.h, p4.b
; CHECK-NEXT: punpkhi p4.h, p4.b
; CHECK-NEXT: punpklo p5.h, p6.b
; CHECK-NEXT: punpkhi p6.h, p6.b
; CHECK-NEXT: punpklo p7.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: st1b { z0.d }, p0, [z16.d]
; CHECK-NEXT: st1b { z1.d }, p1, [z16.d]
; CHECK-NEXT: st1b { z2.d }, p2, [z16.d]
; CHECK-NEXT: st1b { z3.d }, p3, [z16.d]
; CHECK-NEXT: st1b { z4.d }, p4, [z16.d]
; CHECK-NEXT: st1b { z5.d }, p5, [z16.d]
; CHECK-NEXT: st1b { z6.d }, p6, [z16.d]
; CHECK-NEXT: st1b { z7.d }, p7, [z16.d]
; CHECK-NEXT: st1b { z0.d }, p1, [z16.d]
; CHECK-NEXT: st1b { z1.d }, p2, [z16.d]
; CHECK-NEXT: st1b { z2.d }, p3, [z16.d]
; CHECK-NEXT: st1b { z3.d }, p4, [z16.d]
; CHECK-NEXT: st1b { z4.d }, p5, [z16.d]
; CHECK-NEXT: st1b { z5.d }, p6, [z16.d]
; CHECK-NEXT: st1b { z6.d }, p7, [z16.d]
; CHECK-NEXT: st1b { z7.d }, p0, [z16.d]
; CHECK-NEXT: str p8, [x0]
; CHECK-NEXT: b .LBB0_1
br label %1

Expand All @@ -66,6 +67,7 @@ define fastcc i8 @allocno_reload_assign() {
%constexpr1 = shufflevector <vscale x 16 x i1> %constexpr, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
%constexpr2 = xor <vscale x 16 x i1> %constexpr1, shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> zeroinitializer, i32 0, <vscale x 16 x i1> %constexpr2)
store <vscale x 16 x i1> %constexpr, ptr %p, align 16
br label %1
}

Expand Down
186 changes: 186 additions & 0 deletions llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mattr=+sve < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"


define i1 @extract_icmp_v4i32_const_splat_rhs(<4 x i32> %a) {
; CHECK-LABEL: extract_icmp_v4i32_const_splat_rhs:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: cmp w8, #5
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%icmp = icmp ult <4 x i32> %a, splat (i32 5)
%ext = extractelement <4 x i1> %icmp, i32 1
ret i1 %ext
}

define i1 @extract_icmp_v4i32_const_splat_lhs(<4 x i32> %a) {
; CHECK-LABEL: extract_icmp_v4i32_const_splat_lhs:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: cmp w8, #7
; CHECK-NEXT: cset w0, hi
; CHECK-NEXT: ret
%icmp = icmp ult <4 x i32> splat(i32 7), %a
%ext = extractelement <4 x i1> %icmp, i32 1
ret i1 %ext
}

define i1 @extract_icmp_v4i32_const_vec_rhs(<4 x i32> %a) {
; CHECK-LABEL: extract_icmp_v4i32_const_vec_rhs:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: cmp w8, #234
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%icmp = icmp ult <4 x i32> %a, <i32 5, i32 234, i32 -1, i32 7>
%ext = extractelement <4 x i1> %icmp, i32 1
ret i1 %ext
}

define i1 @extract_fcmp_v4f32_const_splat_rhs(<4 x float> %a) {
; CHECK-LABEL: extract_fcmp_v4f32_const_splat_rhs:
; CHECK: // %bb.0:
; CHECK-NEXT: mov s0, v0.s[1]
; CHECK-NEXT: fmov s1, #4.00000000
; CHECK-NEXT: fcmp s0, s1
; CHECK-NEXT: cset w0, lt
; CHECK-NEXT: ret
%fcmp = fcmp ult <4 x float> %a, splat(float 4.0e+0)
%ext = extractelement <4 x i1> %fcmp, i32 1
ret i1 %ext
}

define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
; CHECK-LABEL: vector_loop_with_icmp:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: index z0.d, #0, #1
; CHECK-NEXT: mov w8, #2 // =0x2
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: add x8, x0, #4
; CHECK-NEXT: mov w10, #1 // =0x1
; CHECK-NEXT: b .LBB4_2
; CHECK-NEXT: .LBB4_1: // %pred.store.continue6
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-NEXT: subs x9, x9, #2
; CHECK-NEXT: add x8, x8, #8
; CHECK-NEXT: b.eq .LBB4_6
; CHECK-NEXT: .LBB4_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: fmov x11, d0
; CHECK-NEXT: cmp x11, #14
; CHECK-NEXT: b.hi .LBB4_4
; CHECK-NEXT: // %bb.3: // %pred.store.if
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT: stur w10, [x8, #-4]
; CHECK-NEXT: .LBB4_4: // %pred.store.continue
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT: mov x11, v0.d[1]
; CHECK-NEXT: cmp x11, #14
; CHECK-NEXT: b.hi .LBB4_1
; CHECK-NEXT: // %bb.5: // %pred.store.if5
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT: str w10, [x8]
; CHECK-NEXT: b .LBB4_1
; CHECK-NEXT: .LBB4_6: // %for.cond.cleanup
; CHECK-NEXT: ret
entry:
br label %vector.body

vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue6 ]
%vec.ind = phi <2 x i64> [ <i64 0, i64 1>, %entry ], [ %vec.ind.next, %pred.store.continue6 ]
%vec.cmp = icmp ult <2 x i64> %vec.ind, <i64 15, i64 15>
%c0 = extractelement <2 x i1> %vec.cmp, i64 0
br i1 %c0, label %pred.store.if, label %pred.store.continue

pred.store.if:
%arrayidx = getelementptr inbounds i32, ptr %dest, i64 %index
store i32 1, ptr %arrayidx, align 4
br label %pred.store.continue

pred.store.continue:
%c1 = extractelement <2 x i1> %vec.cmp, i64 1
br i1 %c1, label %pred.store.if5, label %pred.store.continue6

pred.store.if5:
%indexp1 = or disjoint i64 %index, 1
%arrayidx2 = getelementptr inbounds i32, ptr %dest, i64 %indexp1
store i32 1, ptr %arrayidx2, align 4
br label %pred.store.continue6

pred.store.continue6:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function really big for a dag combine test

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree it was probably too big. I was trying to highlight the motivating example from LoopVectorize.cpp and demonstrate the improvement. I've reduced the test to use a <2 x i64> instead to reduce the size. Hopefully this is small enough now!

I've also tried to properly name all the variables.

%index.next = add i64 %index, 2
%vec.ind.next = add <2 x i64> %vec.ind, <i64 2, i64 2>
%index.cmp = icmp eq i64 %index.next, 16
br i1 %index.cmp, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
ret void
}


; Negative tests

define i1 @extract_icmp_v4i32_splat_rhs(<4 x i32> %a, i32 %b) {
; CHECK-LABEL: extract_icmp_v4i32_splat_rhs:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v1.4s, w0
; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: umov w8, v0.h[1]
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
%ins = insertelement <4 x i32> poison, i32 %b, i32 0
%splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
%icmp = icmp ult <4 x i32> %a, %splat
%ext = extractelement <4 x i1> %icmp, i32 1
ret i1 %ext
}

define i1 @extract_icmp_v4i32_splat_rhs_mul_use(<4 x i32> %a, ptr %p) {
; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_mul_use:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.4s, #235
; CHECK-NEXT: adrp x9, .LCPI6_0
; CHECK-NEXT: mov x8, x0
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_0]
; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
; CHECK-NEXT: xtn v1.4h, v0.4s
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: umov w9, v1.h[1]
; CHECK-NEXT: fmov w10, s0
; CHECK-NEXT: and w0, w9, #0x1
; CHECK-NEXT: strb w10, [x8]
; CHECK-NEXT: ret
%icmp = icmp ult <4 x i32> %a, splat(i32 235)
%ext = extractelement <4 x i1> %icmp, i32 1
store <4 x i1> %icmp, ptr %p, align 4
ret i1 %ext
}

define i1 @extract_icmp_v4i32_splat_rhs_unknown_idx(<4 x i32> %a, i32 %c) {
; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_unknown_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movi v1.4s, #127
; CHECK-NEXT: add x8, sp, #8
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT: bfi x8, x0, #1, #2
; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: str d0, [sp, #8]
; CHECK-NEXT: ldrh w8, [x8]
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%icmp = icmp ult <4 x i32> %a, splat(i32 127)
%ext = extractelement <4 x i1> %icmp, i32 %c
ret i1 %ext
}
Loading
Loading