[InstCombine] Fold shuffles through all trivially vectorizable intrinsics #141979

52 changes: 30 additions & 22 deletions llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1400,42 +1400,46 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) {
/// try to shuffle after the intrinsic.
Instruction *
InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) {
// TODO: This should be extended to handle other intrinsics like fshl, ctpop,
// etc. Use llvm::isTriviallyVectorizable() and related to determine
// which intrinsics are safe to shuffle?
switch (II->getIntrinsicID()) {
case Intrinsic::smax:
case Intrinsic::smin:
case Intrinsic::umax:
case Intrinsic::umin:
case Intrinsic::fma:
case Intrinsic::fshl:
case Intrinsic::fshr:
break;
default:
if (!isTriviallyVectorizable(II->getIntrinsicID()) ||
!II->getCalledFunction()->isSpeculatable())
return nullptr;

// fabs is canonicalized to fabs (shuffle ...) in foldShuffleOfUnaryOps, so
// avoid undoing it.
if (match(II, m_FAbs(m_Value())))
return nullptr;
}

Value *X;
Constant *C;
ArrayRef<int> Mask;
auto *NonConstArg = find_if_not(II->args(), IsaPred<Constant>);
auto *NonConstArg = find_if_not(II->args(), [&II](Use &Arg) {
return isa<Constant>(Arg.get()) ||
isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
Arg.getOperandNo(), nullptr);
});
if (!NonConstArg ||
!match(NonConstArg, m_Shuffle(m_Value(X), m_Poison(), m_Mask(Mask))))
return nullptr;

// At least 1 operand must have 1 use because we are creating 2 instructions.
if (none_of(II->args(), [](Value *V) { return V->hasOneUse(); }))
// At least 1 operand must be a shuffle with 1 use because we are creating 2
// instructions.
if (none_of(II->args(), [](Value *V) {
return isa<ShuffleVectorInst>(V) && V->hasOneUse();
}))
return nullptr;

// See if all arguments are shuffled with the same mask.
Contributor:

Pre-existing, but the one-use check above should be about the shuffle operands only, right? It doesn't help us if a constant argument is one-use (which it always is).

Contributor Author:

Good point. I went poking around, though, and it looks like constants don't return true for hasOneUse(); they seem to show up with 0 uses, so it happens to work today by coincidence.

Will fix anyway, since with this patch I think we also need to worry about non-constant scalar args.
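
For illustration, a hypothetical input (not from the patch; the function name is made up, and it mirrors the negative powi test added below) where the only one-use call operand is the non-constant scalar %power. A check that accepts any one-use operand would fire here even though the shuffle %sx has a second use, so the fold would only add instructions:

define <3 x float> @powi_scalar_arg_is_one_use(<3 x float> %x, i32 %power, ptr %p) {
  %sx = shufflevector <3 x float> %x, <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
  store <3 x float> %sx, ptr %p
  %r = call <3 x float> @llvm.powi.v3f32.i32(<3 x float> %sx, i32 %power)
  ret <3 x float> %r
}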

SmallVector<Value *, 4> NewArgs;
Type *SrcTy = X->getType();
for (Value *Arg : II->args()) {
if (match(Arg, m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) &&
X->getType() == SrcTy)
for (Use &Arg : II->args()) {
if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
Arg.getOperandNo(), nullptr))
NewArgs.push_back(Arg);
else if (match(&Arg,
m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) &&
X->getType() == SrcTy)
NewArgs.push_back(X);
else if (match(Arg, m_ImmConstant(C))) {
else if (match(&Arg, m_ImmConstant(C))) {
// If it's a constant, try find the constant that would be shuffled to C.
if (Constant *ShuffledC =
unshuffleConstant(Mask, C, cast<VectorType>(SrcTy)))
@@ -1448,8 +1452,12 @@ InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) {

// intrinsic (shuf X, M), (shuf Y, M), ... --> shuf (intrinsic X, Y, ...), M
Instruction *FPI = isa<FPMathOperator>(II) ? II : nullptr;
// Result type might be a different vector width.
// TODO: Check that the result type isn't widened?
VectorType *ResTy =
VectorType::get(II->getType()->getScalarType(), cast<VectorType>(SrcTy));
Value *NewIntrinsic =
Builder.CreateIntrinsic(II->getIntrinsicID(), SrcTy, NewArgs, FPI);
Builder.CreateIntrinsic(ResTy, II->getIntrinsicID(), NewArgs, FPI);
return new ShuffleVectorInst(NewIntrinsic, Mask);
}

11 changes: 11 additions & 0 deletions llvm/test/Transforms/InstCombine/abs-1.ll
@@ -978,3 +978,14 @@ define i32 @abs_diff_signed_slt_no_nsw_swap(i32 %a, i32 %b) {
%cond = select i1 %cmp, i32 %sub_ba, i32 %sub_ab
ret i32 %cond
}

define <2 x i32> @abs_unary_shuffle_ops(<2 x i32> %x) {
; CHECK-LABEL: @abs_unary_shuffle_ops(
; CHECK-NEXT: [[R2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[R1:%.*]], i1 false)
; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[R2]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: ret <2 x i32> [[R]]
;
%a = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
Contributor:

For example, what happens if you add an extra use to this shufflevector? I think it will fold thanks to the i1 false argument.

%r = call <2 x i32> @llvm.abs(<2 x i32> %a, i1 false)
ret <2 x i32> %r
}
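
The extra-use variant being suggested would look roughly like this (a sketch, not part of the patch; @use_v2i32 is a made-up helper, and whether InstCombine still folds it is exactly the open question above):

declare void @use_v2i32(<2 x i32>)

define <2 x i32> @abs_unary_shuffle_ops_extra_use(<2 x i32> %x) {
  %a = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
  call void @use_v2i32(<2 x i32> %a)
  %r = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %a, i1 false)
  ret <2 x i32> %r
}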
13 changes: 13 additions & 0 deletions llvm/test/Transforms/InstCombine/fma.ll
@@ -972,6 +972,19 @@ define <2 x half> @fma_negone_vec_partial_undef(<2 x half> %x, <2 x half> %y) {
ret <2 x half> %sub
}

define <2 x float> @fmuladd_unary_shuffle_ops(<2 x float> %x, <2 x float> %y, <2 x float> %z) {
; CHECK-LABEL: @fmuladd_unary_shuffle_ops(
; CHECK-NEXT: [[R:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]])
; CHECK-NEXT: [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: ret <2 x float> [[R1]]
;
%a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
%b = shufflevector <2 x float> %y, <2 x float> poison, <2 x i32> <i32 1, i32 0>
%c = shufflevector <2 x float> %z, <2 x float> poison, <2 x i32> <i32 1, i32 0>
%r = call <2 x float> @llvm.fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
}

; negative tests

define half @fma_non_negone(half %x, half %y) {
15 changes: 15 additions & 0 deletions llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
@@ -2511,6 +2511,21 @@ define <3 x i8> @smin_unary_shuffle_ops_uses(<3 x i8> %x, <3 x i8> %y) {
ret <3 x i8> %r
}

; negative test - too many uses

define <3 x i8> @smin_unary_shuffle_ops_uses_const(<3 x i8> %x, <3 x i8> %y) {
; CHECK-LABEL: @smin_unary_shuffle_ops_uses_const(
; CHECK-NEXT: [[SX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
; CHECK-NEXT: call void @use_vec(<3 x i8> [[SX]])
; CHECK-NEXT: [[R:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[SX]], <3 x i8> <i8 1, i8 2, i8 3>)
; CHECK-NEXT: ret <3 x i8> [[R]]
;
%sx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
call void @use_vec(<3 x i8> %sx)
%r = call <3 x i8> @llvm.smin.v3i8(<3 x i8> %sx, <3 x i8> <i8 1, i8 2, i8 3>)
ret <3 x i8> %r
}

; This would assert/crash because we tried to zext to i1.

@g = external dso_local global i32, align 4
26 changes: 26 additions & 0 deletions llvm/test/Transforms/InstCombine/powi.ll
@@ -564,3 +564,29 @@ define double @powi_fmul_powi_x_overflow(double noundef %x) {
%mul = fmul reassoc double %p1, %x
ret double %mul
}

define <3 x float> @powi_unary_shuffle_ops(<3 x float> %x, i32 %power) {
; CHECK-LABEL: @powi_unary_shuffle_ops(
; CHECK-NEXT: [[TMP1:%.*]] = call <3 x float> @llvm.powi.v3f32.i32(<3 x float> [[X:%.*]], i32 [[POWER:%.*]])
; CHECK-NEXT: [[R:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
; CHECK-NEXT: ret <3 x float> [[R]]
;
%sx = shufflevector <3 x float> %x, <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
%r = call <3 x float> @llvm.powi(<3 x float> %sx, i32 %power)
ret <3 x float> %r
}

; Negative test - multiple uses

define <3 x float> @powi_unary_shuffle_ops_use(<3 x float> %x, i32 %power, ptr %p) {
; CHECK-LABEL: @powi_unary_shuffle_ops_use(
; CHECK-NEXT: [[SX:%.*]] = shufflevector <3 x float> [[X:%.*]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
; CHECK-NEXT: store <3 x float> [[SX]], ptr [[P:%.*]], align 16
; CHECK-NEXT: [[R:%.*]] = call <3 x float> @llvm.powi.v3f32.i32(<3 x float> [[SX]], i32 [[POWER:%.*]])
; CHECK-NEXT: ret <3 x float> [[R]]
;
%sx = shufflevector <3 x float> %x, <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
store <3 x float> %sx, ptr %p
%r = call <3 x float> @llvm.powi(<3 x float> %sx, i32 %power)
ret <3 x float> %r
}
13 changes: 13 additions & 0 deletions llvm/test/Transforms/InstCombine/scmp.ll
@@ -423,6 +423,19 @@ define i8 @scmp_from_select_eq_and_gt_commuted3(i32 %x, i32 %y) {
ret i8 %r
}

define <3 x i2> @scmp_unary_shuffle_ops(<3 x i8> %x, <3 x i8> %y) {
; CHECK-LABEL: define <3 x i2> @scmp_unary_shuffle_ops(
; CHECK-SAME: <3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i2> @llvm.scmp.v3i2.v3i8(<3 x i8> [[X]], <3 x i8> [[Y]])
; CHECK-NEXT: [[R:%.*]] = shufflevector <3 x i2> [[TMP1]], <3 x i2> poison, <3 x i32> <i32 1, i32 0, i32 2>
; CHECK-NEXT: ret <3 x i2> [[R]]
;
%sx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
%sy = shufflevector <3 x i8> %y, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
%r = call <3 x i2> @llvm.scmp(<3 x i8> %sx, <3 x i8> %sy)
ret <3 x i2> %r
}

; Negative test: true value of outer select is not zero
define i8 @scmp_from_select_eq_and_gt_neg1(i32 %x, i32 %y) {
; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_neg1(
11 changes: 11 additions & 0 deletions llvm/test/Transforms/InstCombine/sqrt.ll
@@ -201,6 +201,17 @@ define <2 x float> @sqrt_exp_vec(<2 x float> %x) {
ret <2 x float> %res
}

define <2 x float> @sqrt_unary_shuffle_ops(<2 x float> %x) {
; CHECK-LABEL: @sqrt_unary_shuffle_ops(
; CHECK-NEXT: [[R:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[A:%.*]])
; CHECK-NEXT: [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: ret <2 x float> [[R1]]
;
%a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
%r = call <2 x float> @llvm.sqrt(<2 x float> %a)
ret <2 x float> %r
}

declare i32 @foo(double)
declare double @sqrt(double) readnone
declare float @sqrtf(float)
@@ -264,23 +264,17 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
; GFX8-NEXT: bb:
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX8-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
Member:

This fold changes the vector length of intrinsics.

Contributor Author:

Yeah it's a bit strange, but I guess that was the existing behaviour. Here's a test case in test/InstCombine/fma.ll:

define <2 x float> @fma_unary_shuffle_ops_narrowing(<3 x float> %x, <3 x float> %y, <3 x float> %z) {
; CHECK-LABEL: @fma_unary_shuffle_ops_narrowing(
; CHECK-NEXT:    [[B:%.*]] = shufflevector <3 x float> [[Y:%.*]], <3 x float> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    call void @use_vec(<2 x float> [[B]])
; CHECK-NEXT:    [[TMP1:%.*]] = call nnan nsz <3 x float> @llvm.fma.v3f32(<3 x float> [[X:%.*]], <3 x float> [[Y]], <3 x float> [[Z:%.*]])
; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %a = shufflevector <3 x float> %x, <3 x float> poison, <2 x i32> <i32 1, i32 0>
  %b = shufflevector <3 x float> %y, <3 x float> poison, <2 x i32> <i32 1, i32 0>
  call void @use_vec(<2 x float> %b)
  %c = shufflevector <3 x float> %z, <3 x float> poison, <2 x i32> <i32 1, i32 0>
  %r = call nnan nsz <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
  ret <2 x float> %r
}

Member:

I'd like to avoid this fold when shufflevectors change the type (as a follow-up). cc @RKSimon

Contributor Author:

Should still allow the case where we combine into a smaller type, e.g.

define <3 x float> @fma_unary_shuffle_ops_widening(<2 x float> %x, <2 x float> %y, <2 x float> %z) {
; CHECK-LABEL: @fma_unary_shuffle_ops_widening(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 1>
; CHECK-NEXT:    call void @use_vec3(<3 x float> [[A]])
; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x float> @llvm.fma.v2f32(<2 x float> [[X]], <2 x float> [[Y:%.*]], <2 x float> [[Z:%.*]])
; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 1>
; CHECK-NEXT:    ret <3 x float> [[R]]
;
  %a = shufflevector <2 x float> %x, <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 1>
  call void @use_vec3(<3 x float> %a)
  %b = shufflevector <2 x float> %y, <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 1>
  %c = shufflevector <2 x float> %z, <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 1>
  %r = call fast <3 x float> @llvm.fma.v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c)
  ret <3 x float> %r
}

I agree either way; this would make it more consistent with the binop combine in InstCombinerImpl::foldVectorBinop, which currently only allows the same type for two non-constant operands. For a constant operand, it looks like narrowing is allowed.
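
For reference, a minimal sketch of the binop case being compared against (an input foldVectorBinop is assumed to handle; not a verified InstCombine transcript): both operands are length-preserving shuffles with the same mask, so the add can be performed on %x and %y first and shuffled once afterwards.

define <2 x i32> @add_unary_shuffle_ops(<2 x i32> %x, <2 x i32> %y) {
  %sx = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
  %sy = shufflevector <2 x i32> %y, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
  %r = add <2 x i32> %sx, %sy   ; expected (assumed) to become add %x, %y plus one shuffle
  ret <2 x i32> %r
}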

Contributor:

Yeah, we should not be doing length changes without cost modeling...

; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
;
; GFX9-LABEL: @uadd_sat_v3i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX9-NEXT: ret <3 x i16> [[INS_2]]
;
@@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
; GFX8-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
;
; GFX9-LABEL: @uadd_sat_v4i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
; GFX9-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; GFX9-NEXT: ret <4 x i16> [[INS_31]]
;
bb:
30 changes: 10 additions & 20 deletions llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -264,23 +264,17 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
; GFX8-NEXT: bb:
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX8-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
;
; GFX9-LABEL: @uadd_sat_v3i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX9-NEXT: ret <3 x i16> [[INS_2]]
;
@@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
; GFX8-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
;
; GFX9-LABEL: @uadd_sat_v4i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
; GFX9-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; GFX9-NEXT: ret <4 x i16> [[INS_31]]
;
bb: