Skip to content

[VectorCombine] Combine scalar fneg with insert/extract to vector fneg when length is different #115209

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 26 additions & 8 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -665,9 +665,10 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
return false;

// TODO: We could handle this with a length-changing shuffle.
auto *VecTy = cast<FixedVectorType>(I.getType());
if (SrcVec->getType() != VecTy)
auto *ScalarTy = VecTy->getScalarType();
auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
return false;

// Ignore bogus insert/extract index.
Expand All @@ -681,8 +682,6 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
SmallVector<int> Mask(NumElts);
std::iota(Mask.begin(), Mask.end(), 0);
Mask[Index] = Index + NumElts;

Type *ScalarTy = VecTy->getScalarType();
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
Expand All @@ -697,14 +696,33 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind);

bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
// If the lengths of the two vectors are not equal,
// we need to add a length-change vector. Add this cost.
SmallVector<int> SrcMask;
if (NeedLenChg) {
SrcMask.assign(NumElts, PoisonMaskElem);
SrcMask[Index] = Index;
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
SrcVecTy, SrcMask, CostKind);
}

if (NewCost > OldCost)
return false;

// insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
// shuffle DestVec, (fneg SrcVec), Mask
Value *NewShuf;
// insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
replaceValue(I, *Shuf);
if (NeedLenChg) {
// shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
Value *LenChgShuf = Builder.CreateShuffleVector(SrcVec, SrcMask);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ParkHanbum - this is a regression typo - sorry I missed it :(

Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm sorry for my mistake.. Can I send a PR again?

NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
} else {
// shuffle DestVec, (fneg SrcVec), Mask
NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
}

replaceValue(I, *NewShuf);
return true;
}

Expand Down
154 changes: 154 additions & 0 deletions llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,19 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
ret <4 x float> %r
}

; Length-changing case: extract lane 0 of a <2 x float>, negate, insert into
; lane 0 of a <4 x float>. The CHECK lines expect the scalar sequence to be
; kept unchanged here (no vector fneg + shuffle is formed for this pattern).
define <4 x float> @ext0_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext0_v2f32v4f32(
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 0
; CHECK-NEXT: ret <4 x float> [[R]]
;
%e = extractelement <2 x float> %x, i32 0
%n = fneg float %e
%r = insertelement <4 x float> %y, float %n, i32 0
ret <4 x float> %r
}

; Eliminating extract/insert is profitable.

define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
Expand All @@ -32,6 +45,19 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
ret <4 x float> %r
}

; Length-changing case that IS transformed: vector fneg plus a length-change
; shuffle plus a select-style shuffle replace the scalar extract/fneg/insert.
; Note: index 2 is out of bounds for the <2 x float> source, so the extracted
; element is poison.
; NOTE(review): [[TMP2]] shuffles [[X]] rather than [[TMP1]] (the fneg result),
; which matches the regression typo flagged in the review discussion — confirm
; the intended output shuffles the negated vector.
define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext2_v2f32v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 poison>
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
; CHECK-NEXT: ret <4 x float> [[R]]
;
%e = extractelement <2 x float> %x, i32 2
%n = fneg float %e
%r = insertelement <4 x float> %y, float %n, i32 2
ret <4 x float> %r
}

; Eliminating extract/insert is still profitable. Flags propagate.

define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
Expand All @@ -46,6 +72,25 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %r
}

; Target-dependent length-changing case: the scalar form is kept on SSE (vector
; fneg + extra shuffle is not profitable there), while AVX forms the vector
; fneg plus length-change and select shuffles. The nsz flag propagates to the
; vector fneg on the AVX path.
; NOTE(review): the AVX [[TMP2]] shuffles [[X]] rather than [[TMP1]] (the fneg
; result) — consistent with the regression typo flagged in review; verify the
; intended output uses the negated vector.
define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
; SSE-LABEL: @ext1_v2f64v4f64(
; SSE-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; SSE-NEXT: [[N:%.*]] = fneg nsz double [[E]]
; SSE-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
; SSE-NEXT: ret <4 x double> [[R]]
;
; AVX-LABEL: @ext1_v2f64v4f64(
; AVX-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; AVX-NEXT: ret <4 x double> [[R]]
;
%e = extractelement <2 x double> %x, i32 1
%n = fneg nsz double %e
%r = insertelement <4 x double> %y, double %n, i32 1
ret <4 x double> %r
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please can you add coverage from when you extract from an index greater than the vector width of the destination:

  %e = extractelement <4 x double> %x, i32 3
  %n = fneg nsz double %e
  %r = insertelement <2 x double> %y, double %n, i32 1
  ret <2 x double> %r

Copy link
Contributor Author

@ParkHanbum ParkHanbum Dec 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. But this test case isn't actually transformed, because we restrict the extract and insert indices to be the same.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fine - I just want to make sure we have coverage for cases where the extraction index is out of bounds for the destination vector


; The vector fneg would cost twice as much as the scalar op with SSE,
; so we don't transform there (the shuffle would also be more expensive).

Expand All @@ -67,6 +112,19 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}

; Length-changing case with different extract (3) and insert (7) indices:
; the CHECK lines expect no transform (the fold requires matching indices).
define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32(
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
; CHECK-NEXT: ret <8 x float> [[R]]
;
%e = extractelement <4 x float> %x, i32 3
%n = fneg float %e
%r = insertelement <8 x float> %y, float %n, i32 7
ret <8 x float> %r
}

; Same as above with an extra use of the extracted element.

define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
Expand All @@ -91,6 +149,21 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}

; Same pattern with an extra use of the extracted element (passed to @use):
; the CHECK lines expect the scalar sequence to be kept unchanged.
define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32_use1(
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; CHECK-NEXT: call void @use(float [[E]])
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
; CHECK-NEXT: ret <8 x float> [[R]]
;
%e = extractelement <4 x float> %x, i32 3
call void @use(float %e)
%n = fneg float %e
%r = insertelement <8 x float> %y, float %n, i32 3
ret <8 x float> %r
}

; Negative test - the transform is likely not profitable if the fneg has another use.

define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
Expand All @@ -108,6 +181,21 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}

; Negative test: the fneg result has an extra use (passed to @use), so the
; transform is likely unprofitable — CHECK lines expect no change.
define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32_use2(
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT: call void @use(float [[N]])
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
; CHECK-NEXT: ret <8 x float> [[R]]
;
%e = extractelement <4 x float> %x, i32 3
%n = fneg float %e
call void @use(float %n)
%r = insertelement <8 x float> %y, float %n, i32 3
ret <8 x float> %r
}

; Negative test - can't convert variable index to a shuffle.

define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) {
Expand All @@ -123,6 +211,19 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %
ret <2 x double> %r
}

; Negative test: the extract/insert index is a runtime variable, which cannot
; be converted to a constant shuffle mask — CHECK lines expect no change.
define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y, i32 %index) {
; CHECK-LABEL: @ext_index_var_v2f64v4f64(
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]]
; CHECK-NEXT: ret <4 x double> [[R]]
;
%e = extractelement <2 x double> %x, i32 %index
%n = fneg nsz double %e
%r = insertelement <4 x double> %y, double %n, i32 %index
ret <4 x double> %r
}

; Negative test - require same extract/insert index for simple shuffle.
; TODO: We could handle this by adjusting the cost calculation.

Expand All @@ -139,6 +240,33 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %r
}

; Negative test - extract from an index greater than the vector width of the destination
; Negative test (requested in review): the extraction index (3) is out of
; bounds for the destination vector width (<2 x double>), and the extract and
; insert indices differ — CHECK lines expect no change.
define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext3_v4f64v2f64(
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3
; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
; CHECK-NEXT: ret <2 x double> [[R]]
;
%e = extractelement <4 x double> %x, i32 3
%n = fneg nsz double %e
%r = insertelement <2 x double> %y, double %n, i32 1
ret <2 x double> %r
}

; Negative test: extract index (1) and insert index (0) differ, so the simple
; select-shuffle form does not apply — CHECK lines expect no change.
define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) {
; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
; CHECK-NEXT: ret <4 x double> [[R]]
;
%e = extractelement <2 x double> %x, i32 1
%n = fneg nsz double %e
%r = insertelement <4 x double> %y, double %n, i32 0
ret <4 x double> %r
}

; Negative test - avoid changing poison ops

define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
Expand All @@ -154,6 +282,19 @@ define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
ret <4 x float> %r
}

; Negative test: both the extract index (6 on a <2 x float>) and the insert
; index (12 on a <4 x float>) are out of bounds, producing poison — the fold
; must not fire on such bogus indices. CHECK lines expect no change.
define <4 x float> @ext12_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext12_v2f32v4f32(
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 6
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12
; CHECK-NEXT: ret <4 x float> [[R]]
;
%e = extractelement <2 x float> %x, i32 6
%n = fneg float %e
%r = insertelement <4 x float> %y, float %n, i32 12
ret <4 x float> %r
}

; This used to crash because we assumed matching a true, unary fneg instruction.

define <2 x float> @ext1_v2f32_fsub(<2 x float> %x) {
Expand Down Expand Up @@ -181,3 +322,16 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) {
%r = insertelement <2 x float> %y, float %s, i32 1
ret <2 x float> %r
}

; fsub-as-fneg form with fast-math flags: the nsz nnan 'fsub 0.0, %e' is
; matched like an fneg, and the flags propagate to the vector fneg in the
; transformed output.
; NOTE(review): [[TMP2]] shuffles [[X]] rather than [[TMP1]] (the fneg result)
; — consistent with the regression typo flagged in review; verify the intended
; output uses the negated vector.
define <4 x float> @ext1_v2f32v4f32_fsub_fmf(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf(
; CHECK-NEXT: [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; CHECK-NEXT: ret <4 x float> [[R]]
;
%e = extractelement <2 x float> %x, i32 1
%s = fsub nsz nnan float 0.0, %e
%r = insertelement <4 x float> %y, float %s, i32 1
ret <4 x float> %r
}
Loading