-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[VectorCombine] Combine scalar fneg with insert/extract to vector fneg when length is different #115209
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VectorCombine] Combine scalar fneg with insert/extract to vector fneg when length is different #115209
Changes from all commits
57bed71
8c69f20
3327572
66750f6
5c694ff
78fd2c4
f490844
d1d1af2
2883558
dba637d
014b186
7a58cb0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -665,9 +665,10 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { | |
m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))) | ||
return false; | ||
|
||
// TODO: We could handle this with a length-changing shuffle. | ||
auto *VecTy = cast<FixedVectorType>(I.getType()); | ||
if (SrcVec->getType() != VecTy) | ||
auto *ScalarTy = VecTy->getScalarType(); | ||
auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType()); | ||
if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType()) | ||
return false; | ||
|
||
// Ignore bogus insert/extract index. | ||
|
@@ -681,8 +682,6 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { | |
SmallVector<int> Mask(NumElts); | ||
std::iota(Mask.begin(), Mask.end(), 0); | ||
Mask[Index] = Index + NumElts; | ||
|
||
Type *ScalarTy = VecTy->getScalarType(); | ||
InstructionCost OldCost = | ||
TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) + | ||
TTI.getVectorInstrCost(I, VecTy, CostKind, Index); | ||
|
@@ -697,14 +696,33 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { | |
TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + | ||
TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind); | ||
|
||
bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; | ||
// If the lengths of the two vectors are not equal, | ||
// we need to add a length-change vector. Add this cost. | ||
SmallVector<int> SrcMask; | ||
if (NeedLenChg) { | ||
SrcMask.assign(NumElts, PoisonMaskElem); | ||
SrcMask[Index] = Index; | ||
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, | ||
SrcVecTy, SrcMask, CostKind); | ||
} | ||
|
||
if (NewCost > OldCost) | ||
return false; | ||
|
||
// insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> | ||
// shuffle DestVec, (fneg SrcVec), Mask | ||
Value *NewShuf; | ||
// insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index | ||
Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); | ||
Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); | ||
replaceValue(I, *Shuf); | ||
if (NeedLenChg) { | ||
// shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask | ||
Value *LenChgShuf = Builder.CreateShuffleVector(SrcVec, SrcMask); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ParkHanbum - this is regression typo - sorry I missed this :(
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm sorry for my mistake.. Can I send a PR again? |
||
NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask); | ||
} else { | ||
// shuffle DestVec, (fneg SrcVec), Mask | ||
NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); | ||
} | ||
|
||
replaceValue(I, *NewShuf); | ||
return true; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,19 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) { | |
ret <4 x float> %r | ||
} | ||
|
||
define <4 x float> @ext0_v2f32v4f32(<2 x float> %x, <4 x float> %y) { | ||
; CHECK-LABEL: @ext0_v2f32v4f32( | ||
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 | ||
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] | ||
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 0 | ||
; CHECK-NEXT: ret <4 x float> [[R]] | ||
; | ||
%e = extractelement <2 x float> %x, i32 0 | ||
%n = fneg float %e | ||
%r = insertelement <4 x float> %y, float %n, i32 0 | ||
ret <4 x float> %r | ||
} | ||
|
||
; Eliminating extract/insert is profitable. | ||
|
||
define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) { | ||
|
@@ -32,6 +45,19 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) { | |
ret <4 x float> %r | ||
} | ||
|
||
define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) { | ||
; CHECK-LABEL: @ext2_v2f32v4f32( | ||
; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]] | ||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 poison> | ||
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3> | ||
; CHECK-NEXT: ret <4 x float> [[R]] | ||
; | ||
%e = extractelement <2 x float> %x, i32 2 | ||
%n = fneg float %e | ||
%r = insertelement <4 x float> %y, float %n, i32 2 | ||
ret <4 x float> %r | ||
} | ||
|
||
; Eliminating extract/insert is still profitable. Flags propagate. | ||
|
||
define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) { | ||
|
@@ -46,6 +72,25 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) { | |
ret <2 x double> %r | ||
} | ||
|
||
define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) { | ||
; SSE-LABEL: @ext1_v2f64v4f64( | ||
; SSE-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 | ||
; SSE-NEXT: [[N:%.*]] = fneg nsz double [[E]] | ||
; SSE-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1 | ||
; SSE-NEXT: ret <4 x double> [[R]] | ||
; | ||
; AVX-LABEL: @ext1_v2f64v4f64( | ||
; AVX-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]] | ||
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison> | ||
; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3> | ||
; AVX-NEXT: ret <4 x double> [[R]] | ||
; | ||
%e = extractelement <2 x double> %x, i32 1 | ||
%n = fneg nsz double %e | ||
%r = insertelement <4 x double> %y, double %n, i32 1 | ||
ret <4 x double> %r | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please can you add coverage from when you extract from an index greater than the vector width of the destination:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done. but this test case didn't work as properly because we've restrict of extract/insert index to be same. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's fine - I just want to make sure we have coverage for cases where the extraction index is out of bounds for the destination vector |
||
|
||
; The vector fneg would cost twice as much as the scalar op with SSE, | ||
; so we don't transform there (the shuffle would also be more expensive). | ||
|
||
|
@@ -67,6 +112,19 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) { | |
ret <8 x float> %r | ||
} | ||
|
||
define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) { | ||
; CHECK-LABEL: @ext7_v4f32v8f32( | ||
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 | ||
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] | ||
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7 | ||
; CHECK-NEXT: ret <8 x float> [[R]] | ||
; | ||
%e = extractelement <4 x float> %x, i32 3 | ||
%n = fneg float %e | ||
%r = insertelement <8 x float> %y, float %n, i32 7 | ||
ret <8 x float> %r | ||
} | ||
|
||
; Same as above with an extra use of the extracted element. | ||
|
||
define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) { | ||
|
@@ -91,6 +149,21 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) { | |
ret <8 x float> %r | ||
} | ||
|
||
define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) { | ||
; CHECK-LABEL: @ext7_v4f32v8f32_use1( | ||
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 | ||
; CHECK-NEXT: call void @use(float [[E]]) | ||
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] | ||
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3 | ||
; CHECK-NEXT: ret <8 x float> [[R]] | ||
; | ||
%e = extractelement <4 x float> %x, i32 3 | ||
call void @use(float %e) | ||
%n = fneg float %e | ||
%r = insertelement <8 x float> %y, float %n, i32 3 | ||
ret <8 x float> %r | ||
} | ||
|
||
; Negative test - the transform is likely not profitable if the fneg has another use. | ||
|
||
define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) { | ||
|
@@ -108,6 +181,21 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) { | |
ret <8 x float> %r | ||
} | ||
|
||
define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) { | ||
; CHECK-LABEL: @ext7_v4f32v8f32_use2( | ||
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 | ||
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] | ||
; CHECK-NEXT: call void @use(float [[N]]) | ||
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3 | ||
; CHECK-NEXT: ret <8 x float> [[R]] | ||
; | ||
%e = extractelement <4 x float> %x, i32 3 | ||
%n = fneg float %e | ||
call void @use(float %n) | ||
%r = insertelement <8 x float> %y, float %n, i32 3 | ||
ret <8 x float> %r | ||
} | ||
|
||
; Negative test - can't convert variable index to a shuffle. | ||
|
||
define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) { | ||
|
@@ -123,6 +211,19 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 % | |
ret <2 x double> %r | ||
} | ||
|
||
define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y, i32 %index) { | ||
; CHECK-LABEL: @ext_index_var_v2f64v4f64( | ||
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]] | ||
; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] | ||
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]] | ||
; CHECK-NEXT: ret <4 x double> [[R]] | ||
; | ||
%e = extractelement <2 x double> %x, i32 %index | ||
%n = fneg nsz double %e | ||
%r = insertelement <4 x double> %y, double %n, i32 %index | ||
ret <4 x double> %r | ||
} | ||
|
||
; Negative test - require same extract/insert index for simple shuffle. | ||
; TODO: We could handle this by adjusting the cost calculation. | ||
|
||
|
@@ -139,6 +240,33 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) { | |
ret <2 x double> %r | ||
} | ||
|
||
; Negative test - extract from an index greater than the vector width of the destination | ||
define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) { | ||
; CHECK-LABEL: @ext3_v4f64v2f64( | ||
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3 | ||
; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] | ||
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1 | ||
; CHECK-NEXT: ret <2 x double> [[R]] | ||
; | ||
%e = extractelement <4 x double> %x, i32 3 | ||
%n = fneg nsz double %e | ||
%r = insertelement <2 x double> %y, double %n, i32 1 | ||
ret <2 x double> %r | ||
} | ||
|
||
define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) { | ||
; CHECK-LABEL: @ext1_v2f64v4f64_ins0( | ||
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 | ||
; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] | ||
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0 | ||
; CHECK-NEXT: ret <4 x double> [[R]] | ||
; | ||
%e = extractelement <2 x double> %x, i32 1 | ||
%n = fneg nsz double %e | ||
%r = insertelement <4 x double> %y, double %n, i32 0 | ||
ret <4 x double> %r | ||
} | ||
|
||
; Negative test - avoid changing poison ops | ||
|
||
define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) { | ||
|
@@ -154,6 +282,19 @@ define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) { | |
ret <4 x float> %r | ||
} | ||
|
||
define <4 x float> @ext12_v2f32v4f32(<2 x float> %x, <4 x float> %y) { | ||
; CHECK-LABEL: @ext12_v2f32v4f32( | ||
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 6 | ||
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] | ||
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12 | ||
; CHECK-NEXT: ret <4 x float> [[R]] | ||
; | ||
%e = extractelement <2 x float> %x, i32 6 | ||
%n = fneg float %e | ||
%r = insertelement <4 x float> %y, float %n, i32 12 | ||
ret <4 x float> %r | ||
} | ||
|
||
; This used to crash because we assumed matching a true, unary fneg instruction. | ||
|
||
define <2 x float> @ext1_v2f32_fsub(<2 x float> %x) { | ||
|
@@ -181,3 +322,16 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) { | |
%r = insertelement <2 x float> %y, float %s, i32 1 | ||
ret <2 x float> %r | ||
} | ||
|
||
define <4 x float> @ext1_v2f32v4f32_fsub_fmf(<2 x float> %x, <4 x float> %y) { | ||
; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf( | ||
; CHECK-NEXT: [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]] | ||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison> | ||
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3> | ||
; CHECK-NEXT: ret <4 x float> [[R]] | ||
; | ||
%e = extractelement <2 x float> %x, i32 1 | ||
%s = fsub nsz nnan float 0.0, %e | ||
%r = insertelement <4 x float> %y, float %s, i32 1 | ||
ret <4 x float> %r | ||
} |
Uh oh!
There was an error while loading. Please reload this page.