Skip to content

Commit db88071

Browse files
authored
[CostModel][X86] Attempt to match cheap v4f32 shuffles that map to SHUFPS instruction (llvm#121778)
Avoid always assuming the worst-case cost for two-input v4f32 shuffles, and match the SHUFPS pattern where possible: within each pair of output elements (elements 0-1 and elements 2-3), both elements must come from the same source register.
1 parent cb5d866 commit db88071

14 files changed

+186
-489
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2226,9 +2226,18 @@ InstructionCost X86TTIImpl::getShuffleCost(
22262226
{ TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
22272227
};
22282228

2229-
if (ST->hasSSE1())
2229+
if (ST->hasSSE1()) {
2230+
if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2231+
// SHUFPS: the two elements of each pair must come from the same source register (the two pairs may use different sources).
2232+
auto MatchSHUFPS = [](int X, int Y) {
2233+
return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2234+
};
2235+
if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2236+
return 1;
2237+
}
22302238
if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
22312239
return LT.first * Entry->Cost;
2240+
}
22322241

22332242
return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
22342243
}

llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll

Lines changed: 3 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -247,21 +247,9 @@ define <4 x float> @test_v4f32_2(<4 x float> %a, <4 x float> %b) {
247247
}
248248

249249
define <4 x float> @test_v4f32_3(<4 x float> %a, <4 x float> %b) {
250-
; SSE2-LABEL: 'test_v4f32_3'
251-
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
252-
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
253-
;
254-
; SSSE3-LABEL: 'test_v4f32_3'
255-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
256-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
257-
;
258-
; SSE42-LABEL: 'test_v4f32_3'
259-
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
260-
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
261-
;
262-
; AVX-LABEL: 'test_v4f32_3'
263-
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
264-
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
250+
; CHECK-LABEL: 'test_v4f32_3'
251+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
252+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
265253
;
266254
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
267255
ret <4 x float> %1

llvm/test/Analysis/CostModel/X86/reduction.ll

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -926,8 +926,8 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
926926

927927
define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
928928
; SSE2-LABEL: 'pairwise_reduction8float'
929-
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
930-
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
929+
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
930+
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
931931
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
932932
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
933933
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -939,8 +939,8 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
939939
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
940940
;
941941
; SSSE3-LABEL: 'pairwise_reduction8float'
942-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
943-
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
942+
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
943+
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
944944
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
945945
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
946946
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -952,8 +952,8 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
952952
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
953953
;
954954
; SSE42-LABEL: 'pairwise_reduction8float'
955-
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
956-
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
955+
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
956+
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
957957
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
958958
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
959959
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -991,8 +991,8 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
991991
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
992992
;
993993
; SLM-LABEL: 'pairwise_reduction8float'
994-
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
995-
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
994+
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
995+
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
996996
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
997997
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
998998
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>

0 commit comments

Comments
 (0)