Skip to content

Commit ecbbe5b

Browse files
[SLP]Fix mask building for alternate node cost estimation (#102966)
Need to to use same functionality in cost model, as for the codegen, to correctly build the shuffle mask and estimate the cost.
1 parent b368404 commit ecbbe5b

File tree

3 files changed

+36
-44
lines changed

3 files changed

+36
-44
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10135,13 +10135,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1013510135
}
1013610136
SmallVector<int> Mask;
1013710137
E->buildAltOpShuffleMask(
10138-
[E](Instruction *I) {
10138+
[&](Instruction *I) {
1013910139
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
10140-
return I->getOpcode() == E->getAltOpcode();
10140+
return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
10141+
*TLI);
1014110142
},
1014210143
Mask);
1014310144
VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
10144-
FinalVecTy, Mask);
10145+
FinalVecTy, Mask, CostKind);
1014510146
// Patterns like [fadd,fsub] can be combined into a single instruction
1014610147
// in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
1014710148
// need to take into account their order when looking for the most used

llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -86,16 +86,17 @@ return:
8686
define float @test_merge_anyof_v4sf(<4 x float> %t) {
8787
; CHECK-LABEL: @test_merge_anyof_v4sf(
8888
; CHECK-NEXT: entry:
89-
; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T:%.*]]
90-
; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer
91-
; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[T_FR]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
92-
; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[TMP0]]
93-
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
94-
; CHECK-NEXT: [[OP_RDX_NOT:%.*]] = icmp eq i4 [[TMP3]], 0
95-
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
96-
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[T_FR]], [[SHIFT]]
97-
; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP4]], i64 0
98-
; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OP_RDX_NOT]], float [[ADD]], float 0.000000e+00
89+
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[T:%.*]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
90+
; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <8 x float> [[TMP0]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
91+
; CHECK-NEXT: [[TMP2:%.*]] = fcmp olt <8 x float> [[TMP0]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
92+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
93+
; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
94+
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
95+
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0
96+
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
97+
; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[SHIFT]], [[T]]
98+
; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP6]], i64 0
99+
; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[ADD]], float 0.000000e+00
99100
; CHECK-NEXT: ret float [[RETVAL_0]]
100101
;
101102
entry:
@@ -400,16 +401,18 @@ return:
400401
define float @test_merge_anyof_v4si(<4 x i32> %t) {
401402
; CHECK-LABEL: @test_merge_anyof_v4si(
402403
; CHECK-NEXT: entry:
403-
; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x i32> [[T:%.*]]
404-
; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[T_FR]], <i32 -256, i32 -256, i32 -256, i32 -256>
405-
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], <i32 -255, i32 -255, i32 -255, i32 -255>
406-
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i1> [[TMP1]] to i4
407-
; CHECK-NEXT: [[OP_RDX_NOT:%.*]] = icmp eq i4 [[TMP2]], 0
408-
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
409-
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[T_FR]], [[SHIFT]]
410-
; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
404+
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[T:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
405+
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i32> [[TMP0]], <i32 255, i32 255, i32 255, i32 255, i32 1, i32 1, i32 1, i32 1>
406+
; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP0]], <i32 255, i32 255, i32 255, i32 255, i32 1, i32 1, i32 1, i32 1>
407+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
408+
; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
409+
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
410+
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0
411+
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
412+
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T]]
413+
; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP6]], i64 0
411414
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float
412-
; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OP_RDX_NOT]], float [[CONV]], float 0.000000e+00
415+
; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[CONV]], float 0.000000e+00
413416
; CHECK-NEXT: ret float [[RETVAL_0]]
414417
;
415418
entry:

llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -388,28 +388,16 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
388388
}
389389

390390
define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) {
391-
; SSE-LABEL: @logical_and_icmp_clamp_pred_diff(
392-
; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
393-
; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
394-
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
395-
; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
396-
; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
397-
; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
398-
; SSE-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
399-
; SSE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
400-
; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
401-
; SSE-NEXT: ret i1 [[OP_RDX]]
402-
;
403-
; AVX-LABEL: @logical_and_icmp_clamp_pred_diff(
404-
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
405-
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 42, i32 42, i32 42, i32 poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 3>
406-
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 poison, i32 poison, i32 poison, i32 42>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 15>
407-
; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]]
408-
; AVX-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]]
409-
; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
410-
; AVX-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]]
411-
; AVX-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]])
412-
; AVX-NEXT: ret i1 [[TMP8]]
391+
; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff(
392+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
393+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 42, i32 42, i32 42, i32 poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 3>
394+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 poison, i32 poison, i32 poison, i32 42>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 15>
395+
; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]]
396+
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]]
397+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
398+
; CHECK-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]]
399+
; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]])
400+
; CHECK-NEXT: ret i1 [[TMP8]]
413401
;
414402
%x0 = extractelement <4 x i32> %x, i32 0
415403
%x1 = extractelement <4 x i32> %x, i32 1

0 commit comments

Comments
 (0)