Skip to content

Commit baab4aa

Browse files
committed
[VectorCombine] convert scalar fneg with insert/extract to vector fneg
insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> shuffle DestVec, (fneg SrcVec), Mask. This is a specialized form of what could be a more general fold for a binop. It's also possible that fneg is overlooked by SLP in this kind of insert/extract pattern since it's a unary op. This shows up in the motivating example from issue #58139, but it won't solve it (that probably requires some x86-specific backend changes). There are also some small enhancements (see TODO comments) that can be done as follow-up patches. Differential Revision: https://reviews.llvm.org/D135278
1 parent 6ace81d commit baab4aa

File tree

2 files changed

+109
-19
lines changed

2 files changed

+109
-19
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "llvm/Support/CommandLine.h"
3131
#include "llvm/Transforms/Utils/Local.h"
3232
#include "llvm/Transforms/Vectorize.h"
33+
#include <numeric>
3334

3435
#define DEBUG_TYPE "vector-combine"
3536
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -97,6 +98,7 @@ class VectorCombine {
9798
void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
9899
Instruction &I);
99100
bool foldExtractExtract(Instruction &I);
101+
bool foldInsExtFNeg(Instruction &I);
100102
bool foldBitcastShuf(Instruction &I);
101103
bool scalarizeBinopOrCmp(Instruction &I);
102104
bool foldExtractedCmps(Instruction &I);
@@ -533,6 +535,67 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
533535
return true;
534536
}
535537

538+
/// Try to replace an extract + scalar fneg + insert with a vector fneg +
539+
/// shuffle.
540+
bool VectorCombine::foldInsExtFNeg(Instruction &I) {
541+
auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
542+
if (!VecTy)
543+
return false;
544+
545+
// Match an insert (op (extract)) pattern.
546+
Value *DestVec;
547+
uint64_t Index;
548+
Instruction *FNeg;
549+
if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
550+
m_ConstantInt(Index))))
551+
return false;
552+
553+
Value *SrcVec;
554+
if (!match(FNeg, m_FNeg(m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))
555+
return false;
556+
557+
if (SrcVec->getType() != VecTy)
558+
return false;
559+
560+
// Ignore bogus insert/extract index.
561+
unsigned NumElts = VecTy->getNumElements();
562+
if (Index >= NumElts)
563+
return false;
564+
565+
// We are inserting the negated element into the same lane that we extracted
566+
// from. This is equivalent to a select-shuffle that chooses all but the
567+
// negated element from the destination vector.
568+
SmallVector<int> Mask(NumElts);
569+
std::iota(Mask.begin(), Mask.end(), 0);
570+
Mask[Index] = Index + NumElts;
571+
572+
Type *ScalarTy = VecTy->getScalarType();
573+
InstructionCost OldCost =
574+
TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
575+
TTI.getVectorInstrCost(I, VecTy, Index);
576+
577+
// If the extract has one use, it will be eliminated, so count it in the
578+
// original cost. If it has more than one use, ignore the cost because it will
579+
// be the same before/after.
580+
Instruction *Extract = cast<Instruction>(FNeg->getOperand(0));
581+
if (Extract->hasOneUse())
582+
OldCost += TTI.getVectorInstrCost(*Extract, VecTy, Index);
583+
584+
InstructionCost NewCost =
585+
TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
586+
TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
587+
588+
if (NewCost > OldCost)
589+
return false;
590+
591+
// insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
592+
// shuffle DestVec, (fneg SrcVec), Mask
593+
Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
594+
Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
595+
replaceValue(I, *Shuf);
596+
return true;
597+
}
598+
536599
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
537600
/// destination type followed by shuffle. This can enable further transforms by
538601
/// moving bitcasts or shuffles together.
@@ -1571,6 +1634,7 @@ bool VectorCombine::run() {
15711634
if (!ScalarizationOnly) {
15721635
MadeChange |= vectorizeLoadInsert(I);
15731636
MadeChange |= foldExtractExtract(I);
1637+
MadeChange |= foldInsExtFNeg(I);
15741638
MadeChange |= foldBitcastShuf(I);
15751639
MadeChange |= foldExtractedCmps(I);
15761640
MadeChange |= foldShuffleOfBinops(I);

llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK
3-
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK
2+
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
3+
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
44

55
declare void @use(float)
66

7+
; TODO: The insert is costed as free, so creating a shuffle appears to be a loss.
8+
79
define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
810
; CHECK-LABEL: @ext0_v4f32(
911
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
@@ -21,9 +23,8 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
2123

2224
define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
2325
; CHECK-LABEL: @ext2_v4f32(
24-
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
25-
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
26-
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 2
26+
; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
27+
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
2728
; CHECK-NEXT: ret <4 x float> [[R]]
2829
;
2930
%e = extractelement <4 x float> %x, i32 2
@@ -36,9 +37,8 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
3637

3738
define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
3839
; CHECK-LABEL: @ext1_v2f64(
39-
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
40-
; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
41-
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
40+
; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
41+
; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
4242
; CHECK-NEXT: ret <2 x double> [[R]]
4343
;
4444
%e = extractelement <2 x double> %x, i32 1
@@ -47,26 +47,43 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
4747
ret <2 x double> %r
4848
}
4949

50+
; The vector fneg would cost twice as much as the scalar op with SSE,
51+
; so we don't transform there (the shuffle would also be more expensive).
52+
5053
define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
51-
; CHECK-LABEL: @ext7_v8f32(
52-
; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
53-
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
54-
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
55-
; CHECK-NEXT: ret <8 x float> [[R]]
54+
; SSE-LABEL: @ext7_v8f32(
55+
; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
56+
; SSE-NEXT: [[N:%.*]] = fneg float [[E]]
57+
; SSE-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
58+
; SSE-NEXT: ret <8 x float> [[R]]
59+
;
60+
; AVX-LABEL: @ext7_v8f32(
61+
; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
62+
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
63+
; AVX-NEXT: ret <8 x float> [[R]]
5664
;
5765
%e = extractelement <8 x float> %x, i32 7
5866
%n = fneg float %e
5967
%r = insertelement <8 x float> %y, float %n, i32 7
6068
ret <8 x float> %r
6169
}
6270

71+
; Same as above with an extra use of the extracted element.
72+
6373
define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
64-
; CHECK-LABEL: @ext7_v8f32_use1(
65-
; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
66-
; CHECK-NEXT: call void @use(float [[E]])
67-
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
68-
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 5
69-
; CHECK-NEXT: ret <8 x float> [[R]]
74+
; SSE-LABEL: @ext7_v8f32_use1(
75+
; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
76+
; SSE-NEXT: call void @use(float [[E]])
77+
; SSE-NEXT: [[N:%.*]] = fneg float [[E]]
78+
; SSE-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 5
79+
; SSE-NEXT: ret <8 x float> [[R]]
80+
;
81+
; AVX-LABEL: @ext7_v8f32_use1(
82+
; AVX-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
83+
; AVX-NEXT: call void @use(float [[E]])
84+
; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X]]
85+
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 6, i32 7>
86+
; AVX-NEXT: ret <8 x float> [[R]]
7087
;
7188
%e = extractelement <8 x float> %x, i32 5
7289
call void @use(float %e)
@@ -75,6 +92,8 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
7592
ret <8 x float> %r
7693
}
7794

95+
; Negative test - the transform is likely not profitable if the fneg has another use.
96+
7897
define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
7998
; CHECK-LABEL: @ext7_v8f32_use2(
8099
; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 3
@@ -90,6 +109,8 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
90109
ret <8 x float> %r
91110
}
92111

112+
; Negative test - can't convert variable index to a shuffle.
113+
93114
define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) {
94115
; CHECK-LABEL: @ext_index_var_v2f64(
95116
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
@@ -103,6 +124,9 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %
103124
ret <2 x double> %r
104125
}
105126

127+
; Negative test - require same extract/insert index for simple shuffle.
128+
; TODO: We could handle this by adjusting the cost calculation.
129+
106130
define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
107131
; CHECK-LABEL: @ext1_v2f64_ins0(
108132
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
@@ -116,6 +140,8 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
116140
ret <2 x double> %r
117141
}
118142

143+
; Negative test - avoid changing poison ops
144+
119145
define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
120146
; CHECK-LABEL: @ext12_v4f32(
121147
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 12

0 commit comments

Comments
 (0)