-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SLP]Fix the cost of the adjusted extracts in per-register analysis. #96808
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Fix the cost of the adjusted extracts in per-register analysis. #96808
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) Changes: Previous patch did not pass the list of the extract indices by reference, so the compiler just ignored them. Pass indices by reference and fix the per-register analysis. Patch is 31.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96808.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 974f966d46e81..5d6ddb787ac3c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8304,20 +8304,31 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
});
// FIXME: this must be moved to TTI for better estimation.
unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
- auto CheckPerRegistersShuffle =
- [&](MutableArrayRef<int> Mask,
- SmallVector<int> Indices) -> std::optional<TTI::ShuffleKind> {
+ auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
+ SmallVectorImpl<unsigned> &Indices)
+ -> std::optional<TTI::ShuffleKind> {
if (NumElts <= EltsPerVector)
return std::nullopt;
+ int OffsetReg0 =
+ alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
+ [](int S, int I) {
+ if (I == PoisonMaskElem)
+ return S;
+ return std::min(S, I);
+ }),
+ EltsPerVector);
+ int OffsetReg1 = OffsetReg0;
DenseSet<int> RegIndices;
// Check that if trying to permute same single/2 input vectors.
TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
int FirstRegId = -1;
- Indices.assign(1, -1);
- for (int &I : Mask) {
+ Indices.assign(1, OffsetReg0);
+ for (auto [Pos, I] : enumerate(Mask)) {
if (I == PoisonMaskElem)
continue;
- int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
+ int Idx = I - OffsetReg0;
+ int RegId =
+ (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
if (FirstRegId < 0)
FirstRegId = RegId;
RegIndices.insert(RegId);
@@ -8325,14 +8336,25 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return std::nullopt;
if (RegIndices.size() == 2) {
ShuffleKind = TTI::SK_PermuteTwoSrc;
- if (Indices.size() == 1)
- Indices.push_back(-1);
+ if (Indices.size() == 1) {
+ OffsetReg1 = alignDown(
+ std::accumulate(
+ std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
+ [&](int S, int I) {
+ if (I == PoisonMaskElem)
+ return S;
+ int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
+ ((I - OffsetReg0) % NumElts) / EltsPerVector;
+ if (RegId == FirstRegId)
+ return S;
+ return std::min(S, I);
+ }),
+ EltsPerVector);
+ Indices.push_back(OffsetReg1);
+ }
+ Idx = I - OffsetReg1;
}
- if (RegId == FirstRegId)
- Indices.front() = I % NumElts;
- else
- Indices.back() = I % NumElts;
- I = (I % NumElts) % EltsPerVector +
+ I = (Idx % NumElts) % EltsPerVector +
(RegId == FirstRegId ? 0 : EltsPerVector);
}
return ShuffleKind;
@@ -8349,7 +8371,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
copy(MaskSlice, SubMask.begin());
- SmallVector<int> Indices;
+ SmallVector<unsigned, 2> Indices;
std::optional<TTI::ShuffleKind> RegShuffleKind =
CheckPerRegistersShuffle(SubMask, Indices);
if (!RegShuffleKind) {
@@ -8367,12 +8389,21 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
FixedVectorType::get(ScalarTy, EltsPerVector),
SubMask);
}
- for (int Idx : Indices) {
+ for (unsigned Idx : Indices) {
Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
FixedVectorType::get(ScalarTy, NumElts),
std::nullopt, CostKind, Idx,
FixedVectorType::get(ScalarTy, EltsPerVector));
}
+ // Second attempt to check, if just a permute is better estimated than
+ // subvector extract.
+ SubMask.assign(NumElts, PoisonMaskElem);
+ copy(MaskSlice, SubMask.begin());
+ InstructionCost OriginalCost =
+ ::getShuffleCost(TTI, *ShuffleKinds[Part],
+ FixedVectorType::get(ScalarTy, NumElts), SubMask);
+ if (OriginalCost < Cost)
+ Cost = OriginalCost;
}
return Cost;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 9608608a18098..059e4c38b519b 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -155,11 +155,13 @@ define <4 x float> @exp_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
+; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
+; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
; DEFAULT-LABEL: define <4 x float> @exp_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -171,11 +173,13 @@ define <4 x float> @exp_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
+; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
+; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -208,11 +212,13 @@ define <4 x float> @int_exp_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
+; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
+; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
; DEFAULT-LABEL: define <4 x float> @int_exp_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -224,11 +230,13 @@ define <4 x float> @int_exp_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -261,11 +269,13 @@ define <4 x float> @log_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
+; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
+; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
; DEFAULT-LABEL: define <4 x float> @log_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -277,11 +287,13 @@ define <4 x float> @log_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
+; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
+; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -314,11 +326,13 @@ define <4 x float> @int_log_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
+; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
+; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
; DEFAULT-LABEL: define <4 x float> @int_log_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -330,11 +344,13 @@ define <4 x float> @int_log_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -367,11 +383,13 @@ define <4 x float> @sin_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
+; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
+; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
; DEFAULT-LABEL: define <4 x float> @sin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -383,11 +401,13 @@ define <4 x float> @sin_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
+; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
+; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -420,11 +440,13 @@ define <4 x float> @int_sin_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = e...
[truncated]
|
if (I == PoisonMaskElem) | ||
return S; | ||
int RegId = ((I - OffsetReg0) / NumElts) * NumParts + | ||
((I - OffsetReg0) % NumElts) / EltsPerVector; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is this doing? Don't we just need the PartIdx?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We analyze the remaining part of the mask to find the indices that come from the second register. That remaining part may still contain indices from the first register, so the code calculates the register id for each remaining index and takes the minimal index only(!) among those that belong to the second register.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
Previous patch did not pass the list of the extract indices by reference, so the compiler just ignored them. Pass indices by reference and fix the per-register analysis. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #96808
Previous patch did not pass the list of the extract indices by reference, so the compiler just ignored them. Pass indices by reference and fix the per-register analysis. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: llvm#96808
Previous patch did not pass the list of the extract indices by reference, so the compiler just ignored them. Pass indices by reference and fix the per-register analysis. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: llvm#96808
Previous patch did not pass the list of the extract indices by
reference, so the compiler just ignored them. Pass indices by reference
and fix the per-register analysis.