[SLP]Try to keep operand of external casts as scalars, if profitable #110537
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

If the cost of the original scalar instruction + cast is better than the extractelement from the vector cast instruction, it is better to keep the original scalar instructions, where possible.

Patch is 54.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110537.diff

3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e45fcb2b5c790c..72fa37bd849bd9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11563,6 +11563,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
+ SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
for (ExternalUser &EU : ExternalUses) {
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
@@ -11698,7 +11699,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// Can use original instruction, if no operands vectorized or they are
// marked as externally used already.
auto *Inst = cast<Instruction>(EU.Scalar);
- bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
+ InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
+ auto OperandIsScalar = [&](Value *V) {
if (!getTreeEntry(V)) {
// Some extractelements might be not vectorized, but
// transformed into shuffle and removed from the function,
@@ -11708,9 +11710,20 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
return true;
}
return ValueToExtUses->contains(V);
- });
+ };
+ bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
+ bool CanBeUsedAsScalarCast = false;
+ if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
+ if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
+ Op && all_of(Op->operands(), OperandIsScalar)) {
+ InstructionCost OpCost = TTI->getInstructionCost(Op, CostKind);
+ if (ScalarCost + OpCost <= ExtraCost) {
+ CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
+ ScalarCost += OpCost;
+ }
+ }
+ }
if (CanBeUsedAsScalar) {
- InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
bool KeepScalar = ScalarCost <= ExtraCost;
// Try to keep original scalar if the user is the phi node from the same
// block as the root phis, currently vectorized. It allows to keep
@@ -11766,12 +11779,20 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
ExtraCost = ScalarCost;
if (!IsPhiInLoop(EU))
ExtractsCount[Entry].insert(Inst);
+ if (CanBeUsedAsScalarCast)
+ ScalarOpsFromCasts.insert(Inst->getOperand(0));
}
}
}
ExtractCost += ExtraCost;
}
+ // Insert externals for extract of operands of casts to be emitted as scalars
+ // instead of extractelement.
+ for (Value *V : ScalarOpsFromCasts) {
+ ExternalUsesAsOriginalScalar.insert(V);
+ ExternalUses.emplace_back(V, nullptr, getTreeEntry(V)->findLaneForValue(V));
+ }
// Add reduced value cost, if resized.
if (!VectorizedVals.empty()) {
const TreeEntry &Root = *VectorizableTree.front();
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 803af4d166b213..b38c636ccaf5da 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -54,38 +54,43 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 5
; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 5
; CHECK-NEXT: [[TMP19:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP192:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
-; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP192]], [[TMP22]]
-; CHECK-NEXT: [[TMP29:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP40:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
-; CHECK-NEXT: [[TMP26:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP40]], [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
+; CHECK-NEXT: [[TMP84:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT: [[TMP22:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
+; CHECK-NEXT: [[TMP30:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]]
+; CHECK-NEXT: [[TMP42:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP42]] to <2 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
+; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP49]]
; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
-; CHECK-NEXT: [[TMP31:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
-; CHECK-NEXT: [[TMP33:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
-; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP31:%.*]] = add <2 x i32> [[TMP25]], [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; CHECK-NEXT: [[TMP83:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT: [[TMP35:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32>
+; CHECK-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP51]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP56:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
+; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP56]]
; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
-; CHECK-NEXT: [[TMP56:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
-; CHECK-NEXT: [[TMP60:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
-; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP56]], i32 0
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP56]], i32 1
+; CHECK-NEXT: [[TMP59:%.*]] = add <2 x i32> [[TMP37]], [[TMP52]]
+; CHECK-NEXT: [[TMP63:%.*]] = add <2 x i32> [[TMP59]], [[TMP31]]
+; CHECK-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP31]], [[TMP59]]
+; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP63]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP63]], i32 1
; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]]
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP60]], i32 0
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP60]], i32 1
-; CHECK-NEXT: [[TMP68:%.*]] = sub i32 [[TMP47]], [[TMP48]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
+; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
@@ -94,52 +99,55 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP52:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT: [[TMP76:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT: [[TMP77:%.*]] = zext i8 [[TMP76]] to i32
; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT: [[TMP55:%.*]] = sub <2 x i32> [[TMP52]], [[TMP61]]
+; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT: [[TMP58:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]]
; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT: [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32>
-; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP55]]
-; CHECK-NEXT: [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP74]], [[TMP82]]
+; CHECK-NEXT: [[TMP82:%.*]] = add <2 x i32> [[TMP46]], [[TMP58]]
+; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32>
+; CHECK-NEXT: [[TMP68:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
+; CHECK-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP94]], [[TMP103]]
; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT: [[TMP103:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP72:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
-; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP94]], [[TMP72]]
+; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; CHECK-NEXT: [[TMP74:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
+; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP108]], [[TMP75]]
; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP70]], [[TMP85]]
-; CHECK-NEXT: [[TMP76:%.*]] = add <2 x i32> [[TMP75]], [[TMP62]]
-; CHECK-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP62]], [[TMP75]]
-; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0
-; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1
+; CHECK-NEXT: [[TMP109:%.*]] = add <2 x i32> [[TMP70]], [[TMP106]]
+; CHECK-NEXT: [[TMP79:%.*]] = add <2 x i32> [[TMP109]], [[TMP82]]
+; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP82]], [[TMP109]]
+; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
-; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP115]], i32 0
+; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP115]], i32 1
; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i32> [[TMP52]], i32 0
+; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
+; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15
; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; CHECK-NEXT: [[TMP83:%.*]] = extractelement <2 x i32> [[TMP32]], i32 0
+; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
+; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15
; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
; CHECK-NEXT: [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535
-; CHECK-NEXT: [[TMP84:%.*]] = extractelement <2 x i32> [[TMP192]], i32 0
; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15
; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
; CHECK-NEXT: [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535
@@ -147,60 +155,62 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[TMP107]], [[TMP68]]
-; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP68]], [[TMP107]]
+; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
+; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15
; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
; CHECK-NEXT: [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP109]]
+; CHECK-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
+; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP117]]
; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
-; CHECK-NEXT: [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP121]], [[TMP100]]
+; CHECK-NEXT: [[TMP119:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
+; CHECK-NEXT: [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32>
+; CHECK-NEXT: [[TMP132:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1...
[truncated]
LGTM
If the cost of the original scalar instruction + cast is better than the extractelement from the vector cast instruction, it is better to keep the original scalar instructions, where possible.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: llvm/llvm-project#110537
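To illustrate the intent, here is a minimal, hypothetical IR sketch; the function and value names are invented for this example and do not come from the patch or its tests. An external scalar user of one lane of a vectorized cast previously had to read that lane through an extractelement of the vector cast. With this change, when the cost of the scalar cast plus its scalar operand does not exceed the cost of the extract (the ScalarCost + OpCost <= ExtraCost check in the hunk above), the original scalar chain is kept instead, which is the same pattern the regenerated CHECK lines show (a scalar load i8 + zext i8 ... to i32 replacing an extractelement <2 x i32>).

Without the change, the external user goes through the vector cast:

  define i32 @external_use_via_extract(ptr %p) {
  entry:
    %v = load <2 x i8>, ptr %p, align 1
    %vext = zext <2 x i8> %v to <2 x i32>
    ; (the rest of the vectorized tree would use %vext here)
    %lane = extractelement <2 x i32> %vext, i32 0
    %use = lshr i32 %lane, 15
    ret i32 %use
  }

With the change, if the scalar chain is estimated to be cheaper, the cast and its operand stay scalar and the extract disappears:

  define i32 @external_use_kept_scalar(ptr %p) {
  entry:
    %v = load <2 x i8>, ptr %p, align 1    ; vector code is unchanged
    %vext = zext <2 x i8> %v to <2 x i32>
    ; (the rest of the vectorized tree still uses %vext here)
    %s = load i8, ptr %p, align 1          ; scalar operand of the cast, kept as a scalar
    %sext = zext i8 %s to i32              ; scalar cast serving the external user
    %use = lshr i32 %sext, 15              ; no extractelement needed
    ret i32 %use
  }

This also matches why the patch records the cast's operand in ScalarOpsFromCasts and re-adds it to ExternalUses: the operand now has a scalar user outside the vectorized tree and must remain available as a scalar.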