-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[X86] Lower vXi8 multiplies using PMADDUBSW on SSSE3+ targets #95690
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) ChangesExtends #95403 to handle non-constant cases - we can avoid unpacks/extensions from vXi8 to vXi16 by using PMADDUBSW instead and truncating the vXi16 results back together. Most targets would benefit from performing this for non-constant cases as well - its just Intel Core/SandyBridge era CPUs that might experience additional Port0/15 contention. Fixes #90748 Patch is 273.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95690.diff 18 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f27c935812f51..02af650d69c75 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28506,17 +28506,19 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
- // For vXi8 mul-by-constant, try PMADDUBSW to avoid the need for extension.
+ // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
// Don't do this if we only need to unpack one half.
- if (Subtarget.hasSSSE3() &&
- ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
- bool IsLoLaneAllZeroOrUndef = true;
- bool IsHiLaneAllZeroOrUndef = true;
- for (auto [Idx, Val] : enumerate(B->ops())) {
- if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
- IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
- else
- IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+ if (Subtarget.hasSSSE3()) {
+ bool BIsBuildVector = isa<BuildVectorSDNode>(B);
+ bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
+ bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
+ if (BIsBuildVector) {
+ for (auto [Idx, Val] : enumerate(B->ops())) {
+ if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
+ IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+ else
+ IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+ }
}
if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
@@ -28531,6 +28533,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
}
}
+
// Extract the lo/hi parts to any extend to i16.
// We're going to mask off the low byte of each result element of the
// pmullw, so it doesn't matter what's in the high byte of each 16-bit
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 64cacd74153fe..dd97c1f590a31 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -852,8 +852,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
{ ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
- { ISD::MUL, MVT::v32i8, { 6, 11,10,11 } }, // extend/pmullw/trunc
- { ISD::MUL, MVT::v64i8, { 6, 12,10,11 } }, // unpack/pmullw
+ { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
+ { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
{ ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
{ ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
@@ -1119,7 +1119,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
{ ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
- { ISD::MUL, MVT::v32i8, { 6, 11,10,20 } }, // unpack/pmullw
+ { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
{ ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
{ ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
{ ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
@@ -1170,8 +1170,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
- { ISD::MUL, MVT::v32i8, { 12, 12, 22, 23 } }, // unpack/pmullw + split
- { ISD::MUL, MVT::v16i8, { 5, 6, 10, 12 } }, // unpack/pmullw
+ { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
+ { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
{ ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
{ ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
{ ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
@@ -1311,7 +1311,6 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
{ ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
- { ISD::MUL, MVT::v16i8, { 6, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
{ ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
};
@@ -1320,6 +1319,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
+ static const CostKindTblEntry SSSE3CostTable[] = {
+ { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
+ };
+
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
+ if (auto KindCost = Entry->Cost[CostKind])
+ return LT.first * *KindCost;
+
static const CostKindTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
index 050beb7fc25a3..f5ad65817950e 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
@@ -791,9 +791,9 @@ define i32 @mul(i32 %arg) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I8 = mul <2 x i8> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = mul <64 x i8> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; SSE42-LABEL: 'mul'
@@ -835,9 +835,9 @@ define i32 @mul(i32 %arg) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I8 = mul <2 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX2-LABEL: 'mul'
@@ -858,8 +858,8 @@ define i32 @mul(i32 %arg) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'mul'
@@ -880,7 +880,7 @@ define i32 @mul(i32 %arg) {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
@@ -902,8 +902,8 @@ define i32 @mul(i32 %arg) {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'mul'
@@ -924,7 +924,7 @@ define i32 @mul(i32 %arg) {
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll b/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
index 6cf278e98bd85..ed58f0f554e23 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
@@ -680,8 +680,8 @@ define i32 @mul(i32 %arg) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX2-LABEL: 'mul'
@@ -702,8 +702,8 @@ define i32 @mul(i32 %arg) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'mul'
@@ -724,7 +724,7 @@ define i32 @mul(i32 %arg) {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
@@ -746,8 +746,8 @@ define i32 @mul(i32 %arg) {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'mul'
@@ -768,7 +768,7 @@ define i32 @mul(i32 %arg) {
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
index b5ca132d8c51d..c9ee064822636 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
@@ -680,8 +680,8 @@ define i32 @mul(i32 %arg) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX2-LABEL: 'mul'
@@ -702,8 +702,8 @@ define i32 @mul(i32 %arg) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'mul'
@@ -724,7 +724,7 @@ define i32 @mul(i32 %arg) {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = mul <32 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
@@ -746,8 +746,8 @@ define i32 @mul(i32 %arg) {
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = mul <64 x i8> undef, undef
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'mul'
@@ -768,7 +768,7 @@ define i32 @mul(i32 %arg) {
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
ping? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Extends llvm#95403 to handle non-constant cases - we can avoid unpacks/extensions from vXi8 to vXi16 by using PMADDUBSW instead and truncating the vXi16 results back together. Most targets would benefit from performing this for non-constant cases as well - its just Intel Core/SandyBridge era CPUs that might experience additional Port0/15 contention. Fixes llvm#90748
ac4cb9e
to
4d90c89
Compare
…5690) Extends llvm#95403 to handle non-constant cases - we can avoid unpacks/extensions from vXi8 to vXi16 by using PMADDUBSW instead and truncating the vXi16 results back together. Most targets benefit from performing this for non-constant cases - its just Intel Core/SandyBridge era CPUs that might experience additional Port0/15 contention (but lower instruction count). Fixes llvm#90748
Extends #95403 to handle non-constant cases - we can avoid unpacks/extensions from vXi8 to vXi16 by using PMADDUBSW instead and truncating the vXi16 results back together.
Most targets would benefit from performing this for non-constant cases as well - its just Intel Core/SandyBridge era CPUs that might experience additional Port0/15 contention (but lower instruction count).
Fixes #90748