-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[X86] Split rr/rm CVT schedules on SNB/HSW/BDW #117494
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
The folded load variants almost never require Port5 for length changing conversions (just for SNB ymm cases), and don't have an extra uop for the load. Confirmed with a mixture of Agner + uops.info comparisons.
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) Changes: The folded load variants almost never require Port5 for length changing conversions (just for SNB ymm cases), and don't have an extra uop for the load. Confirmed with a mixture of Agner + uops.info comparisons. Patch is 48.37 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117494.diff 15 Files Affected:
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 699ca91cd1f8f4..e5b3cc4b6c90e6 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -367,21 +367,26 @@ defm : BWWriteResPair<WriteCvtPD2IY, [BWPort1,BWPort5], 6, [1,1], 2, 6>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
defm : X86WriteRes<WriteCvtI2SS, [BWPort1,BWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PS, [BWPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteCvtI2PSY, [BWPort1], 3, [1], 1>;
defm : X86WriteRes<WriteCvtI2SSLd, [BWPort1,BWPort23], 9, [1,1], 2>;
-defm : BWWriteResPair<WriteCvtI2PS, [BWPort1], 3>;
-defm : BWWriteResPair<WriteCvtI2PSY, [BWPort1], 3, [1], 1, 6>;
+defm : X86WriteRes<WriteCvtI2PSLd, [BWPort1,BWPort23], 8, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PSYLd, [BWPort1,BWPort23], 9, [1,1], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
defm : X86WriteRes<WriteCvtI2SD, [BWPort1,BWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PD, [BWPort1,BWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDY, [BWPort1,BWPort5], 6, [1,1], 2>;
defm : X86WriteRes<WriteCvtI2SDLd, [BWPort1,BWPort23], 9, [1,1], 2>;
-defm : BWWriteResPair<WriteCvtI2PD, [BWPort1,BWPort5], 4, [1,1], 2, 5>;
-defm : BWWriteResPair<WriteCvtI2PDY, [BWPort1,BWPort5], 6, [1,1], 2, 5>;
+defm : X86WriteRes<WriteCvtI2PDLd, [BWPort1,BWPort23], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDYLd, [BWPort1,BWPort23],11, [1,1], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
defm : X86WriteRes<WriteCvtSS2SD, [BWPort0,BWPort5], 2, [1,1], 2>;
-defm : X86WriteRes<WriteCvtSS2SDLd, [BWPort0,BWPort23], 6, [1,1], 2>;
defm : X86WriteRes<WriteCvtPS2PD, [BWPort0,BWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDY, [BWPort0,BWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtSS2SDLd, [BWPort0,BWPort23], 6, [1,1], 2>;
defm : X86WriteRes<WriteCvtPS2PDLd, [BWPort0,BWPort23], 6, [1,1], 2>;
-defm : BWWriteResPair<WriteCvtPS2PDY, [BWPort0,BWPort5], 4, [1,1], 2, 5>;
+defm : X86WriteRes<WriteCvtPS2PDYLd, [BWPort0,BWPort23], 9, [1,1], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
defm : BWWriteResPair<WriteCvtSD2SS, [BWPort1,BWPort5], 4, [1,1], 2, 5>;
defm : BWWriteResPair<WriteCvtPD2PS, [BWPort1,BWPort5], 4, [1,1], 2, 5>;
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index b820418bb55191..59874be34f5a28 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -364,22 +364,30 @@ defm : HWWriteResPair<WriteCvtPS2IY, [HWPort1], 3, [1], 1, 7>;
defm : HWWriteResPair<WriteCvtPS2IZ, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
defm : X86WriteRes<WriteCvtI2SD, [HWPort1,HWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PD, [HWPort1,HWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDY, [HWPort1,HWPort5], 6, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDZ, [HWPort1,HWPort5], 6, [1,1], 2>; // Unsupported = 1
defm : X86WriteRes<WriteCvtI2SDLd, [HWPort1,HWPort23], 9, [1,1], 2>;
-defm : HWWriteResPair<WriteCvtI2PD, [HWPort1,HWPort5], 4, [1,1], 2, 6>;
-defm : HWWriteResPair<WriteCvtI2PDY, [HWPort1,HWPort5], 6, [1,1], 2, 6>;
-defm : HWWriteResPair<WriteCvtI2PDZ, [HWPort1,HWPort5], 6, [1,1], 2, 6>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtI2PDLd, [HWPort1,HWPort23],10, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDYLd, [HWPort1,HWPort23],12, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PDZLd, [HWPort1,HWPort23],12, [1,1], 2>; // Unsupported = 1
defm : X86WriteRes<WriteCvtI2SS, [HWPort1,HWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PS, [HWPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteCvtI2PSY, [HWPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteCvtI2PSZ, [HWPort1], 3, [1], 1>; // Unsupported = 1
defm : X86WriteRes<WriteCvtI2SSLd, [HWPort1,HWPort23], 9, [1,1], 2>;
-defm : HWWriteResPair<WriteCvtI2PS, [HWPort1], 3, [1], 1, 6>;
-defm : HWWriteResPair<WriteCvtI2PSY, [HWPort1], 3, [1], 1, 7>;
-defm : HWWriteResPair<WriteCvtI2PSZ, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtI2PSLd, [HWPort1,HWPort23], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PSYLd, [HWPort1,HWPort23],10, [1,1], 2>;
+defm : X86WriteRes<WriteCvtI2PSZLd, [HWPort1,HWPort23],10, [1,1], 2>; // Unsupported = 1
defm : X86WriteRes<WriteCvtSS2SD, [HWPort0,HWPort5], 2, [1,1], 2>;
-defm : X86WriteRes<WriteCvtSS2SDLd, [HWPort0,HWPort23], 7, [1,1], 2>;
defm : X86WriteRes<WriteCvtPS2PD, [HWPort0,HWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDY, [HWPort0,HWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDZ, [HWPort0,HWPort5], 4, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtSS2SDLd, [HWPort0,HWPort23], 7, [1,1], 2>;
defm : X86WriteRes<WriteCvtPS2PDLd, [HWPort0,HWPort23], 6, [1,1], 2>;
-defm : HWWriteResPair<WriteCvtPS2PDY, [HWPort0,HWPort5], 4, [1,1], 2, 6>;
-defm : HWWriteResPair<WriteCvtPS2PDZ, [HWPort0,HWPort5], 4, [1,1], 2, 6>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPS2PDYLd, [HWPort0,HWPort23],10, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PDZLd, [HWPort0,HWPort23],10, [1,1], 2>; // Unsupported = 1
defm : HWWriteResPair<WriteCvtSD2SS, [HWPort1,HWPort5], 4, [1,1], 2, 5>;
defm : HWWriteResPair<WriteCvtPD2PS, [HWPort1,HWPort5], 4, [1,1], 2, 6>;
defm : HWWriteResPair<WriteCvtPD2PSY, [HWPort1,HWPort5], 6, [1,1], 2, 6>;
@@ -983,7 +991,6 @@ def HWWriteResGroup12 : SchedWriteRes<[HWPort1,HWPort23]> {
let NumMicroOps = 2;
let ReleaseAtCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup12], (instrs MMX_CVTPI2PSrm)>;
def: InstRW<[HWWriteResGroup12], (instregex "P(DEP|EXT)(32|64)rm")>;
def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> {
@@ -1349,13 +1356,6 @@ def HWWriteResGroup75 : SchedWriteRes<[HWPort1,HWPort23]> {
}
def: InstRW<[HWWriteResGroup75], (instregex "FICOM(P?)(16|32)m")>;
-def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ReleaseAtCycles = [1,1,1];
-}
-def: InstRW<[HWWriteResGroup78_1], (instrs MMX_CVTPI2PDrm)>;
-
def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> {
let Latency = 9;
let NumMicroOps = 3;
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 7be9f51bcd46bd..6939b1227d0a61 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -348,13 +348,14 @@ defm : X86WriteRes<WriteCvtI2PDLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>
defm : X86WriteRes<WriteCvtI2PDYLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>;
defm : X86WriteRes<WriteCvtI2PDZLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>; // Unsupported = 1
-defm : SBWriteResPair<WriteCvtSS2SD, [SBPort0], 1, [1], 1, 6>;
+defm : X86WriteRes<WriteCvtSS2SD, [SBPort0,SBPort5], 1, [1,1], 2>;
defm : X86WriteRes<WriteCvtPS2PD, [SBPort0,SBPort5], 2, [1,1], 2>;
defm : X86WriteRes<WriteCvtPS2PDY, [SBPort0,SBPort5], 2, [1,1], 2>;
defm : X86WriteRes<WriteCvtPS2PDZ, [SBPort0,SBPort5], 2, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtSS2SDLd, [SBPort0,SBPort23], 7, [1,1], 2>;
defm : X86WriteRes<WriteCvtPS2PDLd, [SBPort0,SBPort23], 7, [1,1], 2>;
-defm : X86WriteRes<WriteCvtPS2PDYLd, [SBPort0,SBPort23], 7, [1,1], 2>;
-defm : X86WriteRes<WriteCvtPS2PDZLd, [SBPort0,SBPort23], 7, [1,1], 2>; // Unsupported = 1
+defm : X86WriteRes<WriteCvtPS2PDYLd, [SBPort0,SBPort5,SBPort23], 7, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtPS2PDZLd, [SBPort0,SBPort5,SBPort23], 7, [1,1,1], 3>; // Unsupported = 1
defm : SBWriteResPair<WriteCvtSD2SS, [SBPort1,SBPort5], 4, [1,1], 2, 6>;
defm : SBWriteResPair<WriteCvtPD2PS, [SBPort1,SBPort5], 4, [1,1], 2, 6>;
defm : SBWriteResPair<WriteCvtPD2PSY, [SBPort1,SBPort5], 4, [1,1], 2, 7>;
diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
index df0053a1dcb9b5..25f79397fa071d 100644
--- a/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
@@ -448,7 +448,7 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 2 4 1.00 cvtsi2sd %rcx, %xmm2
# CHECK-NEXT: 2 9 1.00 * cvtsi2sdl (%rax), %xmm2
# CHECK-NEXT: 2 9 1.00 * cvtsi2sdq (%rax), %xmm2
-# CHECK-NEXT: 1 1 1.00 cvtss2sd %xmm0, %xmm2
+# CHECK-NEXT: 2 1 1.00 cvtss2sd %xmm0, %xmm2
# CHECK-NEXT: 2 7 1.00 * cvtss2sd (%rax), %xmm2
# CHECK-NEXT: 2 4 1.00 cvttpd2dq %xmm0, %xmm2
# CHECK-NEXT: 3 10 1.00 * cvttpd2dq (%rax), %xmm2
@@ -687,7 +687,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - 172.00 75.83 117.33 17.00 101.83 67.00 67.00
+# CHECK-NEXT: - 172.00 75.83 117.33 17.00 102.83 67.00 67.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -732,7 +732,7 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - 1.00 - - cvtsi2sd %rcx, %xmm2
# CHECK-NEXT: - - - 1.00 - - 0.50 0.50 cvtsi2sdl (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - 0.50 0.50 cvtsi2sdq (%rax), %xmm2
-# CHECK-NEXT: - - 1.00 - - - - - cvtss2sd %xmm0, %xmm2
+# CHECK-NEXT: - - 1.00 - - 1.00 - - cvtss2sd %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 - - - 0.50 0.50 cvtss2sd (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - 1.00 - - cvttpd2dq %xmm0, %xmm2
# CHECK-NEXT: - - - 1.00 - 1.00 0.50 0.50 cvttpd2dq (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
index 1b196b4355a6d4..028625013a85cc 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
@@ -1115,9 +1115,9 @@ vzeroupper
# CHECK-NEXT: 1 3 1.00 vcomiss %xmm0, %xmm1
# CHECK-NEXT: 2 8 1.00 * vcomiss (%rax), %xmm1
# CHECK-NEXT: 2 4 1.00 vcvtdq2pd %xmm0, %xmm2
-# CHECK-NEXT: 3 9 1.00 * vcvtdq2pd (%rax), %xmm2
+# CHECK-NEXT: 2 9 1.00 * vcvtdq2pd (%rax), %xmm2
# CHECK-NEXT: 2 6 1.00 vcvtdq2pd %xmm0, %ymm2
-# CHECK-NEXT: 3 11 1.00 * vcvtdq2pd (%rax), %ymm2
+# CHECK-NEXT: 2 11 1.00 * vcvtdq2pd (%rax), %ymm2
# CHECK-NEXT: 1 3 1.00 vcvtdq2ps %xmm0, %xmm2
# CHECK-NEXT: 2 8 1.00 * vcvtdq2ps (%rax), %xmm2
# CHECK-NEXT: 1 3 1.00 vcvtdq2ps %ymm0, %ymm2
@@ -1137,7 +1137,7 @@ vzeroupper
# CHECK-NEXT: 2 2 1.00 vcvtps2pd %xmm0, %xmm2
# CHECK-NEXT: 2 6 1.00 * vcvtps2pd (%rax), %xmm2
# CHECK-NEXT: 2 4 1.00 vcvtps2pd %xmm0, %ymm2
-# CHECK-NEXT: 3 9 1.00 * vcvtps2pd (%rax), %ymm2
+# CHECK-NEXT: 2 9 1.00 * vcvtps2pd (%rax), %ymm2
# CHECK-NEXT: 2 4 1.00 vcvtsd2si %xmm0, %ecx
# CHECK-NEXT: 2 4 1.00 vcvtsd2si %xmm0, %rcx
# CHECK-NEXT: 3 9 1.00 * vcvtsd2si (%rax), %ecx
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 257.00 216.25 247.25 173.17 173.17 38.00 424.25 3.25 12.67
+# CHECK-NEXT: - 257.00 216.25 247.25 173.17 173.17 38.00 421.25 3.25 12.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -1825,9 +1825,9 @@ vzeroupper
# CHECK-NEXT: - - - 1.00 - - - - - - vcomiss %xmm0, %xmm1
# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - vcomiss (%rax), %xmm1
# CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvtdq2pd %xmm0, %xmm2
-# CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvtdq2pd (%rax), %xmm2
+# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - vcvtdq2pd (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvtdq2pd %xmm0, %ymm2
-# CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvtdq2pd (%rax), %ymm2
+# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - vcvtdq2pd (%rax), %ymm2
# CHECK-NEXT: - - - 1.00 - - - - - - vcvtdq2ps %xmm0, %xmm2
# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - vcvtdq2ps (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - - - - vcvtdq2ps %ymm0, %ymm2
@@ -1847,7 +1847,7 @@ vzeroupper
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - vcvtps2pd %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vcvtps2pd (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - vcvtps2pd %xmm0, %ymm2
-# CHECK-NEXT: - - 1.00 - 0.50 0.50 - 1.00 - - vcvtps2pd (%rax), %ymm2
+# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vcvtps2pd (%rax), %ymm2
# CHECK-NEXT: - - 1.00 1.00 - - - - - - vcvtsd2si %xmm0, %ecx
# CHECK-NEXT: - - 1.00 1.00 - - - - - - vcvtsd2si %xmm0, %rcx
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - - - - vcvtsd2si (%rax), %ecx
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse2.s
index e76d90521afa9c..8851be4679a1e9 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse2.s
@@ -423,7 +423,7 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 1 3 1.00 comisd %xmm0, %xmm1
# CHECK-NEXT: 2 8 1.00 * comisd (%rax), %xmm1
# CHECK-NEXT: 2 4 1.00 cvtdq2pd %xmm0, %xmm2
-# CHECK-NEXT: 3 9 1.00 * cvtdq2pd (%rax), %xmm2
+# CHECK-NEXT: 2 9 1.00 * cvtdq2pd (%rax), %xmm2
# CHECK-NEXT: 1 3 1.00 cvtdq2ps %xmm0, %xmm2
# CHECK-NEXT: 2 8 1.00 * cvtdq2ps (%rax), %xmm2
# CHECK-NEXT: 2 4 1.00 cvtpd2dq %xmm0, %xmm2
@@ -433,7 +433,7 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 2 4 1.00 cvtpd2ps %xmm0, %xmm2
# CHECK-NEXT: 3 9 1.00 * cvtpd2ps (%rax), %xmm2
# CHECK-NEXT: 2 4 1.00 cvtpi2pd %mm0, %xmm2
-# CHECK-NEXT: 3 9 1.00 * cvtpi2pd (%rax), %xmm2
+# CHECK-NEXT: 2 9 1.00 * cvtpi2pd (%rax), %xmm2
# CHECK-NEXT: 1 3 1.00 cvtps2dq %xmm0, %xmm2
# CHECK-NEXT: 2 8 1.00 * cvtps2dq (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 cvtps2pd %xmm0, %xmm2
@@ -689,7 +689,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 78.00 70.75 95.75 63.17 63.17 14.00 119.25 2.25 4.67
+# CHECK-NEXT: - 78.00 70.75 95.75 63.17 63.17 14.00 117.25 2.25 4.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -709,7 +709,7 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - - - - comisd %xmm0, %xmm1
# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - comisd (%rax), %xmm1
# CHECK-NEXT: - - - 1.00 - - - 1.00 - - cvtdq2pd %xmm0, %xmm2
-# CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - cvtdq2pd (%rax), %xmm2
+# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - cvtdq2pd (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - - - - cvtdq2ps %xmm0, %xmm2
# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - cvtdq2ps (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - 1.00 - - cvtpd2dq %xmm0, %xmm2
@@ -719,7 +719,7 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - 1.00 - - cvtpd2ps %xmm0, %xmm2
# CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - cvtpd2ps (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - 1.00 - - cvtpi2pd %mm0, %xmm2
-# CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - cvtpi2pd (%rax), %xmm2
+# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - cvtpi2pd (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - - - - cvtps2dq %xmm0, %xmm2
# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - cvtps2dq (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - cvtps2pd %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx1.s
index 49db25cb0bdfb1..7f07fd56fe60dc 100644
--- a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx1.s
@@ -1137,7 +1137,7 @@ vzeroupper
# CHECK-NEXT: 2 2 1.00 vcvtps2pd %xmm0, %xmm2
# CHECK-NEXT: 2 7 1.00 * vcvtps2pd (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 vcvtps2pd %xmm0, %ymm2
-# CHECK-NEXT: 2 7 1.00 * vcvtps2pd (%rax), %ymm2
+# CHECK-NEXT: 3 7 1.00 * vcvtps2pd (%rax), %ymm2
# CHECK-NEXT: 2 5 1.00 vcvtsd2si %xmm0, %ecx
# CHECK-NEXT: 2 5 1.00 vcvtsd2si %xmm0, %rcx
# CHECK-NEXT: 3 10 1.00 * vcvtsd2si (%rax), %ecx
@@ -1152,7 +1152,7 @@ vzeroupper
# CHECK-NEXT: 3 5 2.00 vcvtsi2ss %rcx, %xmm0, %xmm2
# CHECK-NEXT: 3 10 1.00 * vcvtsi2ssl (%rax), %xmm0, %xmm2
# CHECK-NEXT: 3 10 1.00 * vcvtsi2ssq (%rax), %xmm0, %xmm2
-# CHECK-NEXT: 1 1 1.00 vcvtss2sd %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 1 1.00 vcvtss2sd %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 2 7 1.00 * vcvtss2sd (%rax), %xmm1, %xmm2
# CHECK-NEXT: 2 5 1.00 vcvtss2si %xmm0, %ecx
# CHECK-NEXT: 2 5 1.00 vcvtss2si %xmm0, %rcx
@@ -1734,7 +1734,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] ...
[truncated]
|
The folded load variants almost never require Port5 for length changing conversions (just for SNB ymm cases), and don't have an extra uop for the load.
Confirmed with a mixture of Agner + uops.info comparisons.