Skip to content

Commit d1592a9

Browse files
authored
[X86] X86FixupVectorConstantsPass - use scheduler model to avoid regressions (#140028)
When attempting to replace a full vector constant load with an instruction that uses a smaller constant, check the scheduler model to ensure the instruction isn't slower. Throughput must not regress, but allow a small increase in latency based on how much constant data we're saving (I've used a simple estimate of 1 cycle per 128-bits of data saved). NOTE: this currently ignores hoisted constant loads where the slower instruction might be acceptable. Fixes #135998
1 parent eea6969 commit d1592a9

29 files changed

+285
-162
lines changed

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
347347
bool HasBWI = ST->hasBWI();
348348
bool HasVLX = ST->hasVLX();
349349
bool MultiDomain = ST->hasAVX512() || ST->hasNoDomainDelayMov();
350+
bool OptSize = MF.getFunction().hasOptSize();
350351

351352
struct FixupEntry {
352353
int Op;
@@ -355,6 +356,36 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
355356
std::function<Constant *(const Constant *, unsigned, unsigned, unsigned)>
356357
RebuildConstant;
357358
};
359+
360+
auto NewOpcPreferable = [&](const FixupEntry &Fixup,
361+
unsigned RegBitWidth) -> bool {
362+
if (SM->hasInstrSchedModel()) {
363+
unsigned NewOpc = Fixup.Op;
364+
auto *OldDesc = SM->getSchedClassDesc(TII->get(Opc).getSchedClass());
365+
auto *NewDesc = SM->getSchedClassDesc(TII->get(NewOpc).getSchedClass());
366+
unsigned BitsSaved = RegBitWidth - (Fixup.NumCstElts * Fixup.MemBitWidth);
367+
368+
// Compare tput/lat - avoid any regressions, but allow extra cycle of
369+
// latency in exchange for each 128-bit (or less) constant pool reduction
370+
// (this is a very simple cost:benefit estimate - there will probably be
371+
// better ways to calculate this).
372+
double OldTput = MCSchedModel::getReciprocalThroughput(*ST, *OldDesc);
373+
double NewTput = MCSchedModel::getReciprocalThroughput(*ST, *NewDesc);
374+
if (OldTput != NewTput)
375+
return NewTput < OldTput;
376+
377+
int LatTol = (BitsSaved + 127) / 128;
378+
int OldLat = MCSchedModel::computeInstrLatency(*ST, *OldDesc);
379+
int NewLat = MCSchedModel::computeInstrLatency(*ST, *NewDesc);
380+
if (OldLat != NewLat)
381+
return NewLat < (OldLat + LatTol);
382+
}
383+
384+
// We either were unable to get tput/lat or all values were equal.
385+
// Prefer the new opcode for reduced constant pool size.
386+
return true;
387+
};
388+
358389
auto FixupConstant = [&](ArrayRef<FixupEntry> Fixups, unsigned RegBitWidth,
359390
unsigned OperandNo) {
360391
#ifdef EXPENSIVE_CHECKS
@@ -371,7 +402,11 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
371402
unsigned CstBitWidth = C->getType()->getPrimitiveSizeInBits();
372403
RegBitWidth = RegBitWidth ? RegBitWidth : CstBitWidth;
373404
for (const FixupEntry &Fixup : Fixups) {
374-
if (Fixup.Op) {
405+
// Always use the smallest possible constant load with opt/minsize,
406+
// otherwise use the smallest instruction that doesn't affect
407+
// performance.
408+
// TODO: If constant has been hoisted from loop, use smallest constant.
409+
if (Fixup.Op && (OptSize || NewOpcPreferable(Fixup, RegBitWidth))) {
375410
// Construct a suitable constant and adjust the MI to use the new
376411
// constant pool entry.
377412
if (Constant *NewCst = Fixup.RebuildConstant(

llvm/test/CodeGen/X86/avgceils.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
3939
;
4040
; AVX512-LABEL: test_fixed_v16i8:
4141
; AVX512: # %bb.0:
42-
; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
42+
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
4343
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
4444
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
4545
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -82,7 +82,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
8282
;
8383
; AVX512-LABEL: test_ext_v16i8:
8484
; AVX512: # %bb.0:
85-
; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
85+
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
8686
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
8787
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
8888
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -365,7 +365,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
365365
;
366366
; AVX512-LABEL: test_fixed_v32i8:
367367
; AVX512: # %bb.0:
368-
; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
368+
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
369369
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
370370
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
371371
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -416,7 +416,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
416416
;
417417
; AVX512-LABEL: test_ext_v32i8:
418418
; AVX512: # %bb.0:
419-
; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
419+
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
420420
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
421421
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
422422
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -875,7 +875,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
875875
;
876876
; AVX512-LABEL: test_fixed_v64i8:
877877
; AVX512: # %bb.0:
878-
; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
878+
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
879879
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
880880
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
881881
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0
@@ -946,7 +946,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
946946
;
947947
; AVX512-LABEL: test_ext_v64i8:
948948
; AVX512: # %bb.0:
949-
; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
949+
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
950950
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
951951
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
952952
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0

llvm/test/CodeGen/X86/avgfloors.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
5252
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
5353
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
5454
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
55-
; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
55+
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
5656
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
5757
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
5858
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -107,7 +107,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
107107
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
108108
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
109109
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
110-
; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
110+
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
111111
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
112112
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
113113
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -404,7 +404,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
404404
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
405405
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
406406
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
407-
; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
407+
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
408408
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
409409
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
410410
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -477,7 +477,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
477477
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
478478
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
479479
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
480-
; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
480+
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
481481
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
482482
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
483483
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -965,7 +965,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
965965
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
966966
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
967967
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
968-
; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
968+
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
969969
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
970970
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
971971
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0
@@ -1077,7 +1077,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
10771077
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
10781078
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
10791079
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
1080-
; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
1080+
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
10811081
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
10821082
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
10831083
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0

llvm/test/CodeGen/X86/avx512-build-vector.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ define <16 x float> @test3(<4 x float> %a) {
1515
; CHECK-LABEL: test3:
1616
; CHECK: ## %bb.0:
1717
; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
18-
; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
18+
; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
1919
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
2020
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
2121
; CHECK-NEXT: vmovaps %zmm1, %zmm0

llvm/test/CodeGen/X86/combine-or-shuffle.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -836,7 +836,7 @@ define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
836836
;
837837
; AVX512-LABEL: or_and_v4i32:
838838
; AVX512: # %bb.0:
839-
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,7]
839+
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,15,7]
840840
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 | mem)
841841
; AVX512-NEXT: retq
842842
%1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>

llvm/test/CodeGen/X86/combine-or.ll

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,11 @@ define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) {
2929
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295]
3030
; SSE-NEXT: retq
3131
;
32-
; AVX1-LABEL: or_zext_v2i32:
33-
; AVX1: # %bb.0:
34-
; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,0,4294967295,0]
35-
; AVX1-NEXT: retq
36-
;
37-
; AVX2-LABEL: or_zext_v2i32:
38-
; AVX2: # %bb.0:
39-
; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
40-
; AVX2-NEXT: # xmm0 = mem[0,0]
41-
; AVX2-NEXT: retq
32+
; AVX-LABEL: or_zext_v2i32:
33+
; AVX: # %bb.0:
34+
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
35+
; AVX-NEXT: # xmm0 = mem[0,0]
36+
; AVX-NEXT: retq
4237
%1 = zext <2 x i32> %a0 to <2 x i64>
4338
%2 = or <2 x i64> %1, <i64 4294967295, i64 4294967295>
4439
ret <2 x i64> %2
@@ -261,7 +256,7 @@ define i64 @PR89533(<64 x i8> %a0) {
261256
;
262257
; AVX2-LABEL: PR89533:
263258
; AVX2: # %bb.0:
264-
; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
259+
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
265260
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
266261
; AVX2-NEXT: vpmovmskb %ymm0, %eax
267262
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm0

llvm/test/CodeGen/X86/constant-pool-sharing.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
105105
;
106106
; AVX-LINUX-LABEL: store_repeated_constants:
107107
; AVX-LINUX: # %bb.0:
108-
; AVX-LINUX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
109-
; AVX-LINUX-NEXT: # ymm0 = mem[0,1,0,1]
108+
; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
110109
; AVX-LINUX-NEXT: vmovaps %ymm0, (%rdi)
111110
; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
112111
; AVX-LINUX-NEXT: vmovaps %xmm0, %xmm1
@@ -119,8 +118,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
119118
;
120119
; AVX-MSVC-LABEL: store_repeated_constants:
121120
; AVX-MSVC: # %bb.0:
122-
; AVX-MSVC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
123-
; AVX-MSVC-NEXT: # ymm0 = mem[0,1,0,1]
121+
; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
124122
; AVX-MSVC-NEXT: vmovaps %ymm0, (%rcx)
125123
; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
126124
; AVX-MSVC-NEXT: vmovaps %xmm0, %xmm1

llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
389389
;
390390
; CHECK-FMA-LABEL: fmul_pow2_8xhalf:
391391
; CHECK-FMA: # %bb.0:
392-
; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
392+
; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
393393
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
394394
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
395395
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -649,12 +649,26 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
649649
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
650650
; CHECK-SSE-NEXT: retq
651651
;
652-
; CHECK-AVX-LABEL: fdiv_pow2_8xhalf:
653-
; CHECK-AVX: # %bb.0:
654-
; CHECK-AVX-NEXT: vpsllw $10, %xmm0, %xmm0
655-
; CHECK-AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
656-
; CHECK-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
657-
; CHECK-AVX-NEXT: retq
652+
; CHECK-AVX2-LABEL: fdiv_pow2_8xhalf:
653+
; CHECK-AVX2: # %bb.0:
654+
; CHECK-AVX2-NEXT: vpsllw $10, %xmm0, %xmm0
655+
; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
656+
; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
657+
; CHECK-AVX2-NEXT: retq
658+
;
659+
; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_8xhalf:
660+
; CHECK-NO-FASTFMA: # %bb.0:
661+
; CHECK-NO-FASTFMA-NEXT: vpsllw $10, %xmm0, %xmm0
662+
; CHECK-NO-FASTFMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
663+
; CHECK-NO-FASTFMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
664+
; CHECK-NO-FASTFMA-NEXT: retq
665+
;
666+
; CHECK-FMA-LABEL: fdiv_pow2_8xhalf:
667+
; CHECK-FMA: # %bb.0:
668+
; CHECK-FMA-NEXT: vpsllw $10, %xmm0, %xmm0
669+
; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
670+
; CHECK-FMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
671+
; CHECK-FMA-NEXT: retq
658672
%p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
659673
%p2_f = uitofp <8 x i16> %p2 to <8 x half>
660674
%r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
@@ -1135,7 +1149,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
11351149
;
11361150
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
11371151
; CHECK-FMA: # %bb.0:
1138-
; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
1152+
; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
11391153
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
11401154
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
11411155
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0

llvm/test/CodeGen/X86/fpclamptosat_vec.ll

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) nounwind {
198198
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
199199
; AVX2-NEXT: vmovq %rax, %xmm0
200200
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
201-
; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
201+
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
202202
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
203203
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
204204
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -576,7 +576,8 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) nounwind {
576576
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
577577
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
578578
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
579-
; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
579+
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
580+
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
580581
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
581582
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
582583
; AVX2-NEXT: vzeroupper
@@ -1023,7 +1024,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) nounwind {
10231024
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
10241025
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
10251026
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
1026-
; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
1027+
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
1028+
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
10271029
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
10281030
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
10291031
; AVX2-NEXT: vzeroupper
@@ -2817,7 +2819,7 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) nounwind {
28172819
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
28182820
; AVX2-NEXT: vmovq %rax, %xmm0
28192821
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2820-
; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
2822+
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
28212823
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
28222824
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
28232825
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -3190,7 +3192,8 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) nounwind {
31903192
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
31913193
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
31923194
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
3193-
; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
3195+
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
3196+
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
31943197
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
31953198
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
31963199
; AVX2-NEXT: vzeroupper
@@ -3632,7 +3635,8 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) nounwind {
36323635
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
36333636
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
36343637
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
3635-
; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
3638+
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
3639+
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
36363640
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
36373641
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
36383642
; AVX2-NEXT: vzeroupper

0 commit comments

Comments
 (0)