Skip to content

Commit 3dcc0db

Browse files
committed
[X86] Teach combineToExtendBoolVectorInReg to create opportunities for using broadcast load instructions.
If we're inserting a scalar that is smaller than the element size of the final VT, the value of the extra bits doesn't matter. Previously we any-extended the scalar in the scalar domain before inserting it. This patch instead broadcasts the scalar at its original type and then bitcasts the result to the final type, which might enable the use of a broadcast load. This recovers the regressions from 07d68c2 and 9fcd212 without relying on the alignment of the load. Differential Revision: https://reviews.llvm.org/D75835
1 parent 8fc9eea commit 3dcc0db

File tree

5 files changed

+29
-19
lines changed

5 files changed

+29
-19
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44278,15 +44278,30 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
4427844278

4427944279
for (unsigned i = 0; i != Scale; ++i)
4428044280
ShuffleMask.append(EltSizeInBits, i);
44281+
Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44282+
} else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
44283+
(SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
44284+
// If we have register broadcast instructions, use the scalar size as the
44285+
// element type for the shuffle. Then cast to the wider element type. The
44286+
// widened bits won't be used, and this might allow the use of a broadcast
44287+
// load.
44288+
assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
44289+
unsigned Scale = EltSizeInBits / NumElts;
44290+
EVT BroadcastVT =
44291+
EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
44292+
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44293+
ShuffleMask.append(NumElts * Scale, 0);
44294+
Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
44295+
Vec = DAG.getBitcast(VT, Vec);
4428144296
} else {
4428244297
// For smaller scalar integers, we can simply any-extend it to the vector
4428344298
// element size (we don't care about the upper bits) and broadcast it to all
4428444299
// elements.
4428544300
SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
4428644301
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
4428744302
ShuffleMask.append(NumElts, 0);
44303+
Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
4428844304
}
44289-
Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
4429044305

4429144306
// Now, mask the relevant bit in each element.
4429244307
SmallVector<SDValue, 32> Bits;

llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -114,7 +114,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
114114
; AVX2-LABEL: ext_i8_8i16:
115115
; AVX2: # %bb.0:
116116
; AVX2-NEXT: vmovd %edi, %xmm0
117-
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
117+
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
118118
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
119119
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
120120
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -273,7 +273,7 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
273273
; AVX2-LABEL: ext_i8_8i32:
274274
; AVX2: # %bb.0:
275275
; AVX2-NEXT: vmovd %edi, %xmm0
276-
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
276+
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
277277
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
278278
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
279279
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
@@ -456,9 +456,8 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
456456
;
457457
; AVX2-LABEL: ext_i8_8i64:
458458
; AVX2: # %bb.0:
459-
; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
460-
; AVX2-NEXT: vmovq %rdi, %xmm0
461-
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1
459+
; AVX2-NEXT: vmovd %edi, %xmm0
460+
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1
462461
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8]
463462
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
464463
; AVX2-NEXT: vpcmpeqq %ymm0, %ymm2, %ymm0
@@ -525,7 +524,7 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
525524
; AVX2-LABEL: ext_i16_16i32:
526525
; AVX2: # %bb.0:
527526
; AVX2-NEXT: vmovd %edi, %xmm0
528-
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm1
527+
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm1
529528
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
530529
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
531530
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0

llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -141,7 +141,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
141141
; AVX2-LABEL: ext_i8_8i16:
142142
; AVX2: # %bb.0:
143143
; AVX2-NEXT: vmovd %edi, %xmm0
144-
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
144+
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
145145
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
146146
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
147147
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -344,7 +344,7 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
344344
; AVX2-LABEL: ext_i8_8i32:
345345
; AVX2: # %bb.0:
346346
; AVX2-NEXT: vmovd %edi, %xmm0
347-
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
347+
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
348348
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
349349
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
350350
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
@@ -583,9 +583,8 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
583583
;
584584
; AVX2-LABEL: ext_i8_8i64:
585585
; AVX2: # %bb.0:
586-
; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
587-
; AVX2-NEXT: vmovq %rdi, %xmm0
588-
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1
586+
; AVX2-NEXT: vmovd %edi, %xmm0
587+
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1
589588
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8]
590589
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
591590
; AVX2-NEXT: vpcmpeqq %ymm0, %ymm2, %ymm0
@@ -670,7 +669,7 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
670669
; AVX2-LABEL: ext_i16_16i32:
671670
; AVX2: # %bb.0:
672671
; AVX2-NEXT: vmovd %edi, %xmm0
673-
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm1
672+
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm1
674673
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
675674
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
676675
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0

llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -115,7 +115,7 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) {
115115
; AVX2-LABEL: bitcast_i8_8i1:
116116
; AVX2: # %bb.0:
117117
; AVX2-NEXT: vmovd %edi, %xmm0
118-
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
118+
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
119119
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
120120
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
121121
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0

llvm/test/CodeGen/X86/vector-sext.ll

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2039,9 +2039,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
20392039
;
20402040
; AVX2-LABEL: load_sext_8i1_to_8i16:
20412041
; AVX2: # %bb.0: # %entry
2042-
; AVX2-NEXT: movzwl (%rdi), %eax
2043-
; AVX2-NEXT: vmovd %eax, %xmm0
2044-
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
2042+
; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
20452043
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
20462044
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
20472045
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -2261,8 +2259,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
22612259
;
22622260
; AVX2-LABEL: load_sext_8i1_to_8i32:
22632261
; AVX2: # %bb.0: # %entry
2264-
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2265-
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
2262+
; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0
22662263
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
22672264
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
22682265
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0

0 commit comments

Comments
 (0)