Skip to content

Commit 2a82da8

Browse files
committed
[X86] combineConcatVectorOps - handle splatting of a X86ISD::SUBV_BROADCAST_LOAD node
If we're splatting at the original subvector width, just reuse the original X86ISD::SUBV_BROADCAST_LOAD node — similar to what we already do with X86ISD::VBROADCAST/VBROADCAST_LOAD.
1 parent 85cf2e8 commit 2a82da8

7 files changed

+279
-292
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54334,10 +54334,16 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5433454334

5433554335
// concat_vectors(extract_subvector(broadcast(x)),
5433654336
// extract_subvector(broadcast(x))) -> broadcast(x)
54337+
// concat_vectors(extract_subvector(subv_broadcast(x)),
54338+
// extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
5433754339
if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5433854340
Op0.getOperand(0).getValueType() == VT) {
54339-
if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
54340-
Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
54341+
SDValue SrcVec = Op0.getOperand(0);
54342+
if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
54343+
SrcVec.getOpcode() == X86ISD::VBROADCAST_LOAD)
54344+
return Op0.getOperand(0);
54345+
if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
54346+
Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
5434154347
return Op0.getOperand(0);
5434254348
}
5434354349
}

llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1088,10 +1088,9 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
10881088
; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2
10891089
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
10901090
; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
1091-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4
1092-
; AVX512BW-NEXT: vpshufb %zmm4, %zmm2, %zmm2
1093-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
1094-
; AVX512BW-NEXT: vpshufb %zmm4, %zmm5, %zmm4
1091+
; AVX512BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2
1092+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4
1093+
; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm4
10951094
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
10961095
; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
10971096
; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rcx)

llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2127,11 +2127,11 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
21272127
; AVX512BW-FAST-LABEL: store_i8_stride5_vf32:
21282128
; AVX512BW-FAST: # %bb.0:
21292129
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm1
2130-
; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm3
2130+
; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm4
21312131
; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm0
21322132
; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm2
2133-
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm4
2134-
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11]
2133+
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm3
2134+
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
21352135
; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm6
21362136
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
21372137
; AVX512BW-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5
@@ -2144,19 +2144,19 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
21442144
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12]
21452145
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero
21462146
; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7
2147-
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
2148-
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2149-
; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
2150-
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,0,1,1,4,4,5,5]
2147+
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
2148+
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2149+
; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
2150+
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5]
21512151
; AVX512BW-FAST-NEXT: movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C
21522152
; AVX512BW-FAST-NEXT: kmovq %rax, %k1
2153-
; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1}
2153+
; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
21542154
; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
21552155
; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
21562156
; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm6, %zmm6
21572157
; AVX512BW-FAST-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
21582158
; AVX512BW-FAST-NEXT: kmovq %rax, %k1
2159-
; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
2159+
; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1}
21602160
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
21612161
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
21622162
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
@@ -2166,7 +2166,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
21662166
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
21672167
; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7
21682168
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6
2169-
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero
2169+
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero
21702170
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
21712171
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero
21722172
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
@@ -2175,22 +2175,21 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
21752175
; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8
21762176
; AVX512BW-FAST-NEXT: movl $138547332, %eax # imm = 0x8421084
21772177
; AVX512BW-FAST-NEXT: kmovd %eax, %k1
2178-
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 {%k1} = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
2178+
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 {%k1} = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
21792179
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
21802180
; AVX512BW-FAST-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
21812181
; AVX512BW-FAST-NEXT: kmovq %rax, %k1
21822182
; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1}
2183-
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm6
2184-
; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14>
2185-
; AVX512BW-FAST-NEXT: vpermd %zmm6, %zmm8, %zmm6
2183+
; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14>
2184+
; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm6, %zmm6
21862185
; AVX512BW-FAST-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
21872186
; AVX512BW-FAST-NEXT: kmovq %rax, %k1
21882187
; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1}
2189-
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30]
2190-
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
2188+
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30]
2189+
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
21912190
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero
21922191
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
2193-
; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1
2192+
; AVX512BW-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1
21942193
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero
21952194
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
21962195
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero
@@ -2206,7 +2205,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
22062205
; AVX512BW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
22072206
; AVX512BW-FAST-NEXT: vmovdqa %ymm0, 128(%r9)
22082207
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 64(%r9)
2209-
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, (%r9)
2208+
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, (%r9)
22102209
; AVX512BW-FAST-NEXT: vzeroupper
22112210
; AVX512BW-FAST-NEXT: retq
22122211
%in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64

0 commit comments

Comments
 (0)