Commit 88083a0

[X86] SimplifyDemandedVectorEltsForTargetNode - handle 512-bit X86ISD::VPERMI with lower half demanded elts (llvm#137139)
512-bit X86ISD::VPERMI nodes handle the lower/upper 256-bits separately - so if we don't demand the upper half elements, we can just use the 256-bit variant.
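In IR terms, the fold applies whenever a shuffle demands only elements drawn from the low 256 bits of a 512-bit permute. Below is a minimal sketch in the style of the updated tests (the function name @demand_low_half is illustrative; the before/after assembly is taken from the test_8xi64_to_2xi64_perm_mask0 diff further down):

; Both mask indices (3 and 0) select i64 elements from the low 256 bits of
; %vec, so the upper half of the 512-bit vpermq result is never demanded.
define <2 x i64> @demand_low_half(<8 x i64> %vec) {
  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %res
}
; Before: vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
; After:  vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,3]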
1 parent 82c25d2 · commit 88083a0

File tree

llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll

3 files changed: +29 −23 lines changed
llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 11 additions & 1 deletion
@@ -43786,7 +43786,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     return TLO.CombineTo(Op, Insert);
   }
   case X86ISD::VPERMI: {
-    // Simplify PERMPD/PERMQ to extract_subvector.
+    // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
     // TODO: This should be done in shuffle combining.
     if (VT == MVT::v4f64 || VT == MVT::v4i64) {
       SmallVector<int, 4> Mask;
@@ -43799,6 +43799,16 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
         return TLO.CombineTo(Op, Insert);
       }
     }
+    // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
+    if (VT == MVT::v8f64 || VT == MVT::v8i64) {
+      SDLoc DL(Op);
+      SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
+      SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
+                                      Op.getOperand(1));
+      SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+      SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
+      return TLO.CombineTo(Op, Insert);
+    }
     break;
   }
   case X86ISD::VPERMV: {

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 6 additions & 6 deletions
@@ -2283,8 +2283,8 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i
 define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -2293,9 +2293,9 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,2,3,7,4,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[3,0,2,3]
 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -2308,8 +2308,8 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,2,3,7,4,6,7]
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>

llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll

Lines changed: 12 additions & 16 deletions
@@ -84,7 +84,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-LABEL: load_i64_stride3_vf2:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,3,2,3]
 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
@@ -97,9 +97,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-LABEL: load_i64_stride3_vf2:
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
+; AVX512-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3]
 ; AVX512-FCP-NEXT: vmovaps 16(%rdi), %xmm2
 ; AVX512-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512-FCP-NEXT: vmovaps %xmm1, (%rsi)
@@ -110,7 +109,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-LABEL: load_i64_stride3_vf2:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
+; AVX512DQ-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,3,2,3]
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
@@ -123,9 +122,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-LABEL: load_i64_stride3_vf2:
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
+; AVX512DQ-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3]
 ; AVX512DQ-FCP-NEXT: vmovaps 16(%rdi), %xmm2
 ; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%rsi)
@@ -136,7 +134,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512BW-LABEL: load_i64_stride3_vf2:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
+; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,3,2,3]
 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
@@ -149,9 +147,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-LABEL: load_i64_stride3_vf2:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
+; AVX512BW-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3]
 ; AVX512BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2
 ; AVX512BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%rsi)
@@ -162,7 +159,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-BW-LABEL: load_i64_stride3_vf2:
 ; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
+; AVX512DQ-BW-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,3,2,3]
 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
@@ -175,9 +172,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride3_vf2:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2
 ; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%rsi)
