
Commit 4649a05

[X86] Add DAG combine to turn (vzmovl (insert_subvector undef, X, 0)) into (insert_subvector allzeros, (vzmovl X), 0)
256/512-bit scalar_to_vectors are canonicalized to (insert_subvector undef, (scalar_to_vector), 0). We have isel patterns that try to match this pattern when it is used by a vzmovl, so that a 128-bit instruction and a subreg_to_reg can be used.

This patch detects the insert_subvector undef portion of this and pulls it through the vzmovl, creating a narrower vzmovl and an insert_subvector allzeros. We can then match the insert_subvector to a subreg_to_reg operation by itself, and fall back on the existing (vzmovl (scalar_to_vector)) patterns.

Note that while the scalar_to_vector case is the motivating one, the combine is not restricted to just that case.

I'm also wondering about shrinking any 256/512-bit vzmovl to an extract_subvector + vzmovl + insert_subvector (allzeros), but I fear that would have bad implications for shuffle combining.

I also think there is more canonicalization we can do with vzmovl of loads, or scalar_to_vector of loads, to create a vzload.

Differential Revision: https://reviews.llvm.org/D63512

llvm-svn: 364095
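As a concrete illustration, here is a minimal IR sketch (hypothetical, not one of this commit's tests) that produces the motivating (vzmovl (insert_subvector undef, (scalar_to_vector), 0)) DAG shape when building a zero-extended 256-bit vector on AVX:

    ; The insertelement into lane 0 becomes scalar_to_vector; for v4i64 that is
    ; canonicalized to (insert_subvector undef, (v2i64 scalar_to_vector), 0).
    ; The zeroing shufflevector then becomes X86ISD::VZEXT_MOVL on top of it.
    define <4 x i64> @zext_scalar_to_v4i64(i64 %x) {
      %v = insertelement <4 x i64> undef, i64 %x, i32 0
      %z = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer,
                         <4 x i32> <i32 0, i32 5, i32 6, i32 7>
      ret <4 x i64> %z
    }

With the combine, the vzmovl happens at 128 bits and its result is inserted into a zero vector, which isel can handle with the existing 128-bit patterns plus a lone subreg_to_reg.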
Parent: 410b650

6 files changed (+25 −70 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 16 additions & 0 deletions
@@ -33658,6 +33658,22 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Pull subvector inserts into undef through VZEXT_MOVL by making it an
+  // insert into a zero vector. This helps get VZEXT_MOVL closer to
+  // scalar_to_vectors where 256/512 are canonicalized to an insert and a
+  // 128-bit scalar_to_vector. This reduces the number of isel patterns.
+  if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
+      N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
+      N->getOperand(0).hasOneUse() &&
+      N->getOperand(0).getOperand(0).isUndef() &&
+      isNullConstant(N->getOperand(0).getOperand(2))) {
+    SDValue In = N->getOperand(0).getOperand(1);
+    SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
+                       getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
+                       Movl, N->getOperand(0).getOperand(2));
+  }
+
   // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
   // operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
   // FIXME: This can probably go away once we default to widening legalization.
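In the dag notation used by the isel patterns below, the combine performs (a sketch, using v8i32/v4i32 as one representative type pairing):

    (v8i32 (X86vzmovl (insert_subvector undef, (v4i32 X), (iPTR 0))))
      -->
    (v8i32 (insert_subvector immAllZerosV, (v4i32 (X86vzmovl (v4i32 X))), (iPTR 0)))

The narrowed X86vzmovl is covered by the existing 128-bit patterns, and the insert into an all-zeros vector can be matched to a subreg_to_reg on its own, which is what allows the patterns removed below to go.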

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 0 additions & 38 deletions
@@ -4329,39 +4329,17 @@ let Predicates = [HasAVX512] in {
 
   // Represent the same patterns above but in the form they appear for
   // 256-bit types
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
   def : Pat<(v8f32 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
 
   // Represent the same patterns above but in the form they appear for
   // 512-bit types
-  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
-                    (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
-                    (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
   def : Pat<(v16f32 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
   def : Pat<(v8f64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
-
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
 }
 
 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
@@ -4380,14 +4358,6 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
             (VMOV64toPQIZrr GR64:$src)>;
 
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
-  def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
   // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
             (VMOVDI2PDIZrm addr:$src)>;
@@ -4408,14 +4378,6 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v4i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
 
-  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
-  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
-                    (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
-
   // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
   def : Pat<(v16i32 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 0 additions & 19 deletions
@@ -283,14 +283,8 @@ let Predicates = [UseAVX] in {
 
   // Represent the same patterns above but in the form they appear for
   // 256-bit types
-  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
   def : Pat<(v8f32 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
 }
@@ -4145,9 +4139,6 @@ let Predicates = [UseAVX] in {
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
             (VMOV64toPQIrr GR64:$src)>;
 
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
   // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
   // These instructions also write zeros in the high part of a 256-bit register.
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
@@ -4158,15 +4149,8 @@ let Predicates = [UseAVX] in {
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
   def : Pat<(v8i32 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
-  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
-  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
-                   (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
 }
 
 let Predicates = [UseSSE2] in {
@@ -4253,9 +4237,6 @@ let Predicates = [UseAVX] in {
             (VMOVQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)),
             (VMOVQI2PQIrm addr:$src)>;
-  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
-                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
   def : Pat<(v4i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
llvm/test/CodeGen/X86/avx-load-store.ll

Lines changed: 1 addition & 0 deletions
@@ -240,6 +240,7 @@ define void @f_f() nounwind {
 ; CHECK_O0-NEXT:  .LBB9_3: # %cif_mixed_test_all
 ; CHECK_O0-NEXT:    movl $-1, %eax
 ; CHECK_O0-NEXT:    vmovd %eax, %xmm0
+; CHECK_O0-NEXT:    vmovdqa %xmm0, %xmm0
 ; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm1
 ; CHECK_O0-NEXT:    # implicit-def: $rcx
 ; CHECK_O0-NEXT:    # implicit-def: $ymm2

llvm/test/CodeGen/X86/vec_extract-avx.ll

Lines changed: 8 additions & 12 deletions
@@ -144,17 +144,15 @@ define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovdqu (%ecx), %xmm0
-; X32-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X32-NEXT:    vmovdqa %ymm0, (%eax)
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vmovaps %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: legal_vzmovl_2i64_4i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-NEXT:    vmovdqa %ymm0, (%rsi)
+; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %ld = load <2 x i64>, <2 x i64>* %in, align 8
@@ -196,17 +194,15 @@ define void @legal_vzmovl_2f64_4f64(<2 x double>* %in, <4 x double>* %out) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovdqu (%ecx), %xmm0
-; X32-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X32-NEXT:    vmovdqa %ymm0, (%eax)
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vmovaps %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: legal_vzmovl_2f64_4f64:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-NEXT:    vmovdqa %ymm0, (%rsi)
+; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %ld = load <2 x double>, <2 x double>* %in, align 8

llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll

Lines changed: 0 additions & 1 deletion
@@ -1514,7 +1514,6 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    retq
   %v = insertelement <4 x double> undef, double %a, i32 0
