Skip to content

Commit 09b7e0f

Browse files
committed
[AVX-512] Replace V_SET0 in AVX-512 patterns with AVX512_128_SET0. Enhance AVX512_128_SET0 expansion to make this possible.
We'll now expand AVX512_128_SET0 to an EVEX VXORD if VLX is available. If it's not, but register allocation has selected a non-extended register, we will use VEX VXORPS. And if it's an extended register without VLX, we'll use a 512-bit XOR. Do the same for AVX512_FsFLD0SS/SD. This makes it possible for the register allocator to have all 32 registers available to work with. llvm-svn: 292004
1 parent 0616b5f commit 09b7e0f

File tree

4 files changed

+47
-28
lines changed

4 files changed

+47
-28
lines changed

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,7 @@ def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
460460
}
461461

462462
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
463-
isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in {
463+
isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
464464
def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
465465
[(set VR128X:$dst, (v4i32 immAllZerosV))]>;
466466
def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
@@ -470,7 +470,7 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
470470
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
471471
// This is expanded by ExpandPostRAPseudos.
472472
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
473-
isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasVLX, HasDQI] in {
473+
isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in {
474474
def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
475475
[(set FR32X:$dst, fp32imm0)]>;
476476
def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
@@ -3439,31 +3439,31 @@ let Predicates = [HasAVX512] in {
34393439
// Move scalar to XMM zero-extended, zeroing a VR128X then do a
34403440
// MOVS{S,D} to the lower bits.
34413441
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
3442-
(VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>;
3442+
(VMOVSSZrr (v4f32 (AVX512_128_SET0)), FR32X:$src)>;
34433443
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
3444-
(VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
3444+
(VMOVSSZrr (v4f32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
34453445
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
3446-
(VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
3446+
(VMOVSSZrr (v4i32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
34473447
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
3448-
(VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
3448+
(VMOVSDZrr (v2f64 (AVX512_128_SET0)), FR64X:$src)>;
34493449
}
34503450

34513451
// Move low f32 and clear high bits.
34523452
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
34533453
(SUBREG_TO_REG (i32 0),
3454-
(VMOVSSZrr (v4f32 (V_SET0)),
3454+
(VMOVSSZrr (v4f32 (AVX512_128_SET0)),
34553455
(EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
34563456
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
34573457
(SUBREG_TO_REG (i32 0),
3458-
(VMOVSSZrr (v4i32 (V_SET0)),
3458+
(VMOVSSZrr (v4i32 (AVX512_128_SET0)),
34593459
(EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
34603460
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
34613461
(SUBREG_TO_REG (i32 0),
3462-
(VMOVSSZrr (v4f32 (V_SET0)),
3462+
(VMOVSSZrr (v4f32 (AVX512_128_SET0)),
34633463
(EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>;
34643464
def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
34653465
(SUBREG_TO_REG (i32 0),
3466-
(VMOVSSZrr (v4i32 (V_SET0)),
3466+
(VMOVSSZrr (v4i32 (AVX512_128_SET0)),
34673467
(EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>;
34683468

34693469
let AddedComplexity = 20 in {
@@ -3525,11 +3525,11 @@ let Predicates = [HasAVX512] in {
35253525
}
35263526
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
35273527
(v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
3528-
(SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)),
3528+
(SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
35293529
FR32X:$src)), sub_xmm)>;
35303530
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
35313531
(v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
3532-
(SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)),
3532+
(SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
35333533
FR64X:$src)), sub_xmm)>;
35343534
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
35353535
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
@@ -3538,18 +3538,18 @@ let Predicates = [HasAVX512] in {
35383538
// Move low f64 and clear high bits.
35393539
def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
35403540
(SUBREG_TO_REG (i32 0),
3541-
(VMOVSDZrr (v2f64 (V_SET0)),
3541+
(VMOVSDZrr (v2f64 (AVX512_128_SET0)),
35423542
(EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
35433543
def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
35443544
(SUBREG_TO_REG (i32 0),
3545-
(VMOVSDZrr (v2f64 (V_SET0)),
3545+
(VMOVSDZrr (v2f64 (AVX512_128_SET0)),
35463546
(EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
35473547

35483548
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
3549-
(SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
3549+
(SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
35503550
(EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
35513551
def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
3552-
(SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
3552+
(SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
35533553
(EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
35543554

35553555
// Extract and store.

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6831,14 +6831,33 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
68316831
assert(HasAVX && "AVX not supported");
68326832
return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
68336833
case X86::AVX512_128_SET0:
6834-
return Expand2AddrUndef(MIB, get(X86::VPXORDZ128rr));
6835-
case X86::AVX512_256_SET0:
6836-
return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
6834+
case X86::AVX512_FsFLD0SS:
6835+
case X86::AVX512_FsFLD0SD: {
6836+
bool HasVLX = Subtarget.hasVLX();
6837+
unsigned SrcReg = MIB->getOperand(0).getReg();
6838+
const TargetRegisterInfo *TRI = &getRegisterInfo();
6839+
if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
6840+
return Expand2AddrUndef(MIB,
6841+
get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6842+
// Extended register without VLX. Use a larger XOR.
6843+
SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
6844+
MIB->getOperand(0).setReg(SrcReg);
6845+
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6846+
}
6847+
case X86::AVX512_256_SET0: {
6848+
bool HasVLX = Subtarget.hasVLX();
6849+
unsigned SrcReg = MIB->getOperand(0).getReg();
6850+
const TargetRegisterInfo *TRI = &getRegisterInfo();
6851+
if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
6852+
return Expand2AddrUndef(MIB,
6853+
get(HasVLX ? X86::VPXORDZ256rr : X86::VXORPSYrr));
6854+
// Extended register without VLX. Use a larger XOR.
6855+
SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
6856+
MIB->getOperand(0).setReg(SrcReg);
6857+
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6858+
}
68376859
case X86::AVX512_512_SET0:
68386860
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6839-
case X86::AVX512_FsFLD0SS:
6840-
case X86::AVX512_FsFLD0SD:
6841-
return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr));
68426861
case X86::V_SETALLONES:
68436862
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
68446863
case X86::AVX2_SETALLONES:

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -446,9 +446,9 @@ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
446446
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
447447
isPseudo = 1, SchedRW = [WriteZero] in {
448448
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
449-
[(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>;
449+
[(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
450450
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
451-
[(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>;
451+
[(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
452452
}
453453

454454
//===----------------------------------------------------------------------===//
@@ -461,12 +461,12 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
461461
// We set canFoldAsLoad because this can be converted to a constant-pool
462462
// load of an all-zeros value if folding it would be beneficial.
463463
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
464-
isPseudo = 1, Predicates = [NoVLX], SchedRW = [WriteZero] in {
464+
isPseudo = 1, SchedRW = [WriteZero] in {
465465
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
466466
[(set VR128:$dst, (v4f32 immAllZerosV))]>;
467467
}
468468

469-
let Predicates = [NoVLX] in
469+
let Predicates = [NoAVX512] in
470470
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
471471

472472

@@ -475,7 +475,7 @@ def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
475475
// at the rename stage without using any execution unit, so SET0PSY
476476
// and SET0PDY can be used for vector int instructions without penalty
477477
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
478-
isPseudo = 1, Predicates = [HasAVX, NoVLX], SchedRW = [WriteZero] in {
478+
isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
479479
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
480480
[(set VR256:$dst, (v8i32 immAllZerosV))]>;
481481
}

llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1229,7 +1229,7 @@ define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
12291229
;
12301230
; AVX512VL-LABEL: insert_reg_and_zero_v4f64:
12311231
; AVX512VL: # BB#0:
1232-
; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1232+
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
12331233
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
12341234
; AVX512VL-NEXT: retq
12351235
%v = insertelement <4 x double> undef, double %a, i32 0

0 commit comments

Comments
 (0)