Skip to content

Commit 135b877

Browse files
committed
[X86] Replace selectScalarSSELoad ComplexPattern with PatFrags to handle the 3 types of loads we currently match.
This ensures we create mem operands for these instructions fixing PR45949. Unfortunately, it increases the size of X86GenDAGISel.inc, but some dag combine canonicalization could reduce the types of load we need to match.
1 parent 0ec5f50 commit 135b877

File tree

5 files changed

+63
-138
lines changed

5 files changed

+63
-138
lines changed

llvm/lib/Target/X86/X86ISelDAGToDAG.cpp

Lines changed: 0 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -229,11 +229,6 @@ namespace {
229229
bool selectTLSADDRAddr(SDValue N, SDValue &Base,
230230
SDValue &Scale, SDValue &Index, SDValue &Disp,
231231
SDValue &Segment);
232-
bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
233-
SDValue &Base, SDValue &Scale,
234-
SDValue &Index, SDValue &Disp,
235-
SDValue &Segment,
236-
SDValue &NodeWithChain);
237232
bool selectRelocImm(SDValue N, SDValue &Op);
238233

239234
bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
@@ -2473,76 +2468,6 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
24732468
return true;
24742469
}
24752470

2476-
// We can only fold a load if all nodes between it and the root node have a
2477-
// single use. If there are additional uses, we could end up duplicating the
2478-
// load.
2479-
static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
2480-
while (User != Root) {
2481-
if (!User->hasOneUse())
2482-
return false;
2483-
User = *User->use_begin();
2484-
}
2485-
2486-
return true;
2487-
}
2488-
2489-
/// Match a scalar SSE load. In particular, we want to match a load whose top
2490-
/// elements are either undef or zeros. The load flavor is derived from the
2491-
/// type of N, which is either v4f32 or v2f64.
2492-
///
2493-
/// We also return:
2494-
/// PatternChainNode: this is the matched node that has a chain input and
2495-
/// output.
2496-
bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
2497-
SDValue N, SDValue &Base,
2498-
SDValue &Scale, SDValue &Index,
2499-
SDValue &Disp, SDValue &Segment,
2500-
SDValue &PatternNodeWithChain) {
2501-
if (!hasSingleUsesFromRoot(Root, Parent))
2502-
return false;
2503-
2504-
// We can allow a full vector load here since narrowing a load is ok unless
2505-
// it's volatile or atomic.
2506-
if (ISD::isNON_EXTLoad(N.getNode())) {
2507-
LoadSDNode *LD = cast<LoadSDNode>(N);
2508-
if (LD->isSimple() &&
2509-
IsProfitableToFold(N, LD, Root) &&
2510-
IsLegalToFold(N, Parent, Root, OptLevel)) {
2511-
PatternNodeWithChain = N;
2512-
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2513-
Segment);
2514-
}
2515-
}
2516-
2517-
// We can also match the special zero extended load opcode.
2518-
if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
2519-
PatternNodeWithChain = N;
2520-
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2521-
IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
2522-
auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
2523-
return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
2524-
Segment);
2525-
}
2526-
}
2527-
2528-
// Need to make sure that the SCALAR_TO_VECTOR and load are both only used
2529-
// once. Otherwise the load might get duplicated and the chain output of the
2530-
// duplicate load will not be observed by all dependencies.
2531-
if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
2532-
PatternNodeWithChain = N.getOperand(0);
2533-
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
2534-
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2535-
IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
2536-
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2537-
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2538-
Segment);
2539-
}
2540-
}
2541-
2542-
return false;
2543-
}
2544-
2545-
25462471
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
25472472
if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
25482473
uint64_t ImmVal = CN->getZExtValue();

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,11 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
7676
PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
7777
PatFrag BroadcastLdFrag = !cast<PatFrag>("X86VBroadcastld" # EltSizeName);
7878

79-
ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"),
80-
!cast<ComplexPattern>("sse_load_f32"),
81-
!if (!eq (EltTypeName, "f64"),
82-
!cast<ComplexPattern>("sse_load_f64"),
83-
?));
79+
PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f32"),
80+
!cast<PatFrags>("sse_load_f32"),
81+
!if (!eq (EltTypeName, "f64"),
82+
!cast<PatFrags>("sse_load_f64"),
83+
?));
8484

8585
// The string to specify embedded broadcast in assembly.
8686
string BroadcastStr = "{1to" # NumElts # "}";
@@ -2065,9 +2065,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
20652065
(ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc),
20662066
"vcmp"#_.Suffix,
20672067
"$cc, $src2, $src1", "$src1, $src2, $cc",
2068-
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
2068+
(OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
20692069
timm:$cc),
2070-
(OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
2070+
(OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
20712071
timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
20722072
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
20732073

@@ -2643,15 +2643,15 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
26432643
OpcodeStr#_.Suffix#
26442644
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
26452645
[(set _.KRC:$dst,
2646-
(X86Vfpclasss _.ScalarIntMemCPat:$src1,
2647-
(i32 timm:$src2)))]>,
2646+
(X86Vfpclasss (_.ScalarIntMemFrags addr:$src1),
2647+
(i32 timm:$src2)))]>,
26482648
Sched<[sched.Folded, sched.ReadAfterFold]>;
26492649
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
26502650
(ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
26512651
OpcodeStr#_.Suffix#
26522652
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
26532653
[(set _.KRC:$dst,(and _.KRCWM:$mask,
2654-
(X86Vfpclasss_su _.ScalarIntMemCPat:$src1,
2654+
(X86Vfpclasss_su (_.ScalarIntMemFrags addr:$src1),
26552655
(i32 timm:$src2))))]>,
26562656
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
26572657
}
@@ -5293,7 +5293,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
52935293
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
52945294
"$src2, $src1", "$src1, $src2",
52955295
(_.VT (VecNode _.RC:$src1,
5296-
_.ScalarIntMemCPat:$src2))>,
5296+
(_.ScalarIntMemFrags addr:$src2)))>,
52975297
Sched<[sched.Folded, sched.ReadAfterFold]>;
52985298
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
52995299
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -5339,7 +5339,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
53395339
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
53405340
"$src2, $src1", "$src1, $src2",
53415341
(_.VT (VecNode _.RC:$src1,
5342-
_.ScalarIntMemCPat:$src2))>,
5342+
(_.ScalarIntMemFrags addr:$src2)))>,
53435343
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
53445344

53455345
let isCodeGenOnly = 1, Predicates = [HasAVX512],
@@ -5628,7 +5628,7 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
56285628
defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
56295629
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr#_.Suffix,
56305630
"$src2, $src1", "$src1, $src2",
5631-
(OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2)>,
5631+
(OpNode _.RC:$src1, (_.ScalarIntMemFrags addr:$src2))>,
56325632
Sched<[sched.Folded, sched.ReadAfterFold]>;
56335633
}
56345634
}
@@ -7227,7 +7227,7 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
72277227
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
72287228
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
72297229
[(set DstVT.RC:$dst, (OpNode
7230-
(SrcVT.VT SrcVT.ScalarIntMemCPat:$src)))]>,
7230+
(SrcVT.ScalarIntMemFrags addr:$src)))]>,
72317231
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
72327232
} // Predicates = [HasAVX512]
72337233

@@ -7419,7 +7419,7 @@ let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in {
74197419
(ins _SrcRC.IntScalarMemOp:$src),
74207420
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
74217421
[(set _DstRC.RC:$dst,
7422-
(OpNodeInt (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src)))]>,
7422+
(OpNodeInt (_SrcRC.ScalarIntMemFrags addr:$src)))]>,
74237423
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
74247424
} //HasAVX512
74257425

@@ -7476,7 +7476,7 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
74767476
(ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
74777477
"$src2, $src1", "$src1, $src2",
74787478
(_.VT (OpNode (_.VT _.RC:$src1),
7479-
(_Src.VT _Src.ScalarIntMemCPat:$src2)))>,
7479+
(_Src.ScalarIntMemFrags addr:$src2)))>,
74807480
EVEX_4V, VEX_LIG,
74817481
Sched<[sched.Folded, sched.ReadAfterFold]>;
74827482

@@ -8710,7 +8710,7 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
87108710
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
87118711
"$src2, $src1", "$src1, $src2",
87128712
(OpNode (_.VT _.RC:$src1),
8713-
_.ScalarIntMemCPat:$src2)>, EVEX_4V, VEX_LIG,
8713+
(_.ScalarIntMemFrags addr:$src2))>, EVEX_4V, VEX_LIG,
87148714
Sched<[sched.Folded, sched.ReadAfterFold]>;
87158715
}
87168716
}
@@ -8798,7 +8798,7 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
87988798
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
87998799
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
88008800
"$src2, $src1", "$src1, $src2",
8801-
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>,
8801+
(OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2))>,
88028802
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
88038803
}
88048804
}
@@ -8977,7 +8977,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
89778977
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
89788978
"$src2, $src1", "$src1, $src2",
89798979
(X86fsqrts (_.VT _.RC:$src1),
8980-
_.ScalarIntMemCPat:$src2)>,
8980+
(_.ScalarIntMemFrags addr:$src2))>,
89818981
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
89828982
let Uses = [MXCSR] in
89838983
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -9050,7 +9050,7 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
90509050
OpcodeStr,
90519051
"$src3, $src2, $src1", "$src1, $src2, $src3",
90529052
(_.VT (X86RndScales _.RC:$src1,
9053-
_.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>,
9053+
(_.ScalarIntMemFrags addr:$src2), (i32 timm:$src3)))>,
90549054
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
90559055

90569056
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in {
@@ -10221,7 +10221,7 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
1022110221
(ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3),
1022210222
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
1022310223
(OpNode (_.VT _.RC:$src1),
10224-
(_.VT _.ScalarIntMemCPat:$src2),
10224+
(_.ScalarIntMemFrags addr:$src2),
1022510225
(i32 timm:$src3))>,
1022610226
Sched<[sched.Folded, sched.ReadAfterFold]>;
1022710227
}

llvm/lib/Target/X86/X86InstrFragmentsSIMD.td

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -789,23 +789,6 @@ def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store
789789
SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>
790790
]>;
791791

792-
//===----------------------------------------------------------------------===//
793-
// SSE Complex Patterns
794-
//===----------------------------------------------------------------------===//
795-
796-
// These are 'extloads' from a scalar to the low element of a vector, zeroing
797-
// the top elements. These are used for the SSE 'ss' and 'sd' instruction
798-
// forms.
799-
def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [],
800-
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
801-
SDNPWantRoot, SDNPWantParent]>;
802-
def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
803-
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
804-
SDNPWantRoot, SDNPWantParent]>;
805-
806-
def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
807-
def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
808-
809792
//===----------------------------------------------------------------------===//
810793
// SSE pattern fragments
811794
//===----------------------------------------------------------------------===//
@@ -976,6 +959,23 @@ def X86VBroadcastld64 : PatFrag<(ops node:$src),
976959
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
977960
}]>;
978961

962+
// Scalar SSE intrinsic fragments to match several different types of loads.
963+
// Used by scalar SSE intrinsic instructions which have 128 bit types, but
964+
// only load a single element.
965+
// FIXME: We should add more canonicalizing in DAGCombine. Particularly removing
966+
// the simple_load case.
967+
def sse_load_f32 : PatFrags<(ops node:$ptr),
968+
[(v4f32 (simple_load node:$ptr)),
969+
(v4f32 (X86vzload32 node:$ptr)),
970+
(v4f32 (scalar_to_vector (loadf32 node:$ptr)))]>;
971+
def sse_load_f64 : PatFrags<(ops node:$ptr),
972+
[(v2f64 (simple_load node:$ptr)),
973+
(v2f64 (X86vzload64 node:$ptr)),
974+
(v2f64 (scalar_to_vector (loadf64 node:$ptr)))]>;
975+
976+
def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
977+
def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
978+
979979

980980
def fp32imm0 : PatLeaf<(f32 fpimm), [{
981981
return N->isExactlyValue(+0.0);

0 commit comments

Comments
 (0)