@@ -508,6 +508,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
508
508
setOperationAction (ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
509
509
setOperationAction (ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
510
510
511
+ // Conversion to/from i8/i8x4 is always legal.
511
512
setOperationAction (ISD::BUILD_VECTOR, MVT::v4i8, Custom);
512
513
setOperationAction (ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
513
514
setOperationAction (ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
@@ -717,8 +718,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
717
718
718
719
// We have some custom DAG combine patterns for these nodes
719
720
setTargetDAGCombine ({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
720
- ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM ,
721
- ISD::VSELECT});
721
+ ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::STORE ,
722
+ ISD::UREM, ISD:: VSELECT});
722
723
723
724
// setcc for f16x2 and bf16x2 needs special handling to prevent
724
725
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -2916,7 +2917,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2916
2917
DAG.getMemIntrinsicNode (Opcode, DL, DAG.getVTList (MVT::Other), Ops,
2917
2918
MemSD->getMemoryVT (), MemSD->getMemOperand ());
2918
2919
2919
- // return DCI.CombineTo(N, NewSt, true);
2920
2920
return NewSt;
2921
2921
}
2922
2922
@@ -5557,6 +5557,51 @@ static SDValue PerformLOADCombine(SDNode *N,
5557
5557
DL);
5558
5558
}
5559
5559
5560
+ // Lower a v16i8 (or a v8i8) store into a StoreV4 (or StoreV2) operation with
5561
+ // i32 results instead of letting ReplaceLoadVector split it into smaller stores
5562
+ // during legalization. This is done at dag-combine1 time, so that vector
5563
+ // operations with i8 elements can be optimised away instead of being needlessly
5564
+ // split during legalization, which involves storing to the stack and loading it
5565
+ // back.
5566
+ static SDValue PerformSTORECombine (SDNode *N,
5567
+ TargetLowering::DAGCombinerInfo &DCI) {
5568
+ SelectionDAG &DAG = DCI.DAG ;
5569
+ StoreSDNode *ST = cast<StoreSDNode>(N);
5570
+ EVT VT = ST->getValue ().getValueType ();
5571
+ if (VT != MVT::v16i8 && VT != MVT::v8i8)
5572
+ return SDValue ();
5573
+
5574
+ // Create a v4i32 vector store operation, effectively <4 x v4i8>.
5575
+ unsigned Opc = VT == MVT::v16i8 ? NVPTXISD::StoreV4 : NVPTXISD::StoreV2;
5576
+ EVT NewVT = VT == MVT::v16i8 ? MVT::v4i32 : MVT::v2i32;
5577
+ unsigned NumElts = NewVT.getVectorNumElements ();
5578
+
5579
+ // Create a vector of the type required by the new store: v16i8 -> v4i32.
5580
+ SDValue NewStoreValue = DCI.DAG .getBitcast (NewVT, ST->getValue ());
5581
+
5582
+ // Operands for the store.
5583
+ SmallVector<SDValue, 8 > Ops;
5584
+ Ops.reserve (N->getNumOperands () + NumElts - 1 );
5585
+ // Chain value.
5586
+ Ops.push_back (N->ops ().front ());
5587
+
5588
+ SDLoc DL (N);
5589
+ SmallVector<SDValue> Elts (NumElts);
5590
+ // Break v4i32 (or v2i32) into four (or two) elements.
5591
+ for (unsigned I = 0 ; I < NumElts; ++I)
5592
+ Elts[I] = DAG.getNode (ISD::EXTRACT_VECTOR_ELT, DL,
5593
+ NewStoreValue.getValueType ().getVectorElementType (),
5594
+ NewStoreValue, DAG.getIntPtrConstant (I, DL));
5595
+ Ops.append (Elts.begin (), Elts.end ());
5596
+ // Any remaining operands.
5597
+ Ops.append (N->op_begin () + 2 , N->op_end ());
5598
+
5599
+ SDValue NewStore = DAG.getMemIntrinsicNode (Opc, DL, DAG.getVTList (MVT::Other),
5600
+ Ops, NewVT, ST->getMemOperand ());
5601
+ // Return the new chain.
5602
+ return NewStore.getValue (0 );
5603
+ }
5604
+
5560
5605
SDValue NVPTXTargetLowering::PerformDAGCombine (SDNode *N,
5561
5606
DAGCombinerInfo &DCI) const {
5562
5607
CodeGenOptLevel OptLevel = getTargetMachine ().getOptLevel ();
@@ -5578,6 +5623,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
5578
5623
return PerformSETCCCombine (N, DCI, STI.getSmVersion ());
5579
5624
case ISD::LOAD:
5580
5625
return PerformLOADCombine (N, DCI);
5626
+ case ISD::STORE:
5627
+ return PerformSTORECombine (N, DCI);
5581
5628
case NVPTXISD::StoreRetval:
5582
5629
case NVPTXISD::StoreRetvalV2:
5583
5630
case NVPTXISD::StoreRetvalV4:
0 commit comments