Revert "[NVPTX] Lower 16xi8 and 8xi8 stores efficiently (#73646)" #74518

Merged: 1 commit, Dec 6, 2023.
53 changes: 3 additions & 50 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -508,7 +508,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
 
-    // Conversion to/from i8/i8x4 is always legal.
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
@@ -718,8 +717,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
 
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
-                       ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::STORE,
-                       ISD::UREM, ISD::VSELECT});
+                       ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
+                       ISD::VSELECT});
 
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -2917,6 +2916,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
       DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
                               MemSD->getMemoryVT(), MemSD->getMemOperand());
 
+  // return DCI.CombineTo(N, NewSt, true);
   return NewSt;
 }

@@ -5557,51 +5557,6 @@ static SDValue PerformLOADCombine(SDNode *N,
                                DL);
 }
 
-// Lower a v16i8 (or a v8i8) store into a StoreV4 (or StoreV2) operation with
-// i32 results instead of letting ReplaceLoadVector split it into smaller stores
-// during legalization. This is done at dag-combine1 time, so that vector
-// operations with i8 elements can be optimised away instead of being needlessly
-// split during legalization, which involves storing to the stack and loading it
-// back.
-static SDValue PerformSTORECombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI) {
-  SelectionDAG &DAG = DCI.DAG;
-  StoreSDNode *ST = cast<StoreSDNode>(N);
-  EVT VT = ST->getValue().getValueType();
-  if (VT != MVT::v16i8 && VT != MVT::v8i8)
-    return SDValue();
-
-  // Create a v4i32 vector store operation, effectively <4 x v4i8>.
-  unsigned Opc = VT == MVT::v16i8 ? NVPTXISD::StoreV4 : NVPTXISD::StoreV2;
-  EVT NewVT = VT == MVT::v16i8 ? MVT::v4i32 : MVT::v2i32;
-  unsigned NumElts = NewVT.getVectorNumElements();
-
-  // Create a vector of the type required by the new store: v16i8 -> v4i32.
-  SDValue NewStoreValue = DCI.DAG.getBitcast(NewVT, ST->getValue());
-
-  // Operands for the store.
-  SmallVector<SDValue, 8> Ops;
-  Ops.reserve(N->getNumOperands() + NumElts - 1);
-  // Chain value.
-  Ops.push_back(N->ops().front());
-
-  SDLoc DL(N);
-  SmallVector<SDValue> Elts(NumElts);
-  // Break v4i32 (or v2i32) into four (or two) elements.
-  for (unsigned I = 0; I < NumElts; ++I)
-    Elts[I] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
-                          NewStoreValue.getValueType().getVectorElementType(),
-                          NewStoreValue, DAG.getIntPtrConstant(I, DL));
-  Ops.append(Elts.begin(), Elts.end());
-  // Any remaining operands.
-  Ops.append(N->op_begin() + 2, N->op_end());
-
-  SDValue NewStore = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
-                                             Ops, NewVT, ST->getMemOperand());
-  // Return the new chain.
-  return NewStore.getValue(0);
-}
-
 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -5623,8 +5578,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     return PerformSETCCCombine(N, DCI, STI.getSmVersion());
   case ISD::LOAD:
     return PerformLOADCombine(N, DCI);
-  case ISD::STORE:
-    return PerformSTORECombine(N, DCI);
   case NVPTXISD::StoreRetval:
   case NVPTXISD::StoreRetvalV2:
   case NVPTXISD::StoreRetvalV4:
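For context, a minimal reproducer for the codegen this revert changes (a sketch mirroring the deleted v16i8_store test below; the function name and llc invocation are illustrative, not part of the PR): with #73646 applied, a <16 x i8> store lowered to a single st.v4.u32, and with this revert it falls back to the previous, split lowering.

; Sketch: compile with, e.g., llc -mtriple=nvptx64 -O3.
; With the reverted combine, this stored via one st.v4.u32;
; after the revert, the store is split during legalization.
define void @store_v16i8(ptr %p, <16 x i8> %v) {
  store <16 x i8> %v, ptr %p
  ret void
}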
7 changes: 4 additions & 3 deletions llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -790,9 +790,10 @@ define void @test_ldst_v8i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v8i8_param_1];
 ; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v8i8_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1+4];
-; CHECK-NEXT:    ld.u32 %r2, [%rd1];
-; CHECK-NEXT:    st.v2.u32 [%rd2], {%r2, %r1};
+; CHECK-NEXT:    ld.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.u32 %r2, [%rd1+4];
+; CHECK-NEXT:    st.u32 [%rd2+4], %r2;
+; CHECK-NEXT:    st.u32 [%rd2], %r1;
 ; CHECK-NEXT:    ret;
   %t1 = load <8 x i8>, ptr %a
   store <8 x i8> %t1, ptr %b, align 16
16 changes: 0 additions & 16 deletions llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -37,19 +37,3 @@ define void @v16i8(ptr %a, ptr %b) {
   store <16 x i8> %v, ptr %b
   ret void
 }
-
-; CHECK-LABEL: .visible .func v16i8_store
-define void @v16i8_store(ptr %a, <16 x i8> %v) {
-; CHECK:      ld.param.u64 %rd1, [v16i8_store_param_0];
-; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [v16i8_store_param_1];
-; CHECK-NEXT: st.v4.u32 [%rd1], {%r1, %r2, %r3, %r4};
-  store <16 x i8> %v, ptr %a
-  ret void
-}
-
-; CHECK-LABEL: .visible .func v8i8_store
-define void @v8i8_store(ptr %a, <8 x i8> %v) {
-; CHECK: st.v2.u32
-  store <8 x i8> %v, ptr %a
-  ret void
-}