[RISCV] Lower i64 load/stores to ld/sd with Zilsd. #139808
Conversation
Don't split i64 load/store when we have Zilsd. In the future, we should enhance the LoadStoreOptimizer pass to do this, but this is a good starting point. Even if we support it in LoadStoreOptimizer, we might still want this for volatile loads/stores to guarantee the use of Zilsd.
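As a rough illustration of the intent (this example is not part of the patch; the function name is made up): with Zilsd on riscv32, a naturally aligned or volatile i64 access should be kept as a single 64-bit operation instead of being split into two 32-bit ones, e.g.:
; Hypothetical input, compiled with something like:
;   llc -mtriple=riscv32 -mattr=+zilsd
; The volatile, naturally aligned i64 load is expected to stay whole and
; select to a single Zilsd ld into an even/odd register pair.
define i64 @volatile_load(ptr %p) nounwind {
  %v = load volatile i64, ptr %p, align 8
  ret i64 %v
}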
@llvm/pr-subscribers-backend-risc-v
Author: Craig Topper (topperc)
Changes: (same as the description above)
Full diff: https://github.com/llvm/llvm-project/pull/139808.diff
4 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 9db15ff25f979..33ec72437a50c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1626,6 +1626,51 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case RISCVISD::LD_RV32: {
+ assert(Subtarget->hasStdExtZilsd() && "LD_RV32 is only used with Zilsd");
+
+ SDValue Base, Offset;
+ SDValue Chain = Node->getOperand(0);
+ SDValue Addr = Node->getOperand(1);
+ SelectAddrRegImm(Addr, Base, Offset);
+
+ SDValue Ops[] = {Base, Offset, Chain};
+ MachineSDNode *New = CurDAG->getMachineNode(
+ RISCV::LD_RV32, DL, {MVT::Untyped, MVT::Other}, Ops);
+ SDValue Lo = CurDAG->getTargetExtractSubreg(RISCV::sub_gpr_even, DL,
+ MVT::i32, SDValue(New, 0));
+ SDValue Hi = CurDAG->getTargetExtractSubreg(RISCV::sub_gpr_odd, DL,
+ MVT::i32, SDValue(New, 0));
+ CurDAG->setNodeMemRefs(New, {cast<MemSDNode>(Node)->getMemOperand()});
+ ReplaceUses(SDValue(Node, 0), Lo);
+ ReplaceUses(SDValue(Node, 1), Hi);
+ ReplaceUses(SDValue(Node, 2), SDValue(New, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case RISCVISD::SD_RV32: {
+ SDValue Base, Offset;
+ SDValue Chain = Node->getOperand(0);
+ SDValue Addr = Node->getOperand(3);
+ SelectAddrRegImm(Addr, Base, Offset);
+
+ SDValue Ops[] = {
+ CurDAG->getTargetConstant(RISCV::GPRPairRegClassID, DL, MVT::i32),
+ Node->getOperand(1),
+ CurDAG->getTargetConstant(RISCV::sub_gpr_even, DL, MVT::i32),
+ Node->getOperand(2),
+ CurDAG->getTargetConstant(RISCV::sub_gpr_odd, DL, MVT::i32)};
+
+ SDNode *RegPair = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
+ MVT::Untyped, Ops);
+ MachineSDNode *New =
+ CurDAG->getMachineNode(RISCV::SD_RV32, DL, MVT::Other,
+ {SDValue(RegPair, 0), Base, Offset, Chain});
+ CurDAG->setNodeMemRefs(New, {cast<MemSDNode>(Node)->getMemOperand()});
+ ReplaceUses(SDValue(Node, 0), SDValue(New, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntNo = Node->getConstantOperandVal(0);
switch (IntNo) {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c01496c9a7f3a..bbb3f30f15fb5 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -318,6 +318,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
!(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()))
setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand);
+ if (Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit()) {
+ setOperationAction(ISD::LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::STORE, MVT::i64, Custom);
+ }
+
if (Subtarget.is64Bit()) {
setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);
@@ -7748,13 +7753,33 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::STORE: {
auto *Store = cast<StoreSDNode>(Op);
SDValue StoredVal = Store->getValue();
- EVT VecTy = StoredVal.getValueType();
+ EVT VT = StoredVal.getValueType();
+ if (VT == MVT::i64) {
+ assert(Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ if (Store->isTruncatingStore())
+ return SDValue();
+
+ if (!Subtarget.enableUnalignedScalarMem() && Store->getAlign() < 8)
+ return SDValue();
+
+ SDLoc DL(Op);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, StoredVal,
+ DAG.getTargetConstant(0, DL, MVT::i32));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, StoredVal,
+ DAG.getTargetConstant(1, DL, MVT::i32));
+
+ return DAG.getMemIntrinsicNode(
+ RISCVISD::SD_RV32, DL, DAG.getVTList(MVT::Other),
+ {Store->getChain(), Lo, Hi, Store->getBasePtr()}, MVT::i64,
+ Store->getMemOperand());
+ }
// Handle normal vector tuple store.
- if (VecTy.isRISCVVectorTuple()) {
+ if (VT.isRISCVVectorTuple()) {
SDLoc DL(Op);
MVT XLenVT = Subtarget.getXLenVT();
- unsigned NF = VecTy.getRISCVVectorTupleNumFields();
- unsigned Sz = VecTy.getSizeInBits().getKnownMinValue();
+ unsigned NF = VT.getRISCVVectorTupleNumFields();
+ unsigned Sz = VT.getSizeInBits().getKnownMinValue();
unsigned NumElts = Sz / (NF * 8);
int Log2LMUL = Log2_64(NumElts) - 3;
@@ -13714,6 +13739,28 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
// sext_inreg we emit for ADD/SUB/MUL/SLLI.
LoadSDNode *Ld = cast<LoadSDNode>(N);
+ if (N->getValueType(0) == MVT::i64) {
+ assert(Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+
+ if (!Subtarget.enableUnalignedScalarMem() && Ld->getAlign() < 8)
+ return;
+
+ SDLoc dl(N);
+ SDValue Result = DAG.getMemIntrinsicNode(
+ RISCVISD::LD_RV32, dl,
+ DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
+ {Ld->getChain(), Ld->getBasePtr()}, MVT::i64, Ld->getMemOperand());
+ SDValue Lo = Result.getValue(0);
+ SDValue Hi = Result.getValue(1);
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+ Results.append({Pair, Result.getValue(2)});
+ return;
+ }
+
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+
SDLoc dl(N);
SDValue Res = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Ld->getChain(),
Ld->getBasePtr(), Ld->getMemoryVT(),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td
index 3e526273c0768..a3203f288b545 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td
@@ -11,6 +11,20 @@
//
//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// RISC-V specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_RISCV_LD_RV32
+ : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisPtrTy<2>]>;
+def SDT_RISCV_SD_RV32
+ : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisPtrTy<2>]>;
+
+def riscv_ld_rv32 : RVSDNode<"LD_RV32", SDT_RISCV_LD_RV32,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def riscv_st_rv32 : RVSDNode<"SD_RV32", SDT_RISCV_SD_RV32,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
//===----------------------------------------------------------------------===//
// Instruction Class Templates
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/RISCV/zilsd.ll b/llvm/test/CodeGen/RISCV/zilsd.ll
new file mode 100644
index 0000000000000..236ba8adc0487
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/zilsd.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+zilsd -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=CHECK,SLOW %s
+; RUN: llc -mtriple=riscv32 -mattr=+zilsd,+unaligned-scalar-mem -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=CHECK,FAST %s
+
+define i64 @load(ptr %a) nounwind {
+; CHECK-LABEL: load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a2, 80(a0)
+; CHECK-NEXT: ld a0, 0(a0)
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: mv a1, a3
+; CHECK-NEXT: ret
+ %1 = getelementptr i64, ptr %a, i32 10
+ %2 = load i64, ptr %1
+ %3 = load volatile i64, ptr %a
+ ret i64 %2
+}
+
+define void @store(ptr %a, i64 %b) nounwind {
+; CHECK-LABEL: store:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: sd a2, 0(a0)
+; CHECK-NEXT: sd a2, 88(a0)
+; CHECK-NEXT: ret
+ store i64 %b, ptr %a
+ %1 = getelementptr i64, ptr %a, i32 11
+ store i64 %b, ptr %1
+ ret void
+}
+
+define i64 @load_unaligned(ptr %p) {
+; SLOW-LABEL: load_unaligned:
+; SLOW: # %bb.0:
+; SLOW-NEXT: lbu a1, 1(a0)
+; SLOW-NEXT: lbu a2, 2(a0)
+; SLOW-NEXT: lbu a3, 3(a0)
+; SLOW-NEXT: lbu a4, 0(a0)
+; SLOW-NEXT: slli a1, a1, 8
+; SLOW-NEXT: slli a2, a2, 16
+; SLOW-NEXT: slli a3, a3, 24
+; SLOW-NEXT: or a1, a1, a4
+; SLOW-NEXT: lbu a4, 4(a0)
+; SLOW-NEXT: lbu a5, 5(a0)
+; SLOW-NEXT: or a2, a3, a2
+; SLOW-NEXT: lbu a3, 6(a0)
+; SLOW-NEXT: lbu a0, 7(a0)
+; SLOW-NEXT: slli a5, a5, 8
+; SLOW-NEXT: or a4, a5, a4
+; SLOW-NEXT: slli a3, a3, 16
+; SLOW-NEXT: slli a0, a0, 24
+; SLOW-NEXT: or a3, a0, a3
+; SLOW-NEXT: or a0, a2, a1
+; SLOW-NEXT: or a1, a3, a4
+; SLOW-NEXT: ret
+;
+; FAST-LABEL: load_unaligned:
+; FAST: # %bb.0:
+; FAST-NEXT: ld a0, 0(a0)
+; FAST-NEXT: ret
+ %res = load i64, ptr %p, align 1
+ ret i64 %res
+}
+
+define void @store_unaligned(ptr %p, i64 %v) {
+; SLOW-LABEL: store_unaligned:
+; SLOW: # %bb.0:
+; SLOW-NEXT: srli a3, a2, 24
+; SLOW-NEXT: srli a4, a2, 16
+; SLOW-NEXT: srli a5, a2, 8
+; SLOW-NEXT: srli a6, a1, 24
+; SLOW-NEXT: srli a7, a1, 16
+; SLOW-NEXT: sb a2, 4(a0)
+; SLOW-NEXT: sb a5, 5(a0)
+; SLOW-NEXT: sb a4, 6(a0)
+; SLOW-NEXT: sb a3, 7(a0)
+; SLOW-NEXT: srli a2, a1, 8
+; SLOW-NEXT: sb a1, 0(a0)
+; SLOW-NEXT: sb a2, 1(a0)
+; SLOW-NEXT: sb a7, 2(a0)
+; SLOW-NEXT: sb a6, 3(a0)
+; SLOW-NEXT: ret
+;
+; FAST-LABEL: store_unaligned:
+; FAST: # %bb.0:
+; FAST-NEXT: mv a3, a2
+; FAST-NEXT: mv a2, a1
+; FAST-NEXT: sd a2, 0(a0)
+; FAST-NEXT: ret
+ store i64 %v, ptr %p, align 1
+ ret void
+}
A handful of questions; the only one that is blocking is about extending loads, the others are not vital.
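"Extending loads" here presumably refers to i64-typed loads that read fewer than 64 bits, e.g. a zero-extending load that the DAG combiner forms from a narrower load plus an extension; such a load is not a full 64-bit access and should not be turned into a Zilsd ld. A hedged sketch of the kind of IR involved (the function name is made up):
; Hypothetical example: after DAG combining this can become an i64
; zero-extending load (zextload from i32), which only accesses 32 bits
; of memory and so must not be selected to a 64-bit Zilsd ld.
define i64 @zext_load(ptr %p) nounwind {
  %w = load i32, ptr %p, align 4
  %e = zext i32 %w to i64
  ret i64 %e
}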
; CHECK-NEXT: ld a2, 80(a0)
; CHECK-NEXT: ld a0, 0(a0)
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
This is correct, but do we need to adjust the spill weights to get something like
mv a5, a0
ld a0, 80(a5)
ld a2, 0(a5)
; CHECK-NEXT: mv a3, a2
; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: sd a2, 0(a0)
; CHECK-NEXT: sd a2, 88(a0)
This is correct and the moves are necessary due to the order of the parameters (the i64 isn't passed in an even-odd pair)
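To illustrate the point (a hypothetical variant, not part of the patch): under the ILP32 calling convention the i64 in store(ptr, i64) arrives in a1/a2, which is not an even/odd pair, hence the moves into a2/a3. With an extra i32 argument the i64 is expected to arrive in a2/a3 already, so the sd should be usable without the shuffling, assuming the register allocator keeps the incoming pair:
; Hypothetical variant: a0 = %a, a1 = %pad, and %b is expected in a2/a3,
; an even/odd pair that a Zilsd sd can consume directly.
define void @store_pair_aligned(ptr %a, i32 %pad, i64 %b) nounwind {
  store i64 %b, ptr %a, align 8
  ret void
}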
LGTM.
LGTM
✅ With the latest revision this PR passed the C/C++ code formatter.
For reference, on Arm (both 32-bit and 64-bit), we use a merged load/store for volatile operations, but non-volatile operations are optimized in post-isel passes. It's not necessarily bad to merge early, but it's harder to tell if merging is profitable.
The draft release notes still only mention assembler support, but this seems to be adding limited codegen.