-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[RISCV] Add combine for shadd family of instructions. #130829
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RISCV] Add combine for shadd family of instructions. #130829
Conversation
For example for the following situation: %6:gpr = SLLI %2:gpr, 2 %7:gpr = ADDI killed %6:gpr, 24 %8:gpr = ADD %0:gpr, %7:gpr If we swap the two add instrucions we can merge the shift and add. The final code will look something like this: %7 = SH2ADD %0, %2 %8 = ADDI %7, 24
@llvm/pr-subscribers-backend-risc-v Author: Stefan Pintilie (stefanp-synopsys) ChangesFor example for the following situation: If we swap the two add instrucions we can merge the shift and add. The final code will look something like this: Full diff: https://github.com/llvm/llvm-project/pull/130829.diff 2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 27a4bbce1f5fc..6334eab8c96ec 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -79,6 +79,12 @@ static cl::opt<int>
"use for creating a floating-point immediate value"),
cl::init(2));
+static cl::opt<bool>
+ ReassocShlAddiAdd("reassoc-shl-addi-add", cl::Hidden,
+ cl::desc("Swap add and addi in cases where the add may "
+ "be combined with a shift"),
+ cl::init(true));
+
RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
@@ -14306,6 +14312,87 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT));
}
+// Check if this SDValue is an add immediate and then
+static bool checkAddiForShift(SDValue AddI) {
+ // Based on testing it seems that performance degrades if the ADDI has
+ // more than 2 uses.
+ if (AddI->use_size() > 2)
+ return false;
+
+ ConstantSDNode *AddConst = dyn_cast<ConstantSDNode>(AddI->getOperand(1));
+ if (!AddConst)
+ return false;
+
+ SDValue SHLVal = AddI->getOperand(0);
+ if (SHLVal->getOpcode() != ISD::SHL)
+ return false;
+
+ ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(SHLVal->getOperand(1));
+ if (!ShiftNode)
+ return false;
+
+ auto ShiftVal = ShiftNode->getSExtValue();
+ if (ShiftVal != 1 && ShiftVal != 2 && ShiftVal != 3)
+ return false;
+
+ return true;
+}
+
+// Optimize (add (add (shl x, c0), c1), y) ->
+// (ADDI (SH*ADD y, x), c1), if c0 equals to [1|2|3].
+static SDValue combineShlAddIAdd(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ if (!ReassocShlAddiAdd)
+ return SDValue();
+
+ // Perform this optimization only in the zba extension.
+ if (!Subtarget.hasStdExtZba())
+ return SDValue();
+
+ // Skip for vector types and larger types.
+ EVT VT = N->getValueType(0);
+ if (VT.isVector() || VT.getSizeInBits() > Subtarget.getXLen())
+ return SDValue();
+
+ // Looking for a reg-reg add and not an addi.
+ auto *Op1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (Op1)
+ return SDValue();
+ SDValue AddI;
+ SDValue Other;
+
+ if (N->getOperand(0)->getOpcode() == ISD::ADD &&
+ N->getOperand(1)->getOpcode() == ISD::ADD) {
+ AddI = N->getOperand(0);
+ Other = N->getOperand(1);
+ if (!checkAddiForShift(AddI)) {
+ AddI = N->getOperand(1);
+ Other = N->getOperand(0);
+ }
+ } else if (N->getOperand(0)->getOpcode() == ISD::ADD) {
+ AddI = N->getOperand(0);
+ Other = N->getOperand(1);
+ } else if (N->getOperand(1)->getOpcode() == ISD::ADD) {
+ AddI = N->getOperand(1);
+ Other = N->getOperand(0);
+ } else
+ return SDValue();
+
+ if (!checkAddiForShift(AddI))
+ return SDValue();
+
+ auto *AddConst = dyn_cast<ConstantSDNode>(AddI->getOperand(1));
+ SDValue SHLVal = AddI->getOperand(0);
+ auto *ShiftNode = dyn_cast<ConstantSDNode>(SHLVal->getOperand(1));
+ auto ShiftVal = ShiftNode->getSExtValue();
+ SDLoc DL(N);
+
+ SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0),
+ DAG.getConstant(ShiftVal, DL, VT), Other);
+ return DAG.getNode(ISD::ADD, DL, VT, SHADD,
+ DAG.getConstant(AddConst->getSExtValue(), DL, VT));
+}
+
// Combine a constant select operand into its use:
//
// (and (select cond, -1, c), x)
@@ -14547,9 +14634,12 @@ static SDValue performADDCombine(SDNode *N,
return V;
if (SDValue V = transformAddImmMulImm(N, DAG, Subtarget))
return V;
- if (!DCI.isBeforeLegalize() && !DCI.isCalledByLegalizer())
+ if (!DCI.isBeforeLegalize() && !DCI.isCalledByLegalizer()) {
if (SDValue V = transformAddShlImm(N, DAG, Subtarget))
return V;
+ if (SDValue V = combineShlAddIAdd(N, DAG, Subtarget))
+ return V;
+ }
if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
return V;
if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
diff --git a/llvm/test/CodeGen/RISCV/reassoc-shl-addi-add.ll b/llvm/test/CodeGen/RISCV/reassoc-shl-addi-add.ll
new file mode 100644
index 0000000000000..e1fa408706c4e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/reassoc-shl-addi-add.ll
@@ -0,0 +1,113 @@
+; RUN: llc -mtriple=riscv32-pc-unknown-gnu -mattr=+zba %s -o - | FileCheck %s
+
+declare dso_local i32 @callee1(i32 noundef) local_unnamed_addr
+declare dso_local i32 @callee2(i32 noundef, i32 noundef) local_unnamed_addr
+declare dso_local i32 @callee(i32 noundef, i32 noundef, i32 noundef, i32 noundef) local_unnamed_addr
+
+; CHECK-LABEL: t1:
+; CHECK: sh2add
+; CHECK: sh2add
+; CHECK: sh2add
+; CHECK: tail callee
+
+define dso_local void @t1(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d) local_unnamed_addr #0 {
+entry:
+ %shl = shl i32 %a, 2
+ %add = add nsw i32 %shl, 45
+ %add1 = add nsw i32 %add, %b
+ %add3 = add nsw i32 %add, %c
+ %add5 = add nsw i32 %shl, %d
+ %call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add1, i32 noundef %add3, i32 noundef %add5)
+ ret void
+}
+
+; CHECK-LABEL: t2:
+; CHECK: slli
+; CHECK-NOT: sh2add
+; CHECK: tail callee
+
+define dso_local void @t2(i32 noundef %a, i32 noundef %b, i32 noundef %c) local_unnamed_addr #0 {
+entry:
+ %shl = shl i32 %a, 2
+ %add = add nsw i32 %shl, 42
+ %add4 = add nsw i32 %add, %b
+ %add7 = add nsw i32 %add, %c
+ %call = tail call i32 @callee(i32 noundef %shl, i32 noundef %add, i32 noundef %add4, i32 noundef %add7)
+ ret void
+}
+
+; CHECK-LABEL: t3
+; CHECK slli
+; CHECK-NOT: sh2add
+; CHECK: tail callee
+
+define dso_local void @t3(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e) local_unnamed_addr #0 {
+entry:
+ %shl = shl i32 %a, 2
+ %add = add nsw i32 %shl, 42
+ %add1 = add nsw i32 %add, %b
+ %add2 = add nsw i32 %add, %c
+ %add3 = add nsw i32 %add, %d
+ %add4 = add nsw i32 %add, %e
+ %call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add2, i32 noundef %add3, i32 noundef %add4)
+ ret void
+}
+
+; CHECK-LABEL: t4
+; CHECK: sh2add
+; CHECK-NEXT: addi
+; CHECK-NEXT: tail callee1
+
+define dso_local void @t4(i32 noundef %a, i32 noundef %b) local_unnamed_addr #0 {
+entry:
+ %shl = shl i32 %a, 2
+ %add = add nsw i32 %shl, 42
+ %add1 = add nsw i32 %add, %b
+ %call = tail call i32 @callee1(i32 noundef %add1)
+ ret void
+}
+
+; CHECK-LABEL: t5
+; CHECK: sh2add
+; CHECK: sh2add
+; CHECK: tail callee2
+
+define dso_local void @t5(i32 noundef %a, i32 noundef %b, i32 noundef %c) local_unnamed_addr #0 {
+entry:
+ %shl = shl i32 %a, 2
+ %add = add nsw i32 %shl, 42
+ %add1 = add nsw i32 %add, %b
+ %add2 = add nsw i32 %add, %c
+ %call = tail call i32 @callee2(i32 noundef %add1, i32 noundef %add2)
+ ret void
+}
+
+; CHECK-LABEL: t6
+; CHECK-DAG: sh2add
+; CHECK-DAG: slli
+; CHECK: tail callee
+
+define dso_local void @t6(i32 noundef %a, i32 noundef %b) local_unnamed_addr #0 {
+entry:
+ %shl = shl i32 %a, 2
+ %add = add nsw i32 %shl, 42
+ %add1 = add nsw i32 %add, %b
+ %call = tail call i32 @callee(i32 noundef %add1, i32 noundef %shl, i32 noundef %shl, i32 noundef %shl)
+ ret void
+}
+
+; CHECK-LABEL: t7
+; CHECK: slli
+; CHECK-NOT: sh2add
+; CHECK: tail callee
+
+define dso_local void @t7(i32 noundef %a, i32 noundef %b) local_unnamed_addr #0 {
+entry:
+ %shl = shl i32 %a, 2
+ %add = add nsw i32 %shl, 42
+ %add1 = add nsw i32 %add, %b
+ %call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add, i32 noundef %add, i32 noundef %add)
+ ret void
+}
+
+attributes #0 = { nounwind optsize }
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice optimisation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Semantically, this LGTM but it can probably be simplified a bit.
SDValue AddI; | ||
SDValue Other; | ||
|
||
if (N->getOperand(0)->getOpcode() == ISD::ADD && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This whole conditional structure seems unnecessarily complicated. Can it not just be something like:
SDValue AddI = N->getOperand(0);
SDValue Other = N->getOperand(1);
bool LHSIsAdd = Addi.getOpcode() == ISD::ADD;
bool RHSIsAdd = Other.getOpcode() == ISD::ADD;
// If the RHS is the result of an add or both sides are results of an add, but the
// LHS does not have the desired structure with a shift, swap the operands.
if ((!LHSIsAdd && RHSIsAdd) || (LHSIsAdd && RHSIsAdd && !checkAddiForShift(AddI))
std::swap(AddI, Other);
// Now if either side is the result of an add, we simply need to ensure AddI has
// the desired structure.
if (!(LHSIsAdd || RHSIsAdd) || !checkAddiForShift(AddI))
return SDValue();
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
or uses SDPatternMatch
SDValue Other;
SDValue AddI;
if (sd_match(N, m_Add(m_Value(Other), m_AllOf(m_Add(m_Value(), m_ConstantInt()),
m_Value(AddI)))))
...
If I remember correctly m_Add
is commutative by default, or you can uses m_c_BinOp(ISD::ADD)
to match commutative ADD explicitly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I just saw this after I updated the patch.
I have done what nemanjai suggested. However, let me also take a look at SDPatternMatch to see if it will help.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have added the use of SDPatternMatch.
However, there is a bit of an issue. The file already makes use of PatternMatch which contains a lot of the same functions as SDPatternMatch. Calling these functions becomes ambiguous because the compiler doesn't know which version of the function to call. For example m_Value()
. As a result I cannot simply add:
using namespace llvm::PatternMatch;
using namespace llvm::SDPatternMatch;
To get this working I was forced to add SDPatternMatch::
in front of every function that is being used from that namespace and omit the using namespace
line. This creates quite a bit of clutter in the code and I'm not sure if it is worth it.
Thoughts? @mshockwave @nemanjai
Also, if we do decide to go with SDPatternMatch there is more I can do with it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add using namespace llvm::SDPatternMatch;
locally to the function that needs it?
if (SDValue V = transformAddShlImm(N, DAG, Subtarget)) | ||
return V; | ||
if (SDValue V = combineShlAddIAdd(N, DAG, Subtarget)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't look at what transformAddShlImm()
does, but it sounds like there is a possibility it would consume some SDAG pattern that you could optimize here. This probably doesn't require any changes but I just wanted to mention it to make sure you've considered the possible interactions and which one needs to run first.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's a good point.
I would say that running transformAddShlImm()
first might provide a little advantage. For example, given that c0 and c1 cooperate:
(add (addi (add (shl x, c0), (shl y, c1)), c2), z)
-- > Run transformAddShlImm()
(add (addi (shl (sh*add x, y) c0), c2, z)
--> Run transformAddShlImm()
(addi (sh*add (sh*add x, y), z), c2)
Running transformAddShlImm()
first would not really help because it produces sh*add
and addi
which are not helpful for the previous transformation.
I do believe that this is the order in which they run.
Also, cleaned up and auto generated the test case.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, but of course, please consider the SDPattern suggestion that another reviewer has suggested.
auto *AddConstNode = dyn_cast<ConstantSDNode>(AddI->getOperand(1)); | ||
if (!AddConstNode) | ||
return false; | ||
AddConst = AddConstNode->getSExtValue(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor nit: please defer actually setting AddConst
and ShlConst
to the end, just before return true
. This way we're only modifying the output parameters if the query is successful.
int64_t AddConst; | ||
int64_t ShlConst; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor nit: I suspect that at least for some compilers, this may produce "Variable used before it was set" warnings. Maybe initialize them to safe values (presumably zeros).
int64_t ShlConst; | ||
|
||
// At least one add is required. | ||
if (!(LHSIsAdd || RHSIsAdd)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (!LHSIsAdd && !RHSIsAdd)
|
||
// If the LHS is not the result of an add or both sides are results of an add, but | ||
// the LHS does not have the desired structure with a shift, swap the operands. | ||
if (!LHSIsAdd || (LHSIsAdd && RHSIsAdd && !checkAddiForShift(AddI, AddConst, ShlConst))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The LHSIsAdd
isn't needed in (LHSIsAdd && RHSIsAdd && !checkAddiForShift(AddI, AddConst, ShlConst))
.
|
||
// Skip for vector types and larger types. | ||
EVT VT = N->getValueType(0); | ||
if (VT.isVector() || VT.getSizeInBits() > Subtarget.getXLen()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we just check if VT != Subtarget.getXLenVT()
? I believe the !DCI.isBeforeLegalize()
earlier guranteed that the only scalar type than can get here is XLenVT.
✅ With the latest revision this PR passed the C/C++ code formatter. |
|
||
SDValue AddI = N->getOperand(0); | ||
SDValue Other = N->getOperand(1); | ||
bool LHSIsAddI = SDPatternMatch::sd_match( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can put a using namespace SDPatternMatch;
in this function body and drop all the prefixes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry I wrote this before I saw your comment about why you had to prefix everything.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No worries!
Yes, I have the same issue if I move the using namespace SDPatternMatch;
into the function body.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removing "using namespace llvm::PatternMatch;" from the top of the file and adding "using namespace SDPatternMatch;" in this function worked for me.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay! I have fixed the way I was doing SDPatternMatch and I think everything works now. Please take another look and let me know. @topperc @mshockwave
@@ -50,6 +51,7 @@ | |||
#include <optional> | |||
|
|||
using namespace llvm; | |||
using namespace llvm::PatternMatch; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why did you add this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh wow! I may have just create this problem for myself. Let me remove that line.
Thank you!
@@ -14325,10 +14325,11 @@ static bool checkAddiForShift(SDValue AddI, int64_t &AddConst, | |||
|
|||
APInt AddVal; | |||
SDValue SHLVal; | |||
sd_match(AddI, m_Add(m_Value(SHLVal), m_ConstInt(AddVal))); | |||
assert(sd_match(AddI, m_Add(m_Value(SHLVal), m_ConstInt(AddVal))) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This entire sd_match statement will not be executed if LLVM is not built with assertion enabled (e.g. normal Release build). I would recommend something like this:
bool IsAddI = sd_match(AddI, ...);
assert(IsAddI && "...");
(void)IsAddI;
… the major processing inside of it.
gentle ping I believe that I have resolved all of the comments on this issue. I will wait a couple more days for feedback and if I see nothing I will squash and merge. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
For example for the following situation: %6:gpr = SLLI %2:gpr, 2 %7:gpr = ADDI killed %6:gpr, 24 %8:gpr = ADD %0:gpr, %7:gpr If we swap the two add instrucions we can merge the shift and add. The final code will look something like this: %7 = SH2ADD %0, %2 %8 = ADDI %7, 24
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@stefanp-synopsys I left some comments w.r.t the profitability of this transformation. Also, could you clarify your motivation again? The example you gave in the patch description:
define i32 @foo(i32 %x, i32 %y) {
%1 = shl i32 %x, 2
%2 = add i32 %1, 24
%3 = add i32 %y, %2
ret i32 %3
}
Already yielded the code you desired:
sh2add a0, a0, a1
addi a0, a0, 24
even without this patch -- I believed it has been done through DAGCombiner's existing reassociate optimizations.
Your patch hinted that you want to use shXadd even when the ADDI is used more than one time (which was not covered by the reassociate optimization I mentioned). But function t8
in your test case doesn't seem to agree with it.
; CHECK: # %bb.0: # %entry | ||
; CHECK-NEXT: sh3add a2, a0, a2 | ||
; CHECK-NEXT: sh3add a1, a0, a1 | ||
; CHECK-NEXT: lui a4, 1 | ||
; CHECK-NEXT: addi a4, a4, 1307 | ||
; CHECK-NEXT: add a1, a1, a4 | ||
; CHECK-NEXT: add a2, a2, a4 | ||
; CHECK-NEXT: sh3add a3, a0, a3 | ||
; CHECK-NEXT: mv a0, a1 | ||
; CHECK-NEXT: tail callee |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I noticed that without this patch, this function would be:
lui a4, 1
addi a4, a4, 1307
sh3add a4, a0, a4
add a1, a4, a1
add a2, a4, a2
sh3add a3, a0, a3
mv a0, a1
tail callee
which is one instruction shorter than what you proposed here.
; CHECK-NEXT: sh2add a2, a0, a2 | ||
; CHECK-NEXT: sh2add a1, a0, a1 | ||
; CHECK-NEXT: addi a1, a1, -42 | ||
; CHECK-NEXT: addi a2, a2, -42 | ||
; CHECK-NEXT: sh2add a3, a0, a3 | ||
; CHECK-NEXT: mv a0, a1 | ||
; CHECK-NEXT: tail callee |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Similarly, without this patch it would be:
slli a4, a0, 2
addi a4, a4, -42
add a1, a4, a1
add a2, a4, a2
sh2add a3, a0, a3
mv a0, a1
tail callee
Although they have the same number of instructions, is it always a good thing to use shXadd over plain ADD?
Try to fix #130829 (review). There's no benefit if shl doesn't have one use.
…use (#143351) Try to fix llvm/llvm-project#130829 (review). There's no benefit if shl doesn't have one use.
…43351) Try to fix llvm#130829 (review). There's no benefit if shl doesn't have one use.
…43351) Try to fix llvm#130829 (review). There's no benefit if shl doesn't have one use.
…43351) Try to fix llvm#130829 (review). There's no benefit if shl doesn't have one use.
For example for the following situation:
%6:gpr = SLLI %2:gpr, 2
%7:gpr = ADDI killed %6:gpr, 24
%8:gpr = ADD %0:gpr, %7:gpr
If we swap the two add instrucions we can merge the shift and add. The final code will look something like this:
%7 = SH2ADD %0, %2
%8 = ADDI %7, 24