Skip to content

[RISCV] Add combine for shadd family of instructions. #130829

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Mar 31, 2025
Merged
73 changes: 72 additions & 1 deletion llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/ValueTypes.h"
Expand Down Expand Up @@ -79,6 +80,12 @@ static cl::opt<int>
"use for creating a floating-point immediate value"),
cl::init(2));

static cl::opt<bool>
ReassocShlAddiAdd("reassoc-shl-addi-add", cl::Hidden,
cl::desc("Swap add and addi in cases where the add may "
"be combined with a shift"),
cl::init(true));

RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
Expand Down Expand Up @@ -14306,6 +14313,67 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT));
}

// Check if this SDValue is an add immediate that is fed by a shift of 1, 2,
// or 3.
static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other,
SelectionDAG &DAG) {
using namespace llvm::SDPatternMatch;

// Looking for a reg-reg add and not an addi.
if (isa<ConstantSDNode>(N->getOperand(1)))
return SDValue();

// Based on testing it seems that performance degrades if the ADDI has
// more than 2 uses.
if (AddI->use_size() > 2)
return SDValue();

APInt AddVal;
SDValue SHLVal;
if (!sd_match(AddI, m_Add(m_Value(SHLVal), m_ConstInt(AddVal))))
return SDValue();

APInt VShift;
if (!sd_match(SHLVal, m_BinOp(ISD::SHL, m_Value(), m_ConstInt(VShift))))
return SDValue();

if (VShift.slt(1) || VShift.sgt(3))
return SDValue();

SDLoc DL(N);
EVT VT = N->getValueType(0);
// The shift must be positive but the add can be signed.
uint64_t ShlConst = VShift.getZExtValue();
int64_t AddConst = AddVal.getSExtValue();

SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0),
DAG.getConstant(ShlConst, DL, VT), Other);
return DAG.getNode(ISD::ADD, DL, VT, SHADD,
DAG.getSignedConstant(AddConst, DL, VT));
}

// Optimize (add (add (shl x, c0), c1), y) ->
// (ADDI (SH*ADD y, x), c1), if c0 equals to [1|2|3].
static SDValue combineShlAddIAdd(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
// Perform this optimization only in the zba extension.
if (!ReassocShlAddiAdd || !Subtarget.hasStdExtZba())
return SDValue();

// Skip for vector types and larger types.
EVT VT = N->getValueType(0);
if (VT != Subtarget.getXLenVT())
return SDValue();

SDValue AddI = N->getOperand(0);
SDValue Other = N->getOperand(1);
if (SDValue V = combineShlAddIAddImpl(N, AddI, Other, DAG))
return V;
if (SDValue V = combineShlAddIAddImpl(N, Other, AddI, DAG))
return V;
return SDValue();
}

// Combine a constant select operand into its use:
//
// (and (select cond, -1, c), x)
Expand Down Expand Up @@ -14547,9 +14615,12 @@ static SDValue performADDCombine(SDNode *N,
return V;
if (SDValue V = transformAddImmMulImm(N, DAG, Subtarget))
return V;
if (!DCI.isBeforeLegalize() && !DCI.isCalledByLegalizer())
if (!DCI.isBeforeLegalize() && !DCI.isCalledByLegalizer()) {
if (SDValue V = transformAddShlImm(N, DAG, Subtarget))
return V;
if (SDValue V = combineShlAddIAdd(N, DAG, Subtarget))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't look at what transformAddShlImm() does, but it sounds like there is a possibility it would consume some SDAG pattern that you could optimize here. This probably doesn't require any changes but I just wanted to mention it to make sure you've considered the possible interactions and which one needs to run first.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point.
I would say that running transformAddShlImm() first might provide a little advantage. For example, given that c0 and c1 cooperate:

(add (addi (add (shl x, c0), (shl y, c1)), c2), z)
-- > Run transformAddShlImm() 
(add (addi (shl (sh*add x, y) c0), c2, z)
--> Run transformAddShlImm()
(addi (sh*add (sh*add x, y), z), c2)

Running transformAddShlImm() first would not really help because it produces sh*add and addi which are not helpful for the previous transformation.

I do believe that this is the order in which they run.

return V;
}
if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
return V;
if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
Expand Down
189 changes: 189 additions & 0 deletions llvm/test/CodeGen/RISCV/reassoc-shl-addi-add.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=riscv32-unknown-elf -mattr=+zba %s -o - | FileCheck %s

declare i32 @callee1(i32 noundef)
declare i32 @callee2(i32 noundef, i32 noundef)
declare i32 @callee(i32 noundef, i32 noundef, i32 noundef, i32 noundef)

define void @t1(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d) #0 {
; CHECK-LABEL: t1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sh2add a2, a0, a2
; CHECK-NEXT: sh2add a1, a0, a1
; CHECK-NEXT: addi a1, a1, 45
; CHECK-NEXT: addi a2, a2, 45
; CHECK-NEXT: sh2add a3, a0, a3
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: tail callee
entry:
%shl = shl i32 %a, 2
%add = add nsw i32 %shl, 45
%add1 = add nsw i32 %add, %b
%add3 = add nsw i32 %add, %c
%add5 = add nsw i32 %shl, %d
%call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add1, i32 noundef %add3, i32 noundef %add5)
ret void
}

define void @t2(i32 noundef %a, i32 noundef %b, i32 noundef %c) #0 {
; CHECK-LABEL: t2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: addi a5, a0, 42
; CHECK-NEXT: add a4, a5, a1
; CHECK-NEXT: add a3, a5, a2
; CHECK-NEXT: mv a1, a5
; CHECK-NEXT: mv a2, a4
; CHECK-NEXT: tail callee
entry:
%shl = shl i32 %a, 2
%add = add nsw i32 %shl, 42
%add4 = add nsw i32 %add, %b
%add7 = add nsw i32 %add, %c
%call = tail call i32 @callee(i32 noundef %shl, i32 noundef %add, i32 noundef %add4, i32 noundef %add7)
ret void
}

define void @t3(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e) #0 {
; CHECK-LABEL: t3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: addi a5, a0, 42
; CHECK-NEXT: add a0, a5, a1
; CHECK-NEXT: add a1, a5, a2
; CHECK-NEXT: add a2, a5, a3
; CHECK-NEXT: add a3, a5, a4
; CHECK-NEXT: tail callee
entry:
%shl = shl i32 %a, 2
%add = add nsw i32 %shl, 42
%add1 = add nsw i32 %add, %b
%add2 = add nsw i32 %add, %c
%add3 = add nsw i32 %add, %d
%add4 = add nsw i32 %add, %e
%call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add2, i32 noundef %add3, i32 noundef %add4)
ret void
}

define void @t4(i32 noundef %a, i32 noundef %b) #0 {
; CHECK-LABEL: t4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sh2add a0, a0, a1
; CHECK-NEXT: addi a0, a0, 42
; CHECK-NEXT: tail callee1
entry:
%shl = shl i32 %a, 2
%add = add nsw i32 %shl, 42
%add1 = add nsw i32 %add, %b
%call = tail call i32 @callee1(i32 noundef %add1)
ret void
}

define void @t5(i32 noundef %a, i32 noundef %b, i32 noundef %c) #0 {
; CHECK-LABEL: t5:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sh2add a2, a0, a2
; CHECK-NEXT: sh2add a0, a0, a1
; CHECK-NEXT: addi a0, a0, 42
; CHECK-NEXT: addi a1, a2, 42
; CHECK-NEXT: tail callee2
entry:
%shl = shl i32 %a, 2
%add = add nsw i32 %shl, 42
%add1 = add nsw i32 %add, %b
%add2 = add nsw i32 %add, %c
%call = tail call i32 @callee2(i32 noundef %add1, i32 noundef %add2)
ret void
}

define void @t6(i32 noundef %a, i32 noundef %b) #0 {
; CHECK-LABEL: t6:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: slli a2, a0, 2
; CHECK-NEXT: sh2add a0, a0, a1
; CHECK-NEXT: addi a0, a0, 42
; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: mv a3, a2
; CHECK-NEXT: tail callee
entry:
%shl = shl i32 %a, 2
%add = add nsw i32 %shl, 42
%add1 = add nsw i32 %add, %b
%call = tail call i32 @callee(i32 noundef %add1, i32 noundef %shl, i32 noundef %shl, i32 noundef %shl)
ret void
}

define void @t7(i32 noundef %a, i32 noundef %b) #0 {
; CHECK-LABEL: t7:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: addi a2, a0, 42
; CHECK-NEXT: add a0, a2, a1
; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: mv a3, a2
; CHECK-NEXT: tail callee
entry:
%shl = shl i32 %a, 2
%add = add nsw i32 %shl, 42
%add1 = add nsw i32 %add, %b
%call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add, i32 noundef %add, i32 noundef %add)
ret void
}

define void @t8(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d) #0 {
; CHECK-LABEL: t8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sh3add a2, a0, a2
; CHECK-NEXT: sh3add a1, a0, a1
; CHECK-NEXT: lui a4, 1
; CHECK-NEXT: addi a4, a4, 1307
; CHECK-NEXT: add a1, a1, a4
; CHECK-NEXT: add a2, a2, a4
; CHECK-NEXT: sh3add a3, a0, a3
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: tail callee
Comment on lines +135 to +144
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed that without this patch, this function would be:

        lui     a4, 1
        addi    a4, a4, 1307
        sh3add  a4, a0, a4
        add     a1, a4, a1
        add     a2, a4, a2
        sh3add  a3, a0, a3
        mv      a0, a1
        tail    callee

which is one instruction shorter than what you proposed here.

entry:
%shl = shl i32 %a, 3
%add = add nsw i32 %shl, 5403
%add1 = add nsw i32 %add, %b
%add3 = add nsw i32 %add, %c
%add5 = add nsw i32 %shl, %d
%call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add1, i32 noundef %add3, i32 noundef %add5)
ret void
}

define void @t9(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d) #0 {
; CHECK-LABEL: t9:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sh2add a2, a0, a2
; CHECK-NEXT: sh2add a1, a0, a1
; CHECK-NEXT: addi a1, a1, -42
; CHECK-NEXT: addi a2, a2, -42
; CHECK-NEXT: sh2add a3, a0, a3
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: tail callee
Comment on lines +158 to +164
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly, without this patch it would be:

        slli    a4, a0, 2
        addi    a4, a4, -42
        add     a1, a4, a1
        add     a2, a4, a2
        sh2add  a3, a0, a3
        mv      a0, a1
        tail    callee

Although they have the same number of instructions, is it always a good thing to use shXadd over plain ADD?

entry:
%shl = shl i32 %a, 2
%add = add nsw i32 %shl, -42
%add1 = add nsw i32 %add, %b
%add3 = add nsw i32 %add, %c
%add5 = add nsw i32 %shl, %d
%call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add1, i32 noundef %add3, i32 noundef %add5)
ret void
}

define void @t10(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d) #0 {
; CHECK-LABEL: t10:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: tail callee
entry:
%shl = shl i32 %a, -2
%add = add nsw i32 %shl, 42
%add1 = add nsw i32 %add, %b
%add3 = add nsw i32 %add, %c
%add5 = add nsw i32 %shl, %d
%call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add1, i32 noundef %add3, i32 noundef %add5)
ret void
}

attributes #0 = { nounwind optsize }