Skip to content

[NVPTX] Improve 64bit FSH/ROT lowering when shift amount is constant #131371

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryFence(N))
return;
break;
case NVPTXISD::UNPACK_VECTOR:
tryUNPACK_VECTOR(N);
return;
case ISD::EXTRACT_VECTOR_ELT:
if (tryEXTRACT_VECTOR_ELEMENT(N))
return;
Expand Down Expand Up @@ -445,6 +448,17 @@ bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
return true;
}

// Selects an NVPTXISD::UNPACK_VECTOR node: N is replaced by an I64toV2I32
// machine node that consumes N's first operand and yields two results, both
// typed like N's first result. Always reports success.
bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
  const SDLoc DL(N);
  const MVT HalfVT = N->getSimpleValueType(0);
  SDValue Packed = N->getOperand(0);

  // Both outputs of the unpacking move share the same element type.
  MachineSDNode *Unpacked =
      CurDAG->getMachineNode(NVPTX::I64toV2I32, DL, HalfVT, HalfVT, Packed);
  ReplaceNode(N, Unpacked);

  return true;
}

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryConstantFP(SDNode *N);
bool SelectSETP_F16X2(SDNode *N);
bool SelectSETP_BF16X2(SDNode *N);
bool tryUNPACK_VECTOR(SDNode *N);
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
Expand Down
69 changes: 68 additions & 1 deletion llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
#include <iterator>
#include <optional>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -668,8 +669,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
{MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
Expand);

if (STI.hasHWROT32())
if (STI.hasHWROT32()) {
setOperationAction({ISD::FSHL, ISD::FSHR}, MVT::i32, Legal);
setOperationAction({ISD::ROTL, ISD::ROTR, ISD::FSHL, ISD::FSHR}, MVT::i64,
Custom);
}

setOperationAction(ISD::BSWAP, MVT::i16, Expand);

Expand Down Expand Up @@ -1056,6 +1060,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::StoreRetvalV2)
MAKE_CASE(NVPTXISD::StoreRetvalV4)
MAKE_CASE(NVPTXISD::PseudoUseParam)
MAKE_CASE(NVPTXISD::UNPACK_VECTOR)
MAKE_CASE(NVPTXISD::BUILD_VECTOR)
MAKE_CASE(NVPTXISD::RETURN)
MAKE_CASE(NVPTXISD::CallSeqBegin)
MAKE_CASE(NVPTXISD::CallSeqEnd)
Expand Down Expand Up @@ -2758,6 +2764,61 @@ static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
}

// Expands a 64-bit funnel shift with a constant shift amount into two 32-bit
// funnel shifts over the unpacked halves of the inputs.
//
// A and B are the i64 data operands (A supplies the upper 64 bits of the
// 128-bit field, B the lower 64 bits), ShiftAmount is the shift amount, and
// Opcode is the funnel-shift opcode (ISD::FSHL or ISD::FSHR) that is also used
// for the two 32-bit nodes created here.
//
// Returns an i64 NVPTXISD::BUILD_VECTOR node combining the two 32-bit results,
// or an empty SDValue when ShiftAmount is not a ConstantSDNode.
// NOTE(review): the empty SDValue presumably makes the caller fall back to the
// default handling of the node - confirm against the custom-lowering contract.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL,
unsigned Opcode, SelectionDAG &DAG) {
assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);

// Only constant shift amounts are handled by this expansion.
const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
if (!AmtConst)
return SDValue();
// Reduce the amount modulo 64, the bit width of the operands.
const auto Amt = AmtConst->getZExtValue() & 63;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The shift argument is treated as an unsigned amount modulo the element size of the arguments. (source)

Interesting semantics. I would've guessed it's a bug to specify a number larger than the element size.


// Split each 64-bit input into two 32-bit values.
SDValue UnpackA =
DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
SDValue UnpackB =
DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);

// Arch is little-endian: result 0 = low bits, result 1 = high bits
SDValue ALo = UnpackA.getValue(0);
SDValue AHi = UnpackA.getValue(1);
SDValue BLo = UnpackB.getValue(0);
SDValue BHi = UnpackB.getValue(1);

// The bit field consists of { AHi : ALo : BHi : BLo }
//
// * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
// * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
// * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
// * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
//
// Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
// are not needed at all. Amt = 0 is a no-op producing either A or B depending
// on the direction. Amt = 32 can be implemented by a packing and unpacking
// move to select and arrange the 32bit values. For simplicity, these cases
// are not handled here explicitly and instead we rely on DAGCombiner to
// remove the no-op funnel shifts we insert.
auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
? std::make_tuple(AHi, ALo, BHi)
: std::make_tuple(ALo, BHi, BLo);

// Within the chosen 96-bit window only the amount modulo 32 remains.
SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});

// Repack with the low half first, matching the unpack order above.
return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
}

// Custom lowering entry for funnel-shift nodes: forwards the node's three
// operands, its location and its own opcode to expandFSH64.
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG) {
  const SDLoc Loc(Op);
  SDValue First = Op->getOperand(0);
  SDValue Second = Op->getOperand(1);
  SDValue Shift = Op->getOperand(2);
  return expandFSH64(First, Second, Shift, Loc, Op->getOpcode(), DAG);
}

// Custom lowering entry for rotate nodes. A rotate is handed to expandFSH64 as
// a funnel shift whose two data operands are the same value: ISD::ROTL maps to
// ISD::FSHL and any other opcode reaching here maps to ISD::FSHR.
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) {
  const SDLoc Loc(Op);
  SDValue Value = Op->getOperand(0);
  SDValue Shift = Op->getOperand(1);

  unsigned FunnelOpcode = ISD::FSHR;
  if (Op->getOpcode() == ISD::ROTL)
    FunnelOpcode = ISD::FSHL;

  return expandFSH64(Value, Value, Shift, Loc, FunnelOpcode, DAG);
}

SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
Expand Down Expand Up @@ -2818,6 +2879,12 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerVAARG(Op, DAG);
case ISD::VASTART:
return LowerVASTART(Op, DAG);
case ISD::FSHL:
case ISD::FSHR:
return lowerFSH(Op, DAG);
case ISD::ROTL:
case ISD::ROTR:
return lowerROT(Op, DAG);
case ISD::ABS:
case ISD::SMIN:
case ISD::SMAX:
Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,17 @@ enum NodeType : unsigned {
BFE,
BFI,
PRMT,

/// This node is similar to ISD::BUILD_VECTOR except that the output may be
/// implicitly bitcast to a scalar. This allows for the representation of
/// packing move instructions for vector types which are not legal, i.e. v2i32.
BUILD_VECTOR,

/// This node is the inverse of NVPTXISD::BUILD_VECTOR. It takes a single value
/// which may be a scalar and unpacks it into multiple values by implicitly
/// converting it to a vector.
UNPACK_VECTOR,

FCOPYSIGN,
DYNAMIC_STACKALLOC,
STACKRESTORE,
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -3222,6 +3222,12 @@ def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
def: Pat<(v2i16 (scalar_to_vector i16:$a)),
(CVT_u32_u16 $a, CvtNONE)>;


// DAG node definition for NVPTXISD::BUILD_VECTOR: one result, two operands,
// no type constraints and no node properties.
def nvptx_build_vector : SDNode<"NVPTXISD::BUILD_VECTOR", SDTypeProfile<1, 2, []>, []>;

// Select an i64 nvptx_build_vector of two i32 operands to V2I32toI64, passing
// the operands through in the same order.
def : Pat<(i64 (nvptx_build_vector i32:$a, i32:$b)),
(V2I32toI64 $a, $b)>;

//
// Funnel-Shift
//
Expand Down
Loading