[NVPTX] Improve 64bit FSH/ROT lowering when shift amount is constant #131371
Conversation
@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

When the shift amount of a 64-bit funnel-shift or rotate is constant, it may be decomposed into two 32-bit funnel-shifts. This ensures that we recover any possible performance losses associated with the correctness fix in a131fbf.

Patch is 22.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131371.diff

7 Files Affected:
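As a purely illustrative, standalone C++ sketch (the helper names fshl32 and fshl64_const are hypothetical, not from the patch), the FSHL case of the decomposition looks like this when written over plain integers instead of SelectionDAG nodes:

```cpp
#include <cstdint>

// 32-bit funnel shift left, modelling what shf.l.wrap.b32 computes.
static uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
  Amt &= 31;
  return Amt ? (Hi << Amt) | (Lo >> (32 - Amt)) : Hi;
}

// 64-bit fshl with a constant amount, decomposed into two 32-bit funnel
// shifts over the halves of A and B, mirroring the window selection done by
// expandFSH64 in the diff below.
static uint64_t fshl64_const(uint64_t A, uint64_t B, unsigned Amt) {
  Amt &= 63;
  const uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  const uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  // The shifted window slides over the bitfield { AHi : ALo : BHi : BLo }.
  const uint32_t Hi  = Amt < 32 ? AHi : ALo;
  const uint32_t Mid = Amt < 32 ? ALo : BHi;
  const uint32_t Lo  = Amt < 32 ? BHi : BLo;
  const uint32_t RHi = fshl32(Hi, Mid, Amt & 31);
  const uint32_t RLo = fshl32(Mid, Lo, Amt & 31);
  return (uint64_t(RHi) << 32) | RLo;
}
```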
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 4ce8c508c5f2b..f2757c5e49b33 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -113,6 +113,9 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryFence(N))
return;
break;
+ case NVPTXISD::UNPACK_VECTOR:
+ tryUNPACK_VECTOR(N);
+ return;
case ISD::EXTRACT_VECTOR_ELT:
if (tryEXTRACT_VECTOR_ELEMENT(N))
return;
@@ -445,6 +448,17 @@ bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
return true;
}
+bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
+ SDValue Vector = N->getOperand(0);
+ MVT EltVT = N->getSimpleValueType(0);
+
+ MachineSDNode *N2 =
+ CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
+
+ ReplaceNode(N, N2);
+ return true;
+}
+
// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 42891b8ca8d8d..23cbd458571a0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -88,6 +88,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryConstantFP(SDNode *N);
bool SelectSETP_F16X2(SDNode *N);
bool SelectSETP_BF16X2(SDNode *N);
+ bool tryUNPACK_VECTOR(SDNode *N);
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b768725b04256..d44ba72ff98c9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -66,6 +66,7 @@
#include <iterator>
#include <optional>
#include <string>
+#include <tuple>
#include <utility>
#include <vector>
@@ -668,8 +669,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
{MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
Expand);
- if (STI.hasHWROT32())
+ if (STI.hasHWROT32()) {
setOperationAction({ISD::FSHL, ISD::FSHR}, MVT::i32, Legal);
+ setOperationAction({ISD::ROTL, ISD::ROTR, ISD::FSHL, ISD::FSHR}, MVT::i64,
+ Custom);
+ }
setOperationAction(ISD::BSWAP, MVT::i16, Expand);
@@ -1056,6 +1060,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::StoreRetvalV2)
MAKE_CASE(NVPTXISD::StoreRetvalV4)
MAKE_CASE(NVPTXISD::PseudoUseParam)
+ MAKE_CASE(NVPTXISD::UNPACK_VECTOR)
+ MAKE_CASE(NVPTXISD::BUILD_VECTOR)
MAKE_CASE(NVPTXISD::RETURN)
MAKE_CASE(NVPTXISD::CallSeqBegin)
MAKE_CASE(NVPTXISD::CallSeqEnd)
@@ -2758,6 +2764,53 @@ static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
}
+static SDValue expandFSH64(SDValue A, SDValue B, SDValue AmtVal, SDLoc DL,
+ unsigned Opcode, SelectionDAG &DAG) {
+ assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
+
+ const auto *AmtConst = dyn_cast<ConstantSDNode>(AmtVal);
+ if (!AmtConst)
+ return SDValue();
+ const auto Amt = AmtConst->getZExtValue() & 63;
+
+ SDValue UnpackA =
+ DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
+ SDValue UnpackB =
+ DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
+
+ // Arch is little endian: 0 = low bits, 1 = high bits
+ SDValue ALo = UnpackA.getValue(0);
+ SDValue AHi = UnpackA.getValue(1);
+ SDValue BLo = UnpackB.getValue(0);
+ SDValue BHi = UnpackB.getValue(1);
+
+ // The bitfield consists of { AHi : ALo : BHi : BLo }
+ // FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
+ // FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
+ // FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
+ // FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
+ auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
+ ? std::make_tuple(AHi, ALo, BHi)
+ : std::make_tuple(ALo, BHi, BLo);
+
+ SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
+ SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
+ SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
+
+ return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
+}
+
+static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG) {
+ return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
+ SDLoc(Op), Op->getOpcode(), DAG);
+}
+
+static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) {
+ unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
+ return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
+ SDLoc(Op), Opcode, DAG);
+}
+
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -2818,6 +2871,12 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerVAARG(Op, DAG);
case ISD::VASTART:
return LowerVASTART(Op, DAG);
+ case ISD::FSHL:
+ case ISD::FSHR:
+ return lowerFSH(Op, DAG);
+ case ISD::ROTL:
+ case ISD::ROTR:
+ return lowerROT(Op, DAG);
case ISD::ABS:
case ISD::SMIN:
case ISD::SMAX:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index ff0241886223b..152fe253eeed9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -61,6 +61,8 @@ enum NodeType : unsigned {
BFE,
BFI,
PRMT,
+ UNPACK_VECTOR,
+ BUILD_VECTOR,
FCOPYSIGN,
DYNAMIC_STACKALLOC,
STACKRESTORE,
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3c88551d7b23c..83509b1078c57 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3222,6 +3222,12 @@ def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
def: Pat<(v2i16 (scalar_to_vector i16:$a)),
(CVT_u32_u16 $a, CvtNONE)>;
+
+def nvptx_build_vector : SDNode<"NVPTXISD::BUILD_VECTOR", SDTypeProfile<1, 2, []>, []>;
+
+def : Pat<(i64 (nvptx_build_vector i32:$a, i32:$b)),
+ (V2I32toI64 $a, $b)>;
+
//
// Funnel-Shift
//
diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 4174fd2f3ec2c..5e684953fe5d5 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -198,34 +198,94 @@ define i64 @rotl64(i64 %a, i64 %n) {
}
; SM35: rotl64_imm
-define i64 @rotl64_imm(i64 %a) {
-; SM20-LABEL: rotl64_imm(
+define i64 @rotl64_low_imm(i64 %a) {
+; SM20-LABEL: rotl64_low_imm(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
-; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_low_imm_param_0];
; SM20-NEXT: shr.u64 %rd2, %rd1, 62;
; SM20-NEXT: shl.b64 %rd3, %rd1, 2;
; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
; SM20-NEXT: ret;
;
-; SM35-LABEL: rotl64_imm(
+; SM35-LABEL: rotl64_low_imm(
; SM35: {
-; SM35-NEXT: .reg .b64 %rd<5>;
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
-; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
-; SM35-NEXT: shr.u64 %rd2, %rd1, 62;
-; SM35-NEXT: shl.b64 %rd3, %rd1, 2;
-; SM35-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM35-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_low_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r2, 2;
+; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, 2;
+; SM35-NEXT: mov.b64 %rd2, {%r4, %r3};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
ret i64 %val
}
+define i64 @rotl64_high_imm(i64 %a) {
+; SM20-LABEL: rotl64_high_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_high_imm_param_0];
+; SM20-NEXT: shr.u64 %rd2, %rd1, 1;
+; SM20-NEXT: shl.b64 %rd3, %rd1, 63;
+; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl64_high_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_high_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: shf.l.wrap.b32 %r3, %r2, %r1, 31;
+; SM35-NEXT: shf.l.wrap.b32 %r4, %r1, %r2, 31;
+; SM35-NEXT: mov.b64 %rd2, {%r4, %r3};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63)
+ ret i64 %val
+}
+
+define i64 @rotl64_32_imm(i64 %a) {
+; SM20-LABEL: rotl64_32_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_32_imm_param_0];
+; SM20-NEXT: shr.u64 %rd2, %rd1, 32;
+; SM20-NEXT: shl.b64 %rd3, %rd1, 32;
+; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl64_32_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<3>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_32_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: mov.b64 %rd2, {%r2, %r1};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 32)
+ ret i64 %val
+}
+
; SM35: rotr64
define i64 @rotr64(i64 %a, i64 %n) {
; SM20-LABEL: rotr64(
@@ -266,31 +326,91 @@ define i64 @rotr64(i64 %a, i64 %n) {
}
; SM35: rotr64_imm
-define i64 @rotr64_imm(i64 %a) {
-; SM20-LABEL: rotr64_imm(
+define i64 @rotr64_low_imm(i64 %a) {
+; SM20-LABEL: rotr64_low_imm(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
-; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
-; SM20-NEXT: shl.b64 %rd2, %rd1, 62;
-; SM20-NEXT: shr.u64 %rd3, %rd1, 2;
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_low_imm_param_0];
+; SM20-NEXT: shl.b64 %rd2, %rd1, 52;
+; SM20-NEXT: shr.u64 %rd3, %rd1, 12;
; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
; SM20-NEXT: ret;
;
-; SM35-LABEL: rotr64_imm(
+; SM35-LABEL: rotr64_low_imm(
; SM35: {
-; SM35-NEXT: .reg .b64 %rd<5>;
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
-; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
-; SM35-NEXT: shl.b64 %rd2, %rd1, 62;
-; SM35-NEXT: shr.u64 %rd3, %rd1, 2;
-; SM35-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM35-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_low_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: shf.r.wrap.b32 %r3, %r2, %r1, 12;
+; SM35-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, 12;
+; SM35-NEXT: mov.b64 %rd2, {%r4, %r3};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
; SM35-NEXT: ret;
- %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
+ %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 12)
+ ret i64 %val
+}
+
+define i64 @rotr64_high_imm(i64 %a) {
+; SM20-LABEL: rotr64_high_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_high_imm_param_0];
+; SM20-NEXT: shl.b64 %rd2, %rd1, 21;
+; SM20-NEXT: shr.u64 %rd3, %rd1, 43;
+; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotr64_high_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_high_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: shf.r.wrap.b32 %r3, %r1, %r2, 11;
+; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, 11;
+; SM35-NEXT: mov.b64 %rd2, {%r4, %r3};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 43)
+ ret i64 %val
+}
+
+define i64 @rotr64_32_imm(i64 %a) {
+; SM20-LABEL: rotr64_32_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_32_imm_param_0];
+; SM20-NEXT: shl.b64 %rd2, %rd1, 32;
+; SM20-NEXT: shr.u64 %rd3, %rd1, 32;
+; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotr64_32_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<3>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_32_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: mov.b64 %rd2, {%r2, %r1};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 32)
ret i64 %val
}
@@ -446,3 +566,194 @@ define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) {
ret i64 %val
}
+define i64 @fshl64_low_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshl64_low_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<6>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshl64_low_imm_param_0];
+; SM20-NEXT: ld.param.u64 %rd2, [fshl64_low_imm_param_1];
+; SM20-NEXT: shr.u64 %rd3, %rd2, 59;
+; SM20-NEXT: shl.b64 %rd4, %rd1, 5;
+; SM20-NEXT: or.b64 %rd5, %rd4, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd5;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshl64_low_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<7>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshl64_low_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshl64_low_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: shf.l.wrap.b32 %r5, %r4, %r1, 5;
+; SM35-NEXT: shf.l.wrap.b32 %r6, %r1, %r2, 5;
+; SM35-NEXT: mov.b64 %rd3, {%r5, %r6};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 5)
+ ret i64 %val
+}
+
+define i64 @fshl64_high_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshl64_high_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<6>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshl64_high_imm_param_0];
+; SM20-NEXT: ld.param.u64 %rd2, [fshl64_high_imm_param_1];
+; SM20-NEXT: shr.u64 %rd3, %rd2, 9;
+; SM20-NEXT: shl.b64 %rd4, %rd1, 55;
+; SM20-NEXT: or.b64 %rd5, %rd4, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd5;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshl64_high_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<7>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshl64_high_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshl64_high_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: shf.l.wrap.b32 %r5, %r4, %r1, 23;
+; SM35-NEXT: shf.l.wrap.b32 %r6, %r3, %r4, 23;
+; SM35-NEXT: mov.b64 %rd3, {%r6, %r5};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 55)
+ ret i64 %val
+}
+
+define i64 @fshl64_32_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshl64_32_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshl64_32_imm_param_0];
+; SM20-NEXT: shl.b64 %rd2, %rd1, 32;
+; SM20-NEXT: ld.param.u32 %rd3, [fshl64_32_imm_param_1+4];
+; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshl64_32_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshl64_32_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshl64_32_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: mov.b64 %rd3, {%r4, %r1};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 32)
+ ret i64 %val
+}
+
+define i64 @fshr64_low_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshr64_low_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<6>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshr64_low_imm_param_0];
+; SM20-NEXT: ld.param.u64 %rd2, [fshr64_low_imm_param_1];
+; SM20-NEXT: shr.u64 %rd3, %rd2, 31;
+; SM20-NEXT: shl.b64 %rd4, %rd1, 33;
+; SM20-NEXT: or.b64 %rd5, %rd4, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd5;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshr64_low_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<7>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshr64_low_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshr64_low_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: shf.r.wrap.b32 %r5, %r4, %r1, 31;
+; SM35-NEXT: shf.r.wrap.b32 %r6, %r3, %r4, 31;
+; SM35-NEXT: mov.b64 %rd3, {%r6, %r5};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 31)
+ ret i64 %val
+}
+
+define i64 @fshr64_high_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshr64_high_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<6>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshr64_high_imm_param_0];
+; SM20-NEXT: ld.param.u64 %rd2, [fshr64_high_imm_param_1];
+; SM20-NEXT: shr.u64 %rd3, %rd2, 33;
+; SM20-NEXT: shl.b64 %rd4, %rd1, 31;
+; SM20-NEXT: or.b64 %rd5, %rd4, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd5;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshr64_high_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<7>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshr64_high_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshr64_high_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: shf.r.wrap.b32 %r5, %r4, %r1, 1;
+; SM35-NEXT: shf.r.wrap.b32 %r6, %r1, %r2, 1;
+; SM35-NEXT: mov.b64 %rd3, {%r5, %r6};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 33)
+ ret i64 %val
+}
+
+define i64 @fshr64_32_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshr64_32_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshr64_32_imm_param_0];
+; SM20-NEXT: shl.b64 %rd2, %rd1, 32;
+; SM20-NEXT: ld.param.u32 %rd3, [fshr64_32_imm_param_1+4];
+; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshr64_32_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshr64_32_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshr64_32_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: mov.b64 %rd3, {%r4, %r1};
+; SM35-NEXT: st.param.b64 [func_r...
[truncated]
48df919 to 57b9c81 (Compare)
Can you add an additional section to your PR description where you describe why we need new NVPTXISD nodes to handle this case, and under what conditions we can begin using ISD nodes and perhaps even port this to LegalizeDAG?
Also, which test case is the regression?
const auto *AmtConst = dyn_cast<ConstantSDNode>(AmtVal);
if (!AmtConst)
  return SDValue();
const auto Amt = AmtConst->getZExtValue() & 63;
The shift argument is treated as an unsigned amount modulo the element size of the arguments. (source)
Interesting semantics. I would've guessed it's a bug to specify a number larger than the element size.
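A tiny standalone check (not from the patch; fshl64_ref is a hypothetical reference helper) illustrating the quoted semantics: an i64 funnel-shift amount of 66 behaves exactly like an amount of 2, which is what allows expandFSH64 to mask the constant with `& 63` up front.

```cpp
#include <cassert>
#include <cstdint>

// Reference i64 funnel shift left; per the LangRef, the amount is taken
// modulo the element size (64 here).
static uint64_t fshl64_ref(uint64_t A, uint64_t B, unsigned Amt) {
  Amt &= 63;
  return Amt ? (A << Amt) | (B >> (64 - Amt)) : A;
}

int main() {
  const uint64_t A = 0xDEADBEEFCAFEF00DULL;
  // The amount 66 used by rotl64_low_imm above reduces to 2.
  assert(fshl64_ref(A, A, 66) == fshl64_ref(A, A, 2));
  return 0;
}
```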
Added, also added a comment in the code.
Every test case that changed shows improvement. For example, in llvm/test/CodeGen/NVPTX/rotate_64.ll,
Good point, I forgot that
LGTM with a couple of nits.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/10/builds/1634. Here is the relevant piece of the build log for reference:
When the shift amount of a 64-bit funnel-shift or rotate is constant, it may be decomposed into two 32-bit funnel-shifts. This ensures that we recover any possible performance losses associated with the correctness fix in a131fbf.
In order to efficiently represent the expansion with SelectionDAG nodes, NVPTXISD::BUILD_VECTOR and NVPTXISD::UNPACK_VECTOR are added, which allow the vector output/input to be represented as a scalar. In the future, if we add support for the v2i32 type to the NVPTX backend, these nodes may be removed.
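As a standalone sanity check of the approach (a sketch with hypothetical helper names, not part of the patch), the rotate special case A == B can be compared exhaustively against a reference 64-bit rotate on the host. Note that the Amt == 32 case degenerates into a plain swap of the two 32-bit halves, matching the rotl64_32_imm and rotr64_32_imm tests above.

```cpp
#include <cassert>
#include <cstdint>

// Models shf.l.wrap.b32: 32-bit funnel shift left of {Hi:Lo} by Amt mod 32.
static uint32_t shf_l_wrap(uint32_t Hi, uint32_t Lo, unsigned Amt) {
  Amt &= 31;
  return Amt ? (Hi << Amt) | (Lo >> (32 - Amt)) : Hi;
}

// Reference 64-bit rotate left.
static uint64_t rotl64_ref(uint64_t V, unsigned Amt) {
  Amt &= 63;
  return Amt ? (V << Amt) | (V >> (64 - Amt)) : V;
}

// Rotate left decomposed into two 32-bit funnel shifts, as in expandFSH64
// with A == B; for Amt >= 32 the roles of the halves swap.
static uint64_t rotl64_decomposed(uint64_t V, unsigned Amt) {
  Amt &= 63;
  const uint32_t Lo = uint32_t(V), Hi = uint32_t(V >> 32);
  const uint32_t H = Amt < 32 ? Hi : Lo;
  const uint32_t L = Amt < 32 ? Lo : Hi;
  const uint32_t RHi = shf_l_wrap(H, L, Amt);
  const uint32_t RLo = shf_l_wrap(L, H, Amt);
  return (uint64_t(RHi) << 32) | RLo;
}

int main() {
  const uint64_t V = 0x0123456789ABCDEFULL;
  // Amounts above 63 exercise the modulo-64 reduction as well.
  for (unsigned Amt = 0; Amt < 128; ++Amt)
    assert(rotl64_decomposed(V, Amt) == rotl64_ref(V, Amt));
  return 0;
}
```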