Skip to content

[NVPTX] Optimize v2x16 BUILD_VECTORs to PRMT #116675

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 63 additions & 1 deletion llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -767,7 +767,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
ISD::VSELECT});
ISD::VSELECT, ISD::BUILD_VECTOR});

// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
Expand Down Expand Up @@ -6184,6 +6184,66 @@ static SDValue PerformLOADCombine(SDNode *N,
DL);
}

/// Try to rewrite a v2x16 BUILD_VECTOR as a single NVPTXISD::PRMT.
///
/// The combine only runs after DAG legalization and only when the result type
/// satisfies Isv2x16VT. Both operands must be (optionally bitcast) single-use
/// i16 truncates of i32 values; a constant logical shift right by 16 feeding
/// the truncate is folded into the byte selector. On success the result is a
/// v4i8 PRMT of the two original i32 values, bitcast back to the BUILD_VECTOR
/// type. Returns an empty SDValue when the pattern does not match.
static SDValue
PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  const auto ResultVT = N->getValueType(0);
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();
  if (!Isv2x16VT(ResultVT))
    return SDValue();

  // Walks one BUILD_VECTOR operand back to the i32 value it was truncated
  // from, updating Src in place. Selector is bumped when a shift right by 16
  // is absorbed. Returns false if the operand does not fit the pattern.
  auto FoldOperand = [](SDValue &Src, uint64_t &Selector) -> bool {
    // Look through a bitcast, if there is one.
    if (Src.getOpcode() == ISD::BITCAST)
      Src = Src.getOperand(0);

    // We need an i16 that was produced by truncating an i32.
    if (Src.getValueType() != MVT::i16 || Src.getOpcode() != ISD::TRUNCATE ||
        Src.getOperand(0).getValueType() != MVT::i32)
      return false;

    // A truncate with other users stays alive regardless, so rewriting it
    // here could increase register pressure.
    if (!Src.hasOneUse())
      return false;

    Src = Src.getOperand(0);

    // The remaining step is optional: absorb (srl X, 16) so that the permute
    // reads the two upper bytes of X directly.
    if (Src.getOpcode() != ISD::SRL)
      return true;
    if (!isa<ConstantSDNode>(Src.getOperand(1)))
      return true;
    if (cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue() != 16)
      return true;

    // Move the selector from the lower byte pair to the upper byte pair of
    // the same source: 0x10 -> 0x32, 0x54 -> 0x76.
    assert((Selector == 0x10 || Selector == 0x54) &&
           "PRMT selector values out of range");
    Selector += 0x22;
    Src = Src.getOperand(0);
    return true;
  };

  SDValue FirstSrc = N->getOperand(0);
  SDValue SecondSrc = N->getOperand(1);

  // Initially each lane takes the lower two bytes of its i32 source.
  uint64_t FirstSelector = 0x10;
  uint64_t SecondSelector = 0x54;

  if (!FoldOperand(FirstSrc, FirstSelector) ||
      !FoldOperand(SecondSrc, SecondSelector))
    return SDValue();

  SDLoc DL(N);
  auto &DAG = DCI.DAG;

  // The second operand's byte pair goes in the selector bits above the first
  // operand's byte pair.
  const uint64_t CombinedSelector = (SecondSelector << 8) | FirstSelector;
  auto SelectorNode = DAG.getConstant(CombinedSelector, DL, MVT::i32);
  auto ModeNode = DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32);

  auto PRMT = DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8,
                          {FirstSrc, SecondSrc, SelectorNode, ModeNode});
  return DAG.getNode(ISD::BITCAST, DL, ResultVT, PRMT);
}

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
Expand Down Expand Up @@ -6218,6 +6278,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformEXTRACTCombine(N, DCI);
case ISD::VSELECT:
return PerformVSELECTCombine(N, DCI);
case ISD::BUILD_VECTOR:
return PerformBUILD_VECTORCombine(N, DCI);
}
return SDValue();
}
Expand Down
40 changes: 15 additions & 25 deletions llvm/test/CodeGen/NVPTX/bf16-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-LABEL: test_faddx2(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<13>;
; SM70-NEXT: .reg .b32 %r<24>;
; SM70-NEXT: .reg .b16 %rs<9>;
; SM70-NEXT: .reg .b32 %r<25>;
; SM70-NEXT: .reg .f32 %f<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
Expand All @@ -182,7 +182,6 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: shl.b32 %r14, %r13, 16;
; SM70-NEXT: mov.b32 %f4, %r14;
Expand All @@ -197,8 +196,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -262,8 +260,8 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-LABEL: test_fsubx2(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<13>;
; SM70-NEXT: .reg .b32 %r<24>;
; SM70-NEXT: .reg .b16 %rs<9>;
; SM70-NEXT: .reg .b32 %r<25>;
; SM70-NEXT: .reg .f32 %f<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
Expand All @@ -285,7 +283,6 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: shl.b32 %r14, %r13, 16;
; SM70-NEXT: mov.b32 %f4, %r14;
Expand All @@ -300,8 +297,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -365,8 +361,8 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-LABEL: test_fmulx2(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<13>;
; SM70-NEXT: .reg .b32 %r<24>;
; SM70-NEXT: .reg .b16 %rs<9>;
; SM70-NEXT: .reg .b32 %r<25>;
; SM70-NEXT: .reg .f32 %f<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
Expand All @@ -388,7 +384,6 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: shl.b32 %r14, %r13, 16;
; SM70-NEXT: mov.b32 %f4, %r14;
Expand All @@ -403,8 +398,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -468,8 +462,8 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-LABEL: test_fdiv(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<13>;
; SM70-NEXT: .reg .b32 %r<24>;
; SM70-NEXT: .reg .b16 %rs<9>;
; SM70-NEXT: .reg .b32 %r<25>;
; SM70-NEXT: .reg .f32 %f<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
Expand All @@ -491,7 +485,6 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: shl.b32 %r14, %r13, 16;
; SM70-NEXT: mov.b32 %f4, %r14;
Expand All @@ -506,8 +499,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
; SM70-NEXT: ret;
;
Expand Down Expand Up @@ -1706,8 +1698,8 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM70-LABEL: test_maxnum_v2(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<13>;
; SM70-NEXT: .reg .b32 %r<24>;
; SM70-NEXT: .reg .b16 %rs<9>;
; SM70-NEXT: .reg .b32 %r<25>;
; SM70-NEXT: .reg .f32 %f<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
Expand All @@ -1729,7 +1721,6 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: shl.b32 %r14, %r13, 16;
; SM70-NEXT: mov.b32 %f4, %r14;
Expand All @@ -1744,8 +1735,7 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
; SM70-NEXT: ret;
;
Expand Down
16 changes: 6 additions & 10 deletions llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1046,8 +1046,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<9>;
; CHECK-SM70-NEXT: .reg .b16 %rs<25>;
; CHECK-SM70-NEXT: .reg .b32 %r<61>;
; CHECK-SM70-NEXT: .reg .b16 %rs<21>;
; CHECK-SM70-NEXT: .reg .b32 %r<62>;
; CHECK-SM70-NEXT: .reg .f32 %f<19>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
Expand Down Expand Up @@ -1130,7 +1130,6 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
; CHECK-SM70-NEXT: setp.nan.f32 %p7, %f15, %f15;
; CHECK-SM70-NEXT: or.b32 %r49, %r45, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r50, %r49, %r48, %p7;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r50; }
; CHECK-SM70-NEXT: cvt.u32.u16 %r51, %rs17;
; CHECK-SM70-NEXT: shl.b32 %r52, %r51, 16;
; CHECK-SM70-NEXT: mov.b32 %f16, %r52;
Expand All @@ -1144,8 +1143,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
; CHECK-SM70-NEXT: setp.nan.f32 %p8, %f18, %f18;
; CHECK-SM70-NEXT: or.b32 %r58, %r54, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r59, %r58, %r57, %p8;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs23}, %r59; }
; CHECK-SM70-NEXT: mov.b32 %r60, {%rs23, %rs20};
Comment on lines -1147 to -1148
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we now able to delete the code that generates this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so because it's still used for (trunc (srl s, 16)) or (extractelt $vec, 1). Perhaps if we generally matched both of those to PRMTs we could remove the code, but I suspect we'll always need the option to fall back to these patterns.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(trunc (srl s, 16)) or (extractelt $vec, 1)

Those should be partially converted to prmt, too. The part that moves bits in multiples of 8 to the LSB of i32 maps to a permute, and the trunc would then just be a regular truncating move. This does not have to be done in this patch, but if the change is trivial, it may fit here, too. Up to you.

; CHECK-SM70-NEXT: prmt.b32 %r60, %r59, %r50, 0x7632U;
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r60;
; CHECK-SM70-NEXT: ret;
%1 = fmul <2 x bfloat> %a, %b
Expand Down Expand Up @@ -1185,8 +1183,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
; CHECK-SM70-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<17>;
; CHECK-SM70-NEXT: .reg .b32 %r<43>;
; CHECK-SM70-NEXT: .reg .b16 %rs<13>;
; CHECK-SM70-NEXT: .reg .b32 %r<44>;
; CHECK-SM70-NEXT: .reg .f32 %f<13>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
Expand Down Expand Up @@ -1240,7 +1238,6 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f10, %f10;
; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r34; }
; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
; CHECK-SM70-NEXT: mov.b32 %f11, %r35;
; CHECK-SM70-NEXT: max.f32 %f12, %f11, 0f00000000;
Expand All @@ -1251,8 +1248,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f12, %f12;
; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r41; }
; CHECK-SM70-NEXT: mov.b32 %r42, {%rs15, %rs13};
; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U;
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42;
; CHECK-SM70-NEXT: ret;
%1 = fmul <2 x bfloat> %a, %b
Expand Down
16 changes: 6 additions & 10 deletions llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
Original file line number Diff line number Diff line change
Expand Up @@ -711,8 +711,8 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<7>;
; CHECK-SM70-NEXT: .reg .b16 %rs<17>;
; CHECK-SM70-NEXT: .reg .b32 %r<57>;
; CHECK-SM70-NEXT: .reg .b16 %rs<13>;
; CHECK-SM70-NEXT: .reg .b32 %r<58>;
; CHECK-SM70-NEXT: .reg .f32 %f<17>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
Expand Down Expand Up @@ -786,7 +786,6 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
; CHECK-SM70-NEXT: setp.nan.f32 %p5, %f14, %f14;
; CHECK-SM70-NEXT: or.b32 %r47, %r43, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r48, %r47, %r46, %p5;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r48; }
; CHECK-SM70-NEXT: and.b32 %r49, %r34, -65536;
; CHECK-SM70-NEXT: mov.b32 %f15, %r49;
; CHECK-SM70-NEXT: add.f32 %f16, %f15, %f9;
Expand All @@ -797,8 +796,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
; CHECK-SM70-NEXT: setp.nan.f32 %p6, %f16, %f16;
; CHECK-SM70-NEXT: or.b32 %r54, %r50, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r55, %r54, %r53, %p6;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r55; }
; CHECK-SM70-NEXT: mov.b32 %r56, {%rs15, %rs13};
; CHECK-SM70-NEXT: prmt.b32 %r56, %r55, %r48, 0x7632U;
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r56;
; CHECK-SM70-NEXT: ret;
%1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
Expand Down Expand Up @@ -837,8 +835,8 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
; CHECK-SM70-LABEL: fma_bf16x2_maxnum_no_nans(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<17>;
; CHECK-SM70-NEXT: .reg .b32 %r<43>;
; CHECK-SM70-NEXT: .reg .b16 %rs<13>;
; CHECK-SM70-NEXT: .reg .b32 %r<44>;
; CHECK-SM70-NEXT: .reg .f32 %f<13>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
Expand Down Expand Up @@ -892,7 +890,6 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f10, %f10;
; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r34; }
; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
; CHECK-SM70-NEXT: mov.b32 %f11, %r35;
; CHECK-SM70-NEXT: max.f32 %f12, %f11, 0f00000000;
Expand All @@ -903,8 +900,7 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f12, %f12;
; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r41; }
; CHECK-SM70-NEXT: mov.b32 %r42, {%rs15, %rs13};
; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U;
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42;
; CHECK-SM70-NEXT: ret;
%1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
Expand Down
Loading
Loading