Skip to content

Commit 6fd11ba

Browse files
committed
[NVPTX] Optimize v2x16 BUILD_VECTORs to PRMT
When two 16-bit values are combined into a v2x16 vector, and those values are truncated from 32-bit values, a PRMT instruction can save registers by selecting bytes directly from the original 32-bit values. We do this during a post-legalize DAG combine, as these opportunities are typically only exposed after the BUILD_VECTOR's operands have been legalized. Additionally, if the 32-bit values are right-shifted, we can fold in the shift by selecting higher bytes with PRMT. Only logical right-shifts by 16 are supported (for now) since those are the only situations seen in practice. Right shifts by 16 often come up during the legalization of EXTRACT_VECTOR_ELT.
1 parent 18be88e commit 6fd11ba

File tree

6 files changed

+95
-71
lines changed

6 files changed

+95
-71
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -762,7 +762,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
762762
// We have some custom DAG combine patterns for these nodes
763763
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
764764
ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
765-
ISD::VSELECT});
765+
ISD::VSELECT, ISD::BUILD_VECTOR});
766766

767767
// setcc for f16x2 and bf16x2 needs special handling to prevent
768768
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -6176,6 +6176,57 @@ static SDValue PerformLOADCombine(SDNode *N,
61766176
DL);
61776177
}
61786178

6179+
/// Combine a v2x16 BUILD_VECTOR whose two operands are i16 values truncated
/// from i32 values into a single NVPTXISD::PRMT that picks bytes straight out
/// of the i32 sources, followed by a BITCAST of the v4i8 result back to the
/// BUILD_VECTOR's type.
///
/// \param N   The BUILD_VECTOR node being combined.
/// \param DCI Combiner state; supplies the DAG and the current combine phase.
/// \returns The replacement value, or an empty SDValue when the combine does
///          not apply (before DAG legalization, non-v2x16 result type, or an
///          operand that is not an i16 TRUNCATE of an i32).
static SDValue
6180+
PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
6181+
auto VT = N->getValueType(0);
6182+
// Bail out unless we are in the post-legalize combine and the result is one
// of the v2x16 vector types.
if (!DCI.isAfterLegalizeDAG() || !Isv2x16VT(VT))
6183+
return SDValue();
6184+
6185+
auto Op0 = N->getOperand(0);
6186+
auto Op1 = N->getOperand(1);
6187+
6188+
// Start out by assuming we want to take the lower 2 bytes of each i32
6189+
// operand.
6190+
// Each hex digit is a byte index into the PRMT inputs: 0x10 names bytes 1,0
// (low half of the first input) and 0x54 names bytes 5,4 (low half of the
// second input).
uint64_t Op0Bytes = 0x10;
6191+
uint64_t Op1Bytes = 0x54;
6192+
6193+
// Pair each operand with its selector so the loop below can rewrite both in
// place through the pointers.
std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
6194+
{&Op1, &Op1Bytes}};
6195+
6196+
// Check that each operand is an i16, truncated from an i32 operand. We'll
6197+
// select individual bytes from those original operands. Optionally, fold in a
6198+
// shift right of that original operand.
6199+
for (auto &[Op, OpBytes] : OpData) {
6200+
// Eat up any bitcast
6201+
// Only a single BITCAST layer is looked through here.
if (Op->getOpcode() == ISD::BITCAST)
6202+
*Op = Op->getOperand(0);
6203+
6204+
// Give up on the whole combine if either operand fails the pattern; no
// partial rewrite is attempted.
if (Op->getValueType() != MVT::i16 || Op->getOpcode() != ISD::TRUNCATE ||
6205+
Op->getOperand(0).getValueType() != MVT::i32)
6206+
return SDValue();
6207+
6208+
// Step past the TRUNCATE so Op now refers to the i32 source value.
*Op = Op->getOperand(0);
6209+
6210+
// Optionally, fold in a shift-right of the original operand and permute
6211+
// the two higher bytes from the shifted operand
6212+
// An SRL by any constant other than 16 (or by a non-constant) is left in
// place; the PRMT then reads the low two bytes of the shifted value.
if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
6213+
if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
6214+
// Adding 2 to each selector digit moves the selection from bytes 0-1 to
// bytes 2-3 of the same input (0x10 -> 0x32, 0x54 -> 0x76), i.e. the
// half that the shift by 16 would have brought down.
*OpBytes += 0x22;
6215+
*Op = Op->getOperand(0);
6216+
}
6217+
}
6218+
}
6219+
6220+
SDLoc DL(N);
6221+
auto &DAG = DCI.DAG;
6222+
6223+
// Build the PRMT over the two i32 sources. The selector places Op1Bytes in
// the upper two hex digits and Op0Bytes in the lower two; the result is
// produced as v4i8 and bitcast back to the original v2x16 type below.
auto PRMT = DAG.getNode(
6224+
NVPTXISD::PRMT, DL, MVT::v4i8,
6225+
{Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32),
6226+
DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
6227+
return DAG.getNode(ISD::BITCAST, DL, VT, PRMT);
6228+
}
6229+
61796230
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
61806231
DAGCombinerInfo &DCI) const {
61816232
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6210,6 +6261,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
62106261
return PerformEXTRACTCombine(N, DCI);
62116262
case ISD::VSELECT:
62126263
return PerformVSELECTCombine(N, DCI);
6264+
case ISD::BUILD_VECTOR:
6265+
return PerformBUILD_VECTORCombine(N, DCI);
62136266
}
62146267
return SDValue();
62156268
}

llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Lines changed: 15 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,8 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
159159
; SM70-LABEL: test_faddx2(
160160
; SM70: {
161161
; SM70-NEXT: .reg .pred %p<3>;
162-
; SM70-NEXT: .reg .b16 %rs<13>;
163-
; SM70-NEXT: .reg .b32 %r<24>;
162+
; SM70-NEXT: .reg .b16 %rs<9>;
163+
; SM70-NEXT: .reg .b32 %r<25>;
164164
; SM70-NEXT: .reg .f32 %f<7>;
165165
; SM70-EMPTY:
166166
; SM70-NEXT: // %bb.0:
@@ -182,7 +182,6 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
182182
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
183183
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
184184
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
185-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
186185
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
187186
; SM70-NEXT: shl.b32 %r14, %r13, 16;
188187
; SM70-NEXT: mov.b32 %f4, %r14;
@@ -197,8 +196,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
197196
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
198197
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
199198
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
200-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
201-
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
199+
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
202200
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
203201
; SM70-NEXT: ret;
204202
;
@@ -266,8 +264,8 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
266264
; SM70-LABEL: test_fsubx2(
267265
; SM70: {
268266
; SM70-NEXT: .reg .pred %p<3>;
269-
; SM70-NEXT: .reg .b16 %rs<13>;
270-
; SM70-NEXT: .reg .b32 %r<24>;
267+
; SM70-NEXT: .reg .b16 %rs<9>;
268+
; SM70-NEXT: .reg .b32 %r<25>;
271269
; SM70-NEXT: .reg .f32 %f<7>;
272270
; SM70-EMPTY:
273271
; SM70-NEXT: // %bb.0:
@@ -289,7 +287,6 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
289287
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
290288
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
291289
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
292-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
293290
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
294291
; SM70-NEXT: shl.b32 %r14, %r13, 16;
295292
; SM70-NEXT: mov.b32 %f4, %r14;
@@ -304,8 +301,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
304301
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
305302
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
306303
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
307-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
308-
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
304+
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
309305
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
310306
; SM70-NEXT: ret;
311307
;
@@ -373,8 +369,8 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
373369
; SM70-LABEL: test_fmulx2(
374370
; SM70: {
375371
; SM70-NEXT: .reg .pred %p<3>;
376-
; SM70-NEXT: .reg .b16 %rs<13>;
377-
; SM70-NEXT: .reg .b32 %r<24>;
372+
; SM70-NEXT: .reg .b16 %rs<9>;
373+
; SM70-NEXT: .reg .b32 %r<25>;
378374
; SM70-NEXT: .reg .f32 %f<7>;
379375
; SM70-EMPTY:
380376
; SM70-NEXT: // %bb.0:
@@ -396,7 +392,6 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
396392
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
397393
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
398394
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
399-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
400395
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
401396
; SM70-NEXT: shl.b32 %r14, %r13, 16;
402397
; SM70-NEXT: mov.b32 %f4, %r14;
@@ -411,8 +406,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
411406
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
412407
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
413408
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
414-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
415-
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
409+
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
416410
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
417411
; SM70-NEXT: ret;
418412
;
@@ -480,8 +474,8 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
480474
; SM70-LABEL: test_fdiv(
481475
; SM70: {
482476
; SM70-NEXT: .reg .pred %p<3>;
483-
; SM70-NEXT: .reg .b16 %rs<13>;
484-
; SM70-NEXT: .reg .b32 %r<24>;
477+
; SM70-NEXT: .reg .b16 %rs<9>;
478+
; SM70-NEXT: .reg .b32 %r<25>;
485479
; SM70-NEXT: .reg .f32 %f<7>;
486480
; SM70-EMPTY:
487481
; SM70-NEXT: // %bb.0:
@@ -503,7 +497,6 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
503497
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
504498
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
505499
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
506-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
507500
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
508501
; SM70-NEXT: shl.b32 %r14, %r13, 16;
509502
; SM70-NEXT: mov.b32 %f4, %r14;
@@ -518,8 +511,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
518511
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
519512
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
520513
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
521-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
522-
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
514+
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
523515
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
524516
; SM70-NEXT: ret;
525517
;
@@ -1724,8 +1716,8 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
17241716
; SM70-LABEL: test_maxnum_v2(
17251717
; SM70: {
17261718
; SM70-NEXT: .reg .pred %p<3>;
1727-
; SM70-NEXT: .reg .b16 %rs<13>;
1728-
; SM70-NEXT: .reg .b32 %r<24>;
1719+
; SM70-NEXT: .reg .b16 %rs<9>;
1720+
; SM70-NEXT: .reg .b32 %r<25>;
17291721
; SM70-NEXT: .reg .f32 %f<7>;
17301722
; SM70-EMPTY:
17311723
; SM70-NEXT: // %bb.0:
@@ -1747,7 +1739,6 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
17471739
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
17481740
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
17491741
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
1750-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
17511742
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
17521743
; SM70-NEXT: shl.b32 %r14, %r13, 16;
17531744
; SM70-NEXT: mov.b32 %f4, %r14;
@@ -1762,8 +1753,7 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
17621753
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
17631754
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
17641755
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
1765-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
1766-
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
1756+
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
17671757
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
17681758
; SM70-NEXT: ret;
17691759
;

llvm/test/CodeGen/NVPTX/fma-relu-contract.ll

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,8 +1050,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
10501050
; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
10511051
; CHECK-SM70: {
10521052
; CHECK-SM70-NEXT: .reg .pred %p<9>;
1053-
; CHECK-SM70-NEXT: .reg .b16 %rs<25>;
1054-
; CHECK-SM70-NEXT: .reg .b32 %r<61>;
1053+
; CHECK-SM70-NEXT: .reg .b16 %rs<21>;
1054+
; CHECK-SM70-NEXT: .reg .b32 %r<62>;
10551055
; CHECK-SM70-NEXT: .reg .f32 %f<19>;
10561056
; CHECK-SM70-EMPTY:
10571057
; CHECK-SM70-NEXT: // %bb.0:
@@ -1134,7 +1134,6 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
11341134
; CHECK-SM70-NEXT: setp.nan.f32 %p7, %f15, %f15;
11351135
; CHECK-SM70-NEXT: or.b32 %r49, %r45, 4194304;
11361136
; CHECK-SM70-NEXT: selp.b32 %r50, %r49, %r48, %p7;
1137-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r50; }
11381137
; CHECK-SM70-NEXT: cvt.u32.u16 %r51, %rs17;
11391138
; CHECK-SM70-NEXT: shl.b32 %r52, %r51, 16;
11401139
; CHECK-SM70-NEXT: mov.b32 %f16, %r52;
@@ -1148,8 +1147,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
11481147
; CHECK-SM70-NEXT: setp.nan.f32 %p8, %f18, %f18;
11491148
; CHECK-SM70-NEXT: or.b32 %r58, %r54, 4194304;
11501149
; CHECK-SM70-NEXT: selp.b32 %r59, %r58, %r57, %p8;
1151-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs23}, %r59; }
1152-
; CHECK-SM70-NEXT: mov.b32 %r60, {%rs23, %rs20};
1150+
; CHECK-SM70-NEXT: prmt.b32 %r60, %r59, %r50, 0x7632U;
11531151
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r60;
11541152
; CHECK-SM70-NEXT: ret;
11551153
%1 = fmul <2 x bfloat> %a, %b
@@ -1189,8 +1187,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
11891187
; CHECK-SM70-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
11901188
; CHECK-SM70: {
11911189
; CHECK-SM70-NEXT: .reg .pred %p<5>;
1192-
; CHECK-SM70-NEXT: .reg .b16 %rs<17>;
1193-
; CHECK-SM70-NEXT: .reg .b32 %r<43>;
1190+
; CHECK-SM70-NEXT: .reg .b16 %rs<13>;
1191+
; CHECK-SM70-NEXT: .reg .b32 %r<44>;
11941192
; CHECK-SM70-NEXT: .reg .f32 %f<13>;
11951193
; CHECK-SM70-EMPTY:
11961194
; CHECK-SM70-NEXT: // %bb.0:
@@ -1244,7 +1242,6 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
12441242
; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f10, %f10;
12451243
; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
12461244
; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
1247-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r34; }
12481245
; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
12491246
; CHECK-SM70-NEXT: mov.b32 %f11, %r35;
12501247
; CHECK-SM70-NEXT: max.f32 %f12, %f11, 0f00000000;
@@ -1255,8 +1252,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
12551252
; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f12, %f12;
12561253
; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
12571254
; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
1258-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r41; }
1259-
; CHECK-SM70-NEXT: mov.b32 %r42, {%rs15, %rs13};
1255+
; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U;
12601256
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42;
12611257
; CHECK-SM70-NEXT: ret;
12621258
%1 = fmul <2 x bfloat> %a, %b

llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -715,8 +715,8 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
715715
; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
716716
; CHECK-SM70: {
717717
; CHECK-SM70-NEXT: .reg .pred %p<7>;
718-
; CHECK-SM70-NEXT: .reg .b16 %rs<17>;
719-
; CHECK-SM70-NEXT: .reg .b32 %r<57>;
718+
; CHECK-SM70-NEXT: .reg .b16 %rs<13>;
719+
; CHECK-SM70-NEXT: .reg .b32 %r<58>;
720720
; CHECK-SM70-NEXT: .reg .f32 %f<17>;
721721
; CHECK-SM70-EMPTY:
722722
; CHECK-SM70-NEXT: // %bb.0:
@@ -790,7 +790,6 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
790790
; CHECK-SM70-NEXT: setp.nan.f32 %p5, %f14, %f14;
791791
; CHECK-SM70-NEXT: or.b32 %r47, %r43, 4194304;
792792
; CHECK-SM70-NEXT: selp.b32 %r48, %r47, %r46, %p5;
793-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r48; }
794793
; CHECK-SM70-NEXT: and.b32 %r49, %r34, -65536;
795794
; CHECK-SM70-NEXT: mov.b32 %f15, %r49;
796795
; CHECK-SM70-NEXT: add.f32 %f16, %f15, %f9;
@@ -801,8 +800,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
801800
; CHECK-SM70-NEXT: setp.nan.f32 %p6, %f16, %f16;
802801
; CHECK-SM70-NEXT: or.b32 %r54, %r50, 4194304;
803802
; CHECK-SM70-NEXT: selp.b32 %r55, %r54, %r53, %p6;
804-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r55; }
805-
; CHECK-SM70-NEXT: mov.b32 %r56, {%rs15, %rs13};
803+
; CHECK-SM70-NEXT: prmt.b32 %r56, %r55, %r48, 0x7632U;
806804
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r56;
807805
; CHECK-SM70-NEXT: ret;
808806
%1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
@@ -841,8 +839,8 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
841839
; CHECK-SM70-LABEL: fma_bf16x2_maxnum_no_nans(
842840
; CHECK-SM70: {
843841
; CHECK-SM70-NEXT: .reg .pred %p<5>;
844-
; CHECK-SM70-NEXT: .reg .b16 %rs<17>;
845-
; CHECK-SM70-NEXT: .reg .b32 %r<43>;
842+
; CHECK-SM70-NEXT: .reg .b16 %rs<13>;
843+
; CHECK-SM70-NEXT: .reg .b32 %r<44>;
846844
; CHECK-SM70-NEXT: .reg .f32 %f<13>;
847845
; CHECK-SM70-EMPTY:
848846
; CHECK-SM70-NEXT: // %bb.0:
@@ -896,7 +894,6 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
896894
; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f10, %f10;
897895
; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
898896
; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
899-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r34; }
900897
; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
901898
; CHECK-SM70-NEXT: mov.b32 %f11, %r35;
902899
; CHECK-SM70-NEXT: max.f32 %f12, %f11, 0f00000000;
@@ -907,8 +904,7 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
907904
; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f12, %f12;
908905
; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
909906
; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
910-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r41; }
911-
; CHECK-SM70-NEXT: mov.b32 %r42, {%rs15, %rs13};
907+
; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U;
912908
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42;
913909
; CHECK-SM70-NEXT: ret;
914910
%1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)

0 commit comments

Comments
 (0)