Skip to content

Commit a1f5fe8

Browse files
authored
[NVPTX] Optimize v2x16 BUILD_VECTORs to PRMT (llvm#116675)
When two 16-bit values are combined into a v2x16 vector, and those values are truncated from 32-bit values, a PRMT instruction can save registers by selecting bytes directly from the original 32-bit values. We do this during a post-legalize DAG combine, as these opportunities are typically only exposed after the BUILD_VECTOR's operands have been legalized. Additionally, if the 32-bit values are right-shifted, we can fold in the shift by selecting higher bytes with PRMT. Only logical right-shifts by 16 are supported (for now), since those are the only situations seen in practice. Right shifts by 16 often come up during the legalization of EXTRACT_VECTOR_ELT. This idea was brought up in a PR comment by @Artem-B.
1 parent 7c135e1 commit a1f5fe8

File tree

6 files changed

+169
-71
lines changed

6 files changed

+169
-71
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -767,7 +767,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
767767
// We have some custom DAG combine patterns for these nodes
768768
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
769769
ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
770-
ISD::VSELECT});
770+
ISD::VSELECT, ISD::BUILD_VECTOR});
771771

772772
// setcc for f16x2 and bf16x2 needs special handling to prevent
773773
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5120,6 +5120,66 @@ static SDValue PerformLOADCombine(SDNode *N,
51205120
DL);
51215121
}
51225122

5123+
// Target DAG combine for BUILD_VECTOR nodes: when the result type satisfies
// Isv2x16VT and both operands are single-use i16 TRUNCATEs of i32 values,
// replace the node with one NVPTXISD::PRMT that selects the needed bytes
// straight from the i32 sources, bitcast back to the original vector type.
//
// N   - the BUILD_VECTOR node being combined; only operands 0 and 1 are read.
// DCI - combiner state; supplies the SelectionDAG and the current phase.
//
// Returns the replacement value, or an empty SDValue() when the pattern does
// not match (wrong phase, wrong type, an operand that is not an i16 truncate
// of an i32, or a truncate with more than one use).
static SDValue
5124+
PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
5125+
auto VT = N->getValueType(0);
5126+
// Bail out unless the DAG has already been legalized and the result type is
// one accepted by Isv2x16VT.
if (!DCI.isAfterLegalizeDAG() || !Isv2x16VT(VT))
5127+
return SDValue();
5128+
5129+
// Local copies of the operands; the loop below rewrites them in place.
auto Op0 = N->getOperand(0);
5130+
auto Op1 = N->getOperand(1);
5131+
5132+
// Start out by assuming we want to take the lower 2 bytes of each i32
5133+
// operand.
5134+
uint64_t Op0Bytes = 0x10;
5135+
uint64_t Op1Bytes = 0x54;
5136+
5137+
// Pair each operand with its selector byte so one loop can update both
// through the pointers.
std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
5138+
{&Op1, &Op1Bytes}};
5139+
5140+
// Check that each operand is an i16, truncated from an i32 operand. We'll
5141+
// select individual bytes from those original operands. Optionally, fold in a
5142+
// shift right of that original operand.
5143+
for (auto &[Op, OpBytes] : OpData) {
5144+
// Eat up any bitcast
5145+
// (only a single BITCAST level is stripped).
if (Op->getOpcode() == ISD::BITCAST)
5146+
*Op = Op->getOperand(0);
5147+
5148+
// Anything other than an i16 TRUNCATE of an i32 value aborts the combine.
if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
5149+
Op->getOperand(0).getValueType() == MVT::i32))
5150+
return SDValue();
5151+
5152+
// If the truncate has multiple uses, this optimization can increase
5153+
// register pressure
5154+
if (!Op->hasOneUse())
5155+
return SDValue();
5156+
5157+
// Step through the truncate to the i32 value it was derived from.
*Op = Op->getOperand(0);
5158+
5159+
// Optionally, fold in a shift-right of the original operand and let permute
5160+
// pick the two higher bytes of the original value directly.
5161+
if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
5162+
// Only a shift amount of exactly 16 is folded. For any other constant, Op
// stays the SRL node and its low two bytes are selected.
if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
5163+
// Shift the PRMT byte selector to pick upper bytes from each respective
5164+
// value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
5165+
assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
5166+
"PRMT selector values out of range");
5167+
*OpBytes += 0x22;
5168+
// Use the value feeding the shift as the PRMT source.
*Op = Op->getOperand(0);
5169+
}
5170+
}
5171+
}
5172+
5173+
SDLoc DL(N);
5174+
auto &DAG = DCI.DAG;
5175+
5176+
// Op0's selector occupies the low byte of the immediate and Op1's the next
// byte: 0x5410 when no shift was folded, 0x7632 when both were.
// NOTE(review): selector digits 0-3 presumably index bytes of the first PRMT
// source and 4-7 bytes of the second, per PTX prmt default mode; confirm
// against the PTX ISA.
auto PRMT = DAG.getNode(
5177+
NVPTXISD::PRMT, DL, MVT::v4i8,
5178+
{Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32),
5179+
DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
5180+
// The PRMT result is typed v4i8; reinterpret it as the original vector type.
return DAG.getNode(ISD::BITCAST, DL, VT, PRMT);
5181+
}
5182+
51235183
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
51245184
DAGCombinerInfo &DCI) const {
51255185
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -5154,6 +5214,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
51545214
return PerformEXTRACTCombine(N, DCI);
51555215
case ISD::VSELECT:
51565216
return PerformVSELECTCombine(N, DCI);
5217+
case ISD::BUILD_VECTOR:
5218+
return PerformBUILD_VECTORCombine(N, DCI);
51575219
}
51585220
return SDValue();
51595221
}

llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Lines changed: 15 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,8 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
159159
; SM70-LABEL: test_faddx2(
160160
; SM70: {
161161
; SM70-NEXT: .reg .pred %p<3>;
162-
; SM70-NEXT: .reg .b16 %rs<13>;
163-
; SM70-NEXT: .reg .b32 %r<24>;
162+
; SM70-NEXT: .reg .b16 %rs<9>;
163+
; SM70-NEXT: .reg .b32 %r<25>;
164164
; SM70-NEXT: .reg .f32 %f<7>;
165165
; SM70-EMPTY:
166166
; SM70-NEXT: // %bb.0:
@@ -182,7 +182,6 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
182182
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
183183
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
184184
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
185-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
186185
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
187186
; SM70-NEXT: shl.b32 %r14, %r13, 16;
188187
; SM70-NEXT: mov.b32 %f4, %r14;
@@ -197,8 +196,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
197196
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
198197
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
199198
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
200-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
201-
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
199+
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
202200
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
203201
; SM70-NEXT: ret;
204202
;
@@ -262,8 +260,8 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
262260
; SM70-LABEL: test_fsubx2(
263261
; SM70: {
264262
; SM70-NEXT: .reg .pred %p<3>;
265-
; SM70-NEXT: .reg .b16 %rs<13>;
266-
; SM70-NEXT: .reg .b32 %r<24>;
263+
; SM70-NEXT: .reg .b16 %rs<9>;
264+
; SM70-NEXT: .reg .b32 %r<25>;
267265
; SM70-NEXT: .reg .f32 %f<7>;
268266
; SM70-EMPTY:
269267
; SM70-NEXT: // %bb.0:
@@ -285,7 +283,6 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
285283
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
286284
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
287285
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
288-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
289286
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
290287
; SM70-NEXT: shl.b32 %r14, %r13, 16;
291288
; SM70-NEXT: mov.b32 %f4, %r14;
@@ -300,8 +297,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
300297
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
301298
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
302299
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
303-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
304-
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
300+
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
305301
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
306302
; SM70-NEXT: ret;
307303
;
@@ -365,8 +361,8 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
365361
; SM70-LABEL: test_fmulx2(
366362
; SM70: {
367363
; SM70-NEXT: .reg .pred %p<3>;
368-
; SM70-NEXT: .reg .b16 %rs<13>;
369-
; SM70-NEXT: .reg .b32 %r<24>;
364+
; SM70-NEXT: .reg .b16 %rs<9>;
365+
; SM70-NEXT: .reg .b32 %r<25>;
370366
; SM70-NEXT: .reg .f32 %f<7>;
371367
; SM70-EMPTY:
372368
; SM70-NEXT: // %bb.0:
@@ -388,7 +384,6 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
388384
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
389385
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
390386
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
391-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
392387
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
393388
; SM70-NEXT: shl.b32 %r14, %r13, 16;
394389
; SM70-NEXT: mov.b32 %f4, %r14;
@@ -403,8 +398,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
403398
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
404399
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
405400
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
406-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
407-
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
401+
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
408402
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
409403
; SM70-NEXT: ret;
410404
;
@@ -468,8 +462,8 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
468462
; SM70-LABEL: test_fdiv(
469463
; SM70: {
470464
; SM70-NEXT: .reg .pred %p<3>;
471-
; SM70-NEXT: .reg .b16 %rs<13>;
472-
; SM70-NEXT: .reg .b32 %r<24>;
465+
; SM70-NEXT: .reg .b16 %rs<9>;
466+
; SM70-NEXT: .reg .b32 %r<25>;
473467
; SM70-NEXT: .reg .f32 %f<7>;
474468
; SM70-EMPTY:
475469
; SM70-NEXT: // %bb.0:
@@ -491,7 +485,6 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
491485
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
492486
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
493487
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
494-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
495488
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
496489
; SM70-NEXT: shl.b32 %r14, %r13, 16;
497490
; SM70-NEXT: mov.b32 %f4, %r14;
@@ -506,8 +499,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
506499
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
507500
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
508501
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
509-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
510-
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
502+
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
511503
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
512504
; SM70-NEXT: ret;
513505
;
@@ -1706,8 +1698,8 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
17061698
; SM70-LABEL: test_maxnum_v2(
17071699
; SM70: {
17081700
; SM70-NEXT: .reg .pred %p<3>;
1709-
; SM70-NEXT: .reg .b16 %rs<13>;
1710-
; SM70-NEXT: .reg .b32 %r<24>;
1701+
; SM70-NEXT: .reg .b16 %rs<9>;
1702+
; SM70-NEXT: .reg .b32 %r<25>;
17111703
; SM70-NEXT: .reg .f32 %f<7>;
17121704
; SM70-EMPTY:
17131705
; SM70-NEXT: // %bb.0:
@@ -1729,7 +1721,6 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
17291721
; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
17301722
; SM70-NEXT: or.b32 %r11, %r7, 4194304;
17311723
; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
1732-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
17331724
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
17341725
; SM70-NEXT: shl.b32 %r14, %r13, 16;
17351726
; SM70-NEXT: mov.b32 %f4, %r14;
@@ -1744,8 +1735,7 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
17441735
; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
17451736
; SM70-NEXT: or.b32 %r21, %r17, 4194304;
17461737
; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
1747-
; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
1748-
; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
1738+
; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
17491739
; SM70-NEXT: st.param.b32 [func_retval0], %r23;
17501740
; SM70-NEXT: ret;
17511741
;

llvm/test/CodeGen/NVPTX/fma-relu-contract.ll

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,8 +1046,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
10461046
; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
10471047
; CHECK-SM70: {
10481048
; CHECK-SM70-NEXT: .reg .pred %p<9>;
1049-
; CHECK-SM70-NEXT: .reg .b16 %rs<25>;
1050-
; CHECK-SM70-NEXT: .reg .b32 %r<61>;
1049+
; CHECK-SM70-NEXT: .reg .b16 %rs<21>;
1050+
; CHECK-SM70-NEXT: .reg .b32 %r<62>;
10511051
; CHECK-SM70-NEXT: .reg .f32 %f<19>;
10521052
; CHECK-SM70-EMPTY:
10531053
; CHECK-SM70-NEXT: // %bb.0:
@@ -1130,7 +1130,6 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
11301130
; CHECK-SM70-NEXT: setp.nan.f32 %p7, %f15, %f15;
11311131
; CHECK-SM70-NEXT: or.b32 %r49, %r45, 4194304;
11321132
; CHECK-SM70-NEXT: selp.b32 %r50, %r49, %r48, %p7;
1133-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r50; }
11341133
; CHECK-SM70-NEXT: cvt.u32.u16 %r51, %rs17;
11351134
; CHECK-SM70-NEXT: shl.b32 %r52, %r51, 16;
11361135
; CHECK-SM70-NEXT: mov.b32 %f16, %r52;
@@ -1144,8 +1143,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
11441143
; CHECK-SM70-NEXT: setp.nan.f32 %p8, %f18, %f18;
11451144
; CHECK-SM70-NEXT: or.b32 %r58, %r54, 4194304;
11461145
; CHECK-SM70-NEXT: selp.b32 %r59, %r58, %r57, %p8;
1147-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs23}, %r59; }
1148-
; CHECK-SM70-NEXT: mov.b32 %r60, {%rs23, %rs20};
1146+
; CHECK-SM70-NEXT: prmt.b32 %r60, %r59, %r50, 0x7632U;
11491147
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r60;
11501148
; CHECK-SM70-NEXT: ret;
11511149
%1 = fmul <2 x bfloat> %a, %b
@@ -1185,8 +1183,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
11851183
; CHECK-SM70-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
11861184
; CHECK-SM70: {
11871185
; CHECK-SM70-NEXT: .reg .pred %p<5>;
1188-
; CHECK-SM70-NEXT: .reg .b16 %rs<17>;
1189-
; CHECK-SM70-NEXT: .reg .b32 %r<43>;
1186+
; CHECK-SM70-NEXT: .reg .b16 %rs<13>;
1187+
; CHECK-SM70-NEXT: .reg .b32 %r<44>;
11901188
; CHECK-SM70-NEXT: .reg .f32 %f<13>;
11911189
; CHECK-SM70-EMPTY:
11921190
; CHECK-SM70-NEXT: // %bb.0:
@@ -1240,7 +1238,6 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
12401238
; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f10, %f10;
12411239
; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
12421240
; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
1243-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r34; }
12441241
; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
12451242
; CHECK-SM70-NEXT: mov.b32 %f11, %r35;
12461243
; CHECK-SM70-NEXT: max.f32 %f12, %f11, 0f00000000;
@@ -1251,8 +1248,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
12511248
; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f12, %f12;
12521249
; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
12531250
; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
1254-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r41; }
1255-
; CHECK-SM70-NEXT: mov.b32 %r42, {%rs15, %rs13};
1251+
; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U;
12561252
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42;
12571253
; CHECK-SM70-NEXT: ret;
12581254
%1 = fmul <2 x bfloat> %a, %b

llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -711,8 +711,8 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
711711
; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
712712
; CHECK-SM70: {
713713
; CHECK-SM70-NEXT: .reg .pred %p<7>;
714-
; CHECK-SM70-NEXT: .reg .b16 %rs<17>;
715-
; CHECK-SM70-NEXT: .reg .b32 %r<57>;
714+
; CHECK-SM70-NEXT: .reg .b16 %rs<13>;
715+
; CHECK-SM70-NEXT: .reg .b32 %r<58>;
716716
; CHECK-SM70-NEXT: .reg .f32 %f<17>;
717717
; CHECK-SM70-EMPTY:
718718
; CHECK-SM70-NEXT: // %bb.0:
@@ -786,7 +786,6 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
786786
; CHECK-SM70-NEXT: setp.nan.f32 %p5, %f14, %f14;
787787
; CHECK-SM70-NEXT: or.b32 %r47, %r43, 4194304;
788788
; CHECK-SM70-NEXT: selp.b32 %r48, %r47, %r46, %p5;
789-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r48; }
790789
; CHECK-SM70-NEXT: and.b32 %r49, %r34, -65536;
791790
; CHECK-SM70-NEXT: mov.b32 %f15, %r49;
792791
; CHECK-SM70-NEXT: add.f32 %f16, %f15, %f9;
@@ -797,8 +796,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
797796
; CHECK-SM70-NEXT: setp.nan.f32 %p6, %f16, %f16;
798797
; CHECK-SM70-NEXT: or.b32 %r54, %r50, 4194304;
799798
; CHECK-SM70-NEXT: selp.b32 %r55, %r54, %r53, %p6;
800-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r55; }
801-
; CHECK-SM70-NEXT: mov.b32 %r56, {%rs15, %rs13};
799+
; CHECK-SM70-NEXT: prmt.b32 %r56, %r55, %r48, 0x7632U;
802800
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r56;
803801
; CHECK-SM70-NEXT: ret;
804802
%1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
@@ -837,8 +835,8 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
837835
; CHECK-SM70-LABEL: fma_bf16x2_maxnum_no_nans(
838836
; CHECK-SM70: {
839837
; CHECK-SM70-NEXT: .reg .pred %p<5>;
840-
; CHECK-SM70-NEXT: .reg .b16 %rs<17>;
841-
; CHECK-SM70-NEXT: .reg .b32 %r<43>;
838+
; CHECK-SM70-NEXT: .reg .b16 %rs<13>;
839+
; CHECK-SM70-NEXT: .reg .b32 %r<44>;
842840
; CHECK-SM70-NEXT: .reg .f32 %f<13>;
843841
; CHECK-SM70-EMPTY:
844842
; CHECK-SM70-NEXT: // %bb.0:
@@ -892,7 +890,6 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
892890
; CHECK-SM70-NEXT: setp.nan.f32 %p3, %f10, %f10;
893891
; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
894892
; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
895-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r34; }
896893
; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
897894
; CHECK-SM70-NEXT: mov.b32 %f11, %r35;
898895
; CHECK-SM70-NEXT: max.f32 %f12, %f11, 0f00000000;
@@ -903,8 +900,7 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
903900
; CHECK-SM70-NEXT: setp.nan.f32 %p4, %f12, %f12;
904901
; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
905902
; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
906-
; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r41; }
907-
; CHECK-SM70-NEXT: mov.b32 %r42, {%rs15, %rs13};
903+
; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U;
908904
; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42;
909905
; CHECK-SM70-NEXT: ret;
910906
%1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)

0 commit comments

Comments
 (0)