@@ -204,7 +204,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
204
204
;
205
205
; SM80-LABEL: test_faddx2(
206
206
; SM80: {
207
- ; SM80-NEXT: .reg .b16 %rs<7>;
207
+ ; SM80-NEXT: .reg .b16 %rs<5>;
208
208
; SM80-NEXT: .reg .b32 %r<4>;
209
209
; SM80-NEXT: .reg .f32 %f<7>;
210
210
; SM80-EMPTY:
@@ -216,18 +216,16 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
216
216
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
217
217
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
218
218
; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
219
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
220
219
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
221
220
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
222
221
; SM80-NEXT: add.rn.f32 %f6, %f5, %f4;
223
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
224
- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
222
+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
225
223
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
226
224
; SM80-NEXT: ret;
227
225
;
228
226
; SM80-FTZ-LABEL: test_faddx2(
229
227
; SM80-FTZ: {
230
- ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
228
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
231
229
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
232
230
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
233
231
; SM80-FTZ-EMPTY:
@@ -239,12 +237,10 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
239
237
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
240
238
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
241
239
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1;
242
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
243
240
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
244
241
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
245
242
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f4;
246
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
247
- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
243
+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
248
244
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
249
245
; SM80-FTZ-NEXT: ret;
250
246
;
@@ -311,7 +307,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
311
307
;
312
308
; SM80-LABEL: test_fsubx2(
313
309
; SM80: {
314
- ; SM80-NEXT: .reg .b16 %rs<7>;
310
+ ; SM80-NEXT: .reg .b16 %rs<5>;
315
311
; SM80-NEXT: .reg .b32 %r<4>;
316
312
; SM80-NEXT: .reg .f32 %f<7>;
317
313
; SM80-EMPTY:
@@ -323,18 +319,16 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
323
319
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
324
320
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
325
321
; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
326
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
327
322
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
328
323
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
329
324
; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
330
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
331
- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
325
+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
332
326
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
333
327
; SM80-NEXT: ret;
334
328
;
335
329
; SM80-FTZ-LABEL: test_fsubx2(
336
330
; SM80-FTZ: {
337
- ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
331
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
338
332
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
339
333
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
340
334
; SM80-FTZ-EMPTY:
@@ -346,12 +340,10 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
346
340
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
347
341
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
348
342
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1;
349
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
350
343
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
351
344
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
352
345
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f6, %f5, %f4;
353
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
354
- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
346
+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
355
347
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
356
348
; SM80-FTZ-NEXT: ret;
357
349
;
@@ -418,7 +410,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
418
410
;
419
411
; SM80-LABEL: test_fmulx2(
420
412
; SM80: {
421
- ; SM80-NEXT: .reg .b16 %rs<7>;
413
+ ; SM80-NEXT: .reg .b16 %rs<5>;
422
414
; SM80-NEXT: .reg .b32 %r<4>;
423
415
; SM80-NEXT: .reg .f32 %f<7>;
424
416
; SM80-EMPTY:
@@ -430,18 +422,16 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
430
422
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
431
423
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
432
424
; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
433
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
434
425
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
435
426
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
436
427
; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
437
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
438
- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
428
+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
439
429
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
440
430
; SM80-NEXT: ret;
441
431
;
442
432
; SM80-FTZ-LABEL: test_fmulx2(
443
433
; SM80-FTZ: {
444
- ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
434
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
445
435
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
446
436
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
447
437
; SM80-FTZ-EMPTY:
@@ -453,12 +443,10 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
453
443
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
454
444
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
455
445
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f3, %f2, %f1;
456
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
457
446
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
458
447
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
459
448
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f6, %f5, %f4;
460
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
461
- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
449
+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
462
450
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
463
451
; SM80-FTZ-NEXT: ret;
464
452
;
@@ -525,7 +513,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
525
513
;
526
514
; SM80-LABEL: test_fdiv(
527
515
; SM80: {
528
- ; SM80-NEXT: .reg .b16 %rs<7>;
516
+ ; SM80-NEXT: .reg .b16 %rs<5>;
529
517
; SM80-NEXT: .reg .b32 %r<4>;
530
518
; SM80-NEXT: .reg .f32 %f<7>;
531
519
; SM80-EMPTY:
@@ -537,18 +525,16 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
537
525
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
538
526
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
539
527
; SM80-NEXT: div.rn.f32 %f3, %f2, %f1;
540
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
541
528
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
542
529
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
543
530
; SM80-NEXT: div.rn.f32 %f6, %f5, %f4;
544
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
545
- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
531
+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
546
532
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
547
533
; SM80-NEXT: ret;
548
534
;
549
535
; SM80-FTZ-LABEL: test_fdiv(
550
536
; SM80-FTZ: {
551
- ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
537
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
552
538
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
553
539
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
554
540
; SM80-FTZ-EMPTY:
@@ -560,18 +546,16 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
560
546
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
561
547
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
562
548
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f3, %f2, %f1;
563
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
564
549
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
565
550
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
566
551
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f6, %f5, %f4;
567
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
568
- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
552
+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
569
553
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
570
554
; SM80-FTZ-NEXT: ret;
571
555
;
572
556
; SM90-LABEL: test_fdiv(
573
557
; SM90: {
574
- ; SM90-NEXT: .reg .b16 %rs<7>;
558
+ ; SM90-NEXT: .reg .b16 %rs<5>;
575
559
; SM90-NEXT: .reg .b32 %r<4>;
576
560
; SM90-NEXT: .reg .f32 %f<7>;
577
561
; SM90-EMPTY:
@@ -583,12 +567,10 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
583
567
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1;
584
568
; SM90-NEXT: cvt.f32.bf16 %f2, %rs4;
585
569
; SM90-NEXT: div.rn.f32 %f3, %f2, %f1;
586
- ; SM90-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
587
570
; SM90-NEXT: cvt.f32.bf16 %f4, %rs1;
588
571
; SM90-NEXT: cvt.f32.bf16 %f5, %rs3;
589
572
; SM90-NEXT: div.rn.f32 %f6, %f5, %f4;
590
- ; SM90-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
591
- ; SM90-NEXT: mov.b32 %r3, {%rs6, %rs5};
573
+ ; SM90-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
592
574
; SM90-NEXT: st.param.b32 [func_retval0], %r3;
593
575
; SM90-NEXT: ret;
594
576
%r = fdiv <2 x bfloat> %a , %b
0 commit comments