@@ -204,7 +204,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
204
204
;
205
205
; SM80-LABEL: test_faddx2(
206
206
; SM80: {
207
- ; SM80-NEXT: .reg .b16 %rs<5>;
207
+ ; SM80-NEXT: .reg .b16 %rs<7>;
208
208
; SM80-NEXT: .reg .b32 %r<4>;
209
209
; SM80-NEXT: .reg .f32 %f<7>;
210
210
; SM80-EMPTY:
@@ -216,16 +216,18 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
216
216
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
217
217
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
218
218
; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
219
+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
219
220
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
220
221
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
221
222
; SM80-NEXT: add.rn.f32 %f6, %f5, %f4;
222
- ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
223
+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
224
+ ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
223
225
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
224
226
; SM80-NEXT: ret;
225
227
;
226
228
; SM80-FTZ-LABEL: test_faddx2(
227
229
; SM80-FTZ: {
228
- ; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
230
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
229
231
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
230
232
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
231
233
; SM80-FTZ-EMPTY:
@@ -237,10 +239,12 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
237
239
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
238
240
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
239
241
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1;
242
+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
240
243
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
241
244
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
242
245
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f4;
243
- ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
246
+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
247
+ ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
244
248
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
245
249
; SM80-FTZ-NEXT: ret;
246
250
;
@@ -307,7 +311,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
307
311
;
308
312
; SM80-LABEL: test_fsubx2(
309
313
; SM80: {
310
- ; SM80-NEXT: .reg .b16 %rs<5>;
314
+ ; SM80-NEXT: .reg .b16 %rs<7>;
311
315
; SM80-NEXT: .reg .b32 %r<4>;
312
316
; SM80-NEXT: .reg .f32 %f<7>;
313
317
; SM80-EMPTY:
@@ -319,16 +323,18 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
319
323
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
320
324
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
321
325
; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
326
+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
322
327
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
323
328
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
324
329
; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
325
- ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
330
+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
331
+ ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
326
332
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
327
333
; SM80-NEXT: ret;
328
334
;
329
335
; SM80-FTZ-LABEL: test_fsubx2(
330
336
; SM80-FTZ: {
331
- ; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
337
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
332
338
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
333
339
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
334
340
; SM80-FTZ-EMPTY:
@@ -340,10 +346,12 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
340
346
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
341
347
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
342
348
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1;
349
+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
343
350
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
344
351
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
345
352
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f6, %f5, %f4;
346
- ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
353
+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
354
+ ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
347
355
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
348
356
; SM80-FTZ-NEXT: ret;
349
357
;
@@ -410,7 +418,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
410
418
;
411
419
; SM80-LABEL: test_fmulx2(
412
420
; SM80: {
413
- ; SM80-NEXT: .reg .b16 %rs<5>;
421
+ ; SM80-NEXT: .reg .b16 %rs<7>;
414
422
; SM80-NEXT: .reg .b32 %r<4>;
415
423
; SM80-NEXT: .reg .f32 %f<7>;
416
424
; SM80-EMPTY:
@@ -422,16 +430,18 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
422
430
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
423
431
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
424
432
; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
433
+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
425
434
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
426
435
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
427
436
; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
428
- ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
437
+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
438
+ ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
429
439
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
430
440
; SM80-NEXT: ret;
431
441
;
432
442
; SM80-FTZ-LABEL: test_fmulx2(
433
443
; SM80-FTZ: {
434
- ; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
444
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
435
445
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
436
446
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
437
447
; SM80-FTZ-EMPTY:
@@ -443,10 +453,12 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
443
453
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
444
454
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
445
455
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f3, %f2, %f1;
456
+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
446
457
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
447
458
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
448
459
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f6, %f5, %f4;
449
- ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
460
+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
461
+ ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
450
462
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
451
463
; SM80-FTZ-NEXT: ret;
452
464
;
@@ -513,7 +525,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
513
525
;
514
526
; SM80-LABEL: test_fdiv(
515
527
; SM80: {
516
- ; SM80-NEXT: .reg .b16 %rs<5>;
528
+ ; SM80-NEXT: .reg .b16 %rs<7>;
517
529
; SM80-NEXT: .reg .b32 %r<4>;
518
530
; SM80-NEXT: .reg .f32 %f<7>;
519
531
; SM80-EMPTY:
@@ -525,16 +537,18 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
525
537
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
526
538
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
527
539
; SM80-NEXT: div.rn.f32 %f3, %f2, %f1;
540
+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
528
541
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
529
542
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
530
543
; SM80-NEXT: div.rn.f32 %f6, %f5, %f4;
531
- ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
544
+ ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
545
+ ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
532
546
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
533
547
; SM80-NEXT: ret;
534
548
;
535
549
; SM80-FTZ-LABEL: test_fdiv(
536
550
; SM80-FTZ: {
537
- ; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
551
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
538
552
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
539
553
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
540
554
; SM80-FTZ-EMPTY:
@@ -546,16 +560,18 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
546
560
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
547
561
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
548
562
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f3, %f2, %f1;
563
+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
549
564
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
550
565
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
551
566
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f6, %f5, %f4;
552
- ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
567
+ ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
568
+ ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
553
569
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
554
570
; SM80-FTZ-NEXT: ret;
555
571
;
556
572
; SM90-LABEL: test_fdiv(
557
573
; SM90: {
558
- ; SM90-NEXT: .reg .b16 %rs<5>;
574
+ ; SM90-NEXT: .reg .b16 %rs<7>;
559
575
; SM90-NEXT: .reg .b32 %r<4>;
560
576
; SM90-NEXT: .reg .f32 %f<7>;
561
577
; SM90-EMPTY:
@@ -567,10 +583,12 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
567
583
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1;
568
584
; SM90-NEXT: cvt.f32.bf16 %f2, %rs4;
569
585
; SM90-NEXT: div.rn.f32 %f3, %f2, %f1;
586
+ ; SM90-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
570
587
; SM90-NEXT: cvt.f32.bf16 %f4, %rs1;
571
588
; SM90-NEXT: cvt.f32.bf16 %f5, %rs3;
572
589
; SM90-NEXT: div.rn.f32 %f6, %f5, %f4;
573
- ; SM90-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
590
+ ; SM90-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
591
+ ; SM90-NEXT: mov.b32 %r3, {%rs6, %rs5};
574
592
; SM90-NEXT: st.param.b32 [func_retval0], %r3;
575
593
; SM90-NEXT: ret;
576
594
%r = fdiv <2 x bfloat> %a , %b
0 commit comments