@@ -204,47 +204,43 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
204
204
;
205
205
; SM80-LABEL: test_faddx2(
206
206
; SM80: {
207
- ; SM80-NEXT: .reg .b16 %rs<7 >;
207
+ ; SM80-NEXT: .reg .b16 %rs<5 >;
208
208
; SM80-NEXT: .reg .b32 %r<4>;
209
209
; SM80-NEXT: .reg .f32 %f<7>;
210
210
; SM80-EMPTY:
211
211
; SM80-NEXT: // %bb.0:
212
212
; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
213
213
; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
214
214
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
215
- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2 ;
215
+ ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1 ;
216
216
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
217
- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4 ;
217
+ ; SM80-NEXT: cvt.f32.bf16 %f2, %rs3 ;
218
218
; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
219
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
220
- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
221
- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
219
+ ; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
220
+ ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
222
221
; SM80-NEXT: add.rn.f32 %f6, %f5, %f4;
223
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
224
- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
222
+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
225
223
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
226
224
; SM80-NEXT: ret;
227
225
;
228
226
; SM80-FTZ-LABEL: test_faddx2(
229
227
; SM80-FTZ: {
230
- ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
228
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
231
229
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
232
230
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
233
231
; SM80-FTZ-EMPTY:
234
232
; SM80-FTZ-NEXT: // %bb.0:
235
233
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
236
234
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
237
235
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
238
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2 ;
236
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1 ;
239
237
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
240
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4 ;
238
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3 ;
241
239
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1;
242
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
243
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
244
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
240
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
241
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
245
242
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f4;
246
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
247
- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
243
+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
248
244
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
249
245
; SM80-FTZ-NEXT: ret;
250
246
;
@@ -311,47 +307,43 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
311
307
;
312
308
; SM80-LABEL: test_fsubx2(
313
309
; SM80: {
314
- ; SM80-NEXT: .reg .b16 %rs<7 >;
310
+ ; SM80-NEXT: .reg .b16 %rs<5 >;
315
311
; SM80-NEXT: .reg .b32 %r<4>;
316
312
; SM80-NEXT: .reg .f32 %f<7>;
317
313
; SM80-EMPTY:
318
314
; SM80-NEXT: // %bb.0:
319
315
; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
320
316
; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
321
317
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
322
- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2 ;
318
+ ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1 ;
323
319
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
324
- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4 ;
320
+ ; SM80-NEXT: cvt.f32.bf16 %f2, %rs3 ;
325
321
; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
326
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
327
- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
328
- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
322
+ ; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
323
+ ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
329
324
; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
330
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
331
- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
325
+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
332
326
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
333
327
; SM80-NEXT: ret;
334
328
;
335
329
; SM80-FTZ-LABEL: test_fsubx2(
336
330
; SM80-FTZ: {
337
- ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
331
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
338
332
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
339
333
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
340
334
; SM80-FTZ-EMPTY:
341
335
; SM80-FTZ-NEXT: // %bb.0:
342
336
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
343
337
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
344
338
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
345
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2 ;
339
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1 ;
346
340
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
347
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4 ;
341
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3 ;
348
342
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1;
349
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
350
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
351
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
343
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
344
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
352
345
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f6, %f5, %f4;
353
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
354
- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
346
+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
355
347
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
356
348
; SM80-FTZ-NEXT: ret;
357
349
;
@@ -418,47 +410,43 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
418
410
;
419
411
; SM80-LABEL: test_fmulx2(
420
412
; SM80: {
421
- ; SM80-NEXT: .reg .b16 %rs<7 >;
413
+ ; SM80-NEXT: .reg .b16 %rs<5 >;
422
414
; SM80-NEXT: .reg .b32 %r<4>;
423
415
; SM80-NEXT: .reg .f32 %f<7>;
424
416
; SM80-EMPTY:
425
417
; SM80-NEXT: // %bb.0:
426
418
; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
427
419
; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
428
420
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
429
- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2 ;
421
+ ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1 ;
430
422
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
431
- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4 ;
423
+ ; SM80-NEXT: cvt.f32.bf16 %f2, %rs3 ;
432
424
; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
433
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
434
- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
435
- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
425
+ ; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
426
+ ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
436
427
; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
437
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
438
- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
428
+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
439
429
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
440
430
; SM80-NEXT: ret;
441
431
;
442
432
; SM80-FTZ-LABEL: test_fmulx2(
443
433
; SM80-FTZ: {
444
- ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
434
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
445
435
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
446
436
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
447
437
; SM80-FTZ-EMPTY:
448
438
; SM80-FTZ-NEXT: // %bb.0:
449
439
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
450
440
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
451
441
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
452
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2 ;
442
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1 ;
453
443
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
454
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4 ;
444
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3 ;
455
445
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f3, %f2, %f1;
456
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
457
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
458
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
446
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
447
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
459
448
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f6, %f5, %f4;
460
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
461
- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
449
+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
462
450
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
463
451
; SM80-FTZ-NEXT: ret;
464
452
;
@@ -525,70 +513,64 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
525
513
;
526
514
; SM80-LABEL: test_fdiv(
527
515
; SM80: {
528
- ; SM80-NEXT: .reg .b16 %rs<7 >;
516
+ ; SM80-NEXT: .reg .b16 %rs<5 >;
529
517
; SM80-NEXT: .reg .b32 %r<4>;
530
518
; SM80-NEXT: .reg .f32 %f<7>;
531
519
; SM80-EMPTY:
532
520
; SM80-NEXT: // %bb.0:
533
521
; SM80-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
534
522
; SM80-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
535
523
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
536
- ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2 ;
524
+ ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1 ;
537
525
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
538
- ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4 ;
526
+ ; SM80-NEXT: cvt.f32.bf16 %f2, %rs3 ;
539
527
; SM80-NEXT: div.rn.f32 %f3, %f2, %f1;
540
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
541
- ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
542
- ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
528
+ ; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
529
+ ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
543
530
; SM80-NEXT: div.rn.f32 %f6, %f5, %f4;
544
- ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
545
- ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
531
+ ; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
546
532
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
547
533
; SM80-NEXT: ret;
548
534
;
549
535
; SM80-FTZ-LABEL: test_fdiv(
550
536
; SM80-FTZ: {
551
- ; SM80-FTZ-NEXT: .reg .b16 %rs<7 >;
537
+ ; SM80-FTZ-NEXT: .reg .b16 %rs<5 >;
552
538
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
553
539
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
554
540
; SM80-FTZ-EMPTY:
555
541
; SM80-FTZ-NEXT: // %bb.0:
556
542
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
557
543
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
558
544
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
559
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2 ;
545
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1 ;
560
546
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
561
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4 ;
547
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3 ;
562
548
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f3, %f2, %f1;
563
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
564
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
565
- ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
549
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
550
+ ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
566
551
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f6, %f5, %f4;
567
- ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
568
- ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
552
+ ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
569
553
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
570
554
; SM80-FTZ-NEXT: ret;
571
555
;
572
556
; SM90-LABEL: test_fdiv(
573
557
; SM90: {
574
- ; SM90-NEXT: .reg .b16 %rs<7 >;
558
+ ; SM90-NEXT: .reg .b16 %rs<5 >;
575
559
; SM90-NEXT: .reg .b32 %r<4>;
576
560
; SM90-NEXT: .reg .f32 %f<7>;
577
561
; SM90-EMPTY:
578
562
; SM90-NEXT: // %bb.0:
579
563
; SM90-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
580
564
; SM90-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
581
565
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2;
582
- ; SM90-NEXT: cvt.f32.bf16 %f1, %rs2 ;
566
+ ; SM90-NEXT: cvt.f32.bf16 %f1, %rs1 ;
583
567
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1;
584
- ; SM90-NEXT: cvt.f32.bf16 %f2, %rs4 ;
568
+ ; SM90-NEXT: cvt.f32.bf16 %f2, %rs3 ;
585
569
; SM90-NEXT: div.rn.f32 %f3, %f2, %f1;
586
- ; SM90-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
587
- ; SM90-NEXT: cvt.f32.bf16 %f4, %rs1;
588
- ; SM90-NEXT: cvt.f32.bf16 %f5, %rs3;
570
+ ; SM90-NEXT: cvt.f32.bf16 %f4, %rs2;
571
+ ; SM90-NEXT: cvt.f32.bf16 %f5, %rs4;
589
572
; SM90-NEXT: div.rn.f32 %f6, %f5, %f4;
590
- ; SM90-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
591
- ; SM90-NEXT: mov.b32 %r3, {%rs6, %rs5};
573
+ ; SM90-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
592
574
; SM90-NEXT: st.param.b32 [func_retval0], %r3;
593
575
; SM90-NEXT: ret;
594
576
%r = fdiv <2 x bfloat> %a , %b
0 commit comments