Skip to content

Commit 4b24ab4

Browse files
authored
Reland "[NVPTX] Add folding for cvt.rn.bf16x2.f32" (#116417)
Reland #116109. Fixes an issue where the operands were flipped.

Per the PTX spec, a mov instruction packs the first operand as the low half and the second operand as the high half:

> ```
> // pack two 16-bit elements into .b32
> d = a.x | (a.y << 16)
> ```

On the other hand, cvt.rn.f16x2.f32 instructions take the high operand first, then the low operand:

> For .f16x2 and .bf16x2 instruction type, two inputs a and b of .f32 type are converted into .f16 or .bf16 type and the converted values are packed in the destination register d, such that the value converted from input a is stored in the upper half of d and the value converted from input b is stored in the lower half of d
1 parent 7ff8929 commit 4b24ab4

File tree

8 files changed

+379
-304
lines changed

8 files changed

+379
-304
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -727,6 +727,20 @@ let hasSideEffects = false in {
727727
def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">;
728728
}
729729

730+
def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{
731+
return N->hasOneUse();
732+
}]>;
733+
734+
def : Pat<(v2bf16 (build_vector (bf16 (fpround_oneuse Float32Regs:$lo)),
735+
(bf16 (fpround_oneuse Float32Regs:$hi)))),
736+
(CVT_bf16x2_f32 Float32Regs:$hi, Float32Regs:$lo, CvtRN)>,
737+
Requires<[hasPTX<70>, hasSM<80>, hasBF16Math]>;
738+
739+
def : Pat<(v2f16 (build_vector (f16 (fpround_oneuse Float32Regs:$lo)),
740+
(f16 (fpround_oneuse Float32Regs:$hi)))),
741+
(CVT_f16x2_f32 Float32Regs:$hi, Float32Regs:$lo, CvtRN)>,
742+
Requires<[hasPTX<70>, hasSM<80>, useFP16Math]>;
743+
730744
//-----------------------------------
731745
// Selection instructions (selp)
732746
//-----------------------------------

llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Lines changed: 54 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -204,47 +204,43 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
204204
;
205205
; SM80-LABEL: test_faddx2(
206206
; SM80: {
207-
; SM80-NEXT: .reg .b16 %rs<7>;
207+
; SM80-NEXT: .reg .b16 %rs<5>;
208208
; SM80-NEXT: .reg .b32 %r<4>;
209209
; SM80-NEXT: .reg .f32 %f<7>;
210210
; SM80-EMPTY:
211211
; SM80-NEXT: // %bb.0:
212212
; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
213213
; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
214214
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
215-
; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
215+
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
216216
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
217-
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
217+
; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
218218
; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
219-
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
220-
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
221-
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
219+
; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
220+
; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
222221
; SM80-NEXT: add.rn.f32 %f6, %f5, %f4;
223-
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
224-
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
222+
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
225223
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
226224
; SM80-NEXT: ret;
227225
;
228226
; SM80-FTZ-LABEL: test_faddx2(
229227
; SM80-FTZ: {
230-
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
228+
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
231229
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
232230
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
233231
; SM80-FTZ-EMPTY:
234232
; SM80-FTZ-NEXT: // %bb.0:
235233
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
236234
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
237235
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
238-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
236+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
239237
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
240-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
238+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3;
241239
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1;
242-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
243-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
244-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
240+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
241+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
245242
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f4;
246-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
247-
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
243+
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
248244
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
249245
; SM80-FTZ-NEXT: ret;
250246
;
@@ -311,47 +307,43 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
311307
;
312308
; SM80-LABEL: test_fsubx2(
313309
; SM80: {
314-
; SM80-NEXT: .reg .b16 %rs<7>;
310+
; SM80-NEXT: .reg .b16 %rs<5>;
315311
; SM80-NEXT: .reg .b32 %r<4>;
316312
; SM80-NEXT: .reg .f32 %f<7>;
317313
; SM80-EMPTY:
318314
; SM80-NEXT: // %bb.0:
319315
; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
320316
; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
321317
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
322-
; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
318+
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
323319
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
324-
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
320+
; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
325321
; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
326-
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
327-
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
328-
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
322+
; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
323+
; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
329324
; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
330-
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
331-
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
325+
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
332326
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
333327
; SM80-NEXT: ret;
334328
;
335329
; SM80-FTZ-LABEL: test_fsubx2(
336330
; SM80-FTZ: {
337-
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
331+
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
338332
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
339333
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
340334
; SM80-FTZ-EMPTY:
341335
; SM80-FTZ-NEXT: // %bb.0:
342336
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
343337
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
344338
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
345-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
339+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
346340
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
347-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
341+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3;
348342
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1;
349-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
350-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
351-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
343+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
344+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
352345
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f6, %f5, %f4;
353-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
354-
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
346+
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
355347
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
356348
; SM80-FTZ-NEXT: ret;
357349
;
@@ -418,47 +410,43 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
418410
;
419411
; SM80-LABEL: test_fmulx2(
420412
; SM80: {
421-
; SM80-NEXT: .reg .b16 %rs<7>;
413+
; SM80-NEXT: .reg .b16 %rs<5>;
422414
; SM80-NEXT: .reg .b32 %r<4>;
423415
; SM80-NEXT: .reg .f32 %f<7>;
424416
; SM80-EMPTY:
425417
; SM80-NEXT: // %bb.0:
426418
; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
427419
; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
428420
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
429-
; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
421+
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
430422
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
431-
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
423+
; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
432424
; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
433-
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
434-
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
435-
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
425+
; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
426+
; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
436427
; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
437-
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
438-
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
428+
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
439429
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
440430
; SM80-NEXT: ret;
441431
;
442432
; SM80-FTZ-LABEL: test_fmulx2(
443433
; SM80-FTZ: {
444-
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
434+
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
445435
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
446436
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
447437
; SM80-FTZ-EMPTY:
448438
; SM80-FTZ-NEXT: // %bb.0:
449439
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
450440
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
451441
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
452-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
442+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
453443
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
454-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
444+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3;
455445
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f3, %f2, %f1;
456-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
457-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
458-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
446+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
447+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
459448
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f6, %f5, %f4;
460-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
461-
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
449+
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
462450
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
463451
; SM80-FTZ-NEXT: ret;
464452
;
@@ -525,70 +513,64 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
525513
;
526514
; SM80-LABEL: test_fdiv(
527515
; SM80: {
528-
; SM80-NEXT: .reg .b16 %rs<7>;
516+
; SM80-NEXT: .reg .b16 %rs<5>;
529517
; SM80-NEXT: .reg .b32 %r<4>;
530518
; SM80-NEXT: .reg .f32 %f<7>;
531519
; SM80-EMPTY:
532520
; SM80-NEXT: // %bb.0:
533521
; SM80-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
534522
; SM80-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
535523
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
536-
; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
524+
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
537525
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
538-
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
526+
; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
539527
; SM80-NEXT: div.rn.f32 %f3, %f2, %f1;
540-
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
541-
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
542-
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
528+
; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
529+
; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
543530
; SM80-NEXT: div.rn.f32 %f6, %f5, %f4;
544-
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
545-
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
531+
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
546532
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
547533
; SM80-NEXT: ret;
548534
;
549535
; SM80-FTZ-LABEL: test_fdiv(
550536
; SM80-FTZ: {
551-
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
537+
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
552538
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
553539
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
554540
; SM80-FTZ-EMPTY:
555541
; SM80-FTZ-NEXT: // %bb.0:
556542
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
557543
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
558544
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
559-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
545+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
560546
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
561-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
547+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3;
562548
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f3, %f2, %f1;
563-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
564-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
565-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
549+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
550+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
566551
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f6, %f5, %f4;
567-
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
568-
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
552+
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
569553
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
570554
; SM80-FTZ-NEXT: ret;
571555
;
572556
; SM90-LABEL: test_fdiv(
573557
; SM90: {
574-
; SM90-NEXT: .reg .b16 %rs<7>;
558+
; SM90-NEXT: .reg .b16 %rs<5>;
575559
; SM90-NEXT: .reg .b32 %r<4>;
576560
; SM90-NEXT: .reg .f32 %f<7>;
577561
; SM90-EMPTY:
578562
; SM90-NEXT: // %bb.0:
579563
; SM90-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
580564
; SM90-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
581565
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2;
582-
; SM90-NEXT: cvt.f32.bf16 %f1, %rs2;
566+
; SM90-NEXT: cvt.f32.bf16 %f1, %rs1;
583567
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1;
584-
; SM90-NEXT: cvt.f32.bf16 %f2, %rs4;
568+
; SM90-NEXT: cvt.f32.bf16 %f2, %rs3;
585569
; SM90-NEXT: div.rn.f32 %f3, %f2, %f1;
586-
; SM90-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
587-
; SM90-NEXT: cvt.f32.bf16 %f4, %rs1;
588-
; SM90-NEXT: cvt.f32.bf16 %f5, %rs3;
570+
; SM90-NEXT: cvt.f32.bf16 %f4, %rs2;
571+
; SM90-NEXT: cvt.f32.bf16 %f5, %rs4;
589572
; SM90-NEXT: div.rn.f32 %f6, %f5, %f4;
590-
; SM90-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
591-
; SM90-NEXT: mov.b32 %r3, {%rs6, %rs5};
573+
; SM90-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
592574
; SM90-NEXT: st.param.b32 [func_retval0], %r3;
593575
; SM90-NEXT: ret;
594576
%r = fdiv <2 x bfloat> %a, %b

llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,18 @@ declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0
1010
define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
1111
; CHECK-LABEL: test_sin(
1212
; CHECK: {
13-
; CHECK-NEXT: .reg .b16 %rs<5>;
13+
; CHECK-NEXT: .reg .b16 %rs<3>;
1414
; CHECK-NEXT: .reg .b32 %r<3>;
1515
; CHECK-NEXT: .reg .f32 %f<5>;
1616
; CHECK-EMPTY:
1717
; CHECK-NEXT: // %bb.0:
1818
; CHECK-NEXT: ld.param.b32 %r1, [test_sin_param_0];
1919
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
20-
; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2;
20+
; CHECK-NEXT: cvt.f32.bf16 %f1, %rs1;
2121
; CHECK-NEXT: sin.approx.f32 %f2, %f1;
22-
; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2;
23-
; CHECK-NEXT: cvt.f32.bf16 %f3, %rs1;
22+
; CHECK-NEXT: cvt.f32.bf16 %f3, %rs2;
2423
; CHECK-NEXT: sin.approx.f32 %f4, %f3;
25-
; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4;
26-
; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
24+
; CHECK-NEXT: cvt.rn.bf16x2.f32 %r2, %f4, %f2;
2725
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
2826
; CHECK-NEXT: ret;
2927
%r = call <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a)
@@ -33,20 +31,18 @@ define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
3331
define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 {
3432
; CHECK-LABEL: test_cos(
3533
; CHECK: {
36-
; CHECK-NEXT: .reg .b16 %rs<5>;
34+
; CHECK-NEXT: .reg .b16 %rs<3>;
3735
; CHECK-NEXT: .reg .b32 %r<3>;
3836
; CHECK-NEXT: .reg .f32 %f<5>;
3937
; CHECK-EMPTY:
4038
; CHECK-NEXT: // %bb.0:
4139
; CHECK-NEXT: ld.param.b32 %r1, [test_cos_param_0];
4240
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
43-
; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2;
41+
; CHECK-NEXT: cvt.f32.bf16 %f1, %rs1;
4442
; CHECK-NEXT: cos.approx.f32 %f2, %f1;
45-
; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2;
46-
; CHECK-NEXT: cvt.f32.bf16 %f3, %rs1;
43+
; CHECK-NEXT: cvt.f32.bf16 %f3, %rs2;
4744
; CHECK-NEXT: cos.approx.f32 %f4, %f3;
48-
; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4;
49-
; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
45+
; CHECK-NEXT: cvt.rn.bf16x2.f32 %r2, %f4, %f2;
5046
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
5147
; CHECK-NEXT: ret;
5248
%r = call <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a)

0 commit comments

Comments
 (0)