Skip to content

Reland "[NVPTX] Add folding for cvt.rn.bf16x2.f32" #116417

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,20 @@ let hasSideEffects = false in {
def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">;
}

// Pattern fragment wrapping `fpround`: it matches an fpround of $src only when
// the predicate below holds, i.e. hasOneUse() is true for the matched node N.
// NOTE(review): presumably the single-use restriction keeps the packed-convert
// folds from firing when the scalar rounded value is still needed elsewhere --
// confirm against the patterns that use this fragment.
def fpround_oneuse
    : PatFrag<(ops node:$src), (fpround node:$src),
              [{ return N->hasOneUse(); }]>;

// Fold a v2bf16 build_vector whose two elements are each an fpround_oneuse of
// an f32 register into a single CVT_bf16x2_f32 using the CvtRN mode.
// Operand order is swapped relative to the build_vector: element 1 is passed
// first and element 0 second.
// Selected only when hasPTX<70>, hasSM<80> and hasBF16Math all hold.
let Predicates = [hasPTX<70>, hasSM<80>, hasBF16Math] in
def : Pat<(v2bf16 (build_vector (bf16 (fpround_oneuse Float32Regs:$elt0)),
                                (bf16 (fpround_oneuse Float32Regs:$elt1)))),
          (CVT_bf16x2_f32 Float32Regs:$elt1, Float32Regs:$elt0, CvtRN)>;

// Fold a v2f16 build_vector whose two elements are each an fpround_oneuse of
// an f32 register into a single CVT_f16x2_f32 using the CvtRN mode.
// Operand order is swapped relative to the build_vector: element 1 is passed
// first and element 0 second.
// Selected only when hasPTX<70>, hasSM<80> and useFP16Math all hold.
let Predicates = [hasPTX<70>, hasSM<80>, useFP16Math] in
def : Pat<(v2f16 (build_vector (f16 (fpround_oneuse Float32Regs:$elt0)),
                               (f16 (fpround_oneuse Float32Regs:$elt1)))),
          (CVT_f16x2_f32 Float32Regs:$elt1, Float32Regs:$elt0, CvtRN)>;

//-----------------------------------
// Selection instructions (selp)
//-----------------------------------
Expand Down
126 changes: 54 additions & 72 deletions llvm/test/CodeGen/NVPTX/bf16-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -204,47 +204,43 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
;
; SM80-LABEL: test_faddx2(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<7>;
; SM80-NEXT: .reg .b16 %rs<5>;
; SM80-NEXT: .reg .b32 %r<4>;
; SM80-NEXT: .reg .f32 %f<7>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
; SM80-NEXT: add.rn.f32 %f6, %f5, %f4;
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_faddx2(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f4;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-FTZ-NEXT: ret;
;
Expand Down Expand Up @@ -311,47 +307,43 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
;
; SM80-LABEL: test_fsubx2(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<7>;
; SM80-NEXT: .reg .b16 %rs<5>;
; SM80-NEXT: .reg .b32 %r<4>;
; SM80-NEXT: .reg .f32 %f<7>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_fsubx2(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f6, %f5, %f4;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-FTZ-NEXT: ret;
;
Expand Down Expand Up @@ -418,47 +410,43 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
;
; SM80-LABEL: test_fmulx2(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<7>;
; SM80-NEXT: .reg .b16 %rs<5>;
; SM80-NEXT: .reg .b32 %r<4>;
; SM80-NEXT: .reg .f32 %f<7>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_fmulx2(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3;
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f3, %f2, %f1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f6, %f5, %f4;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-FTZ-NEXT: ret;
;
Expand Down Expand Up @@ -525,70 +513,64 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
;
; SM80-LABEL: test_fdiv(
; SM80: {
; SM80-NEXT: .reg .b16 %rs<7>;
; SM80-NEXT: .reg .b16 %rs<5>;
; SM80-NEXT: .reg .b32 %r<4>;
; SM80-NEXT: .reg .f32 %f<7>;
; SM80-EMPTY:
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
; SM80-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
; SM80-NEXT: cvt.f32.bf16 %f2, %rs3;
; SM80-NEXT: div.rn.f32 %f3, %f2, %f1;
; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
; SM80-NEXT: cvt.f32.bf16 %f4, %rs2;
; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
; SM80-NEXT: div.rn.f32 %f6, %f5, %f4;
; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
; SM80-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-NEXT: ret;
;
; SM80-FTZ-LABEL: test_fdiv(
; SM80-FTZ: {
; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
; SM80-FTZ-NEXT: .reg .b16 %rs<5>;
; SM80-FTZ-NEXT: .reg .b32 %r<4>;
; SM80-FTZ-NEXT: .reg .f32 %f<7>;
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs3;
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f3, %f2, %f1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs2;
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
; SM80-FTZ-NEXT: div.rn.ftz.f32 %f6, %f5, %f4;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-FTZ-NEXT: ret;
;
; SM90-LABEL: test_fdiv(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<7>;
; SM90-NEXT: .reg .b16 %rs<5>;
; SM90-NEXT: .reg .b32 %r<4>;
; SM90-NEXT: .reg .f32 %f<7>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
; SM90-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM90-NEXT: cvt.f32.bf16 %f1, %rs2;
; SM90-NEXT: cvt.f32.bf16 %f1, %rs1;
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM90-NEXT: cvt.f32.bf16 %f2, %rs4;
; SM90-NEXT: cvt.f32.bf16 %f2, %rs3;
; SM90-NEXT: div.rn.f32 %f3, %f2, %f1;
; SM90-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
; SM90-NEXT: cvt.f32.bf16 %f4, %rs1;
; SM90-NEXT: cvt.f32.bf16 %f5, %rs3;
; SM90-NEXT: cvt.f32.bf16 %f4, %rs2;
; SM90-NEXT: cvt.f32.bf16 %f5, %rs4;
; SM90-NEXT: div.rn.f32 %f6, %f5, %f4;
; SM90-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
; SM90-NEXT: mov.b32 %r3, {%rs6, %rs5};
; SM90-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3;
; SM90-NEXT: st.param.b32 [func_retval0], %r3;
; SM90-NEXT: ret;
%r = fdiv <2 x bfloat> %a, %b
Expand Down
20 changes: 8 additions & 12 deletions llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,18 @@ declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0
define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
; CHECK-LABEL: test_sin(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_sin_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2;
; CHECK-NEXT: cvt.f32.bf16 %f1, %rs1;
; CHECK-NEXT: sin.approx.f32 %f2, %f1;
; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2;
; CHECK-NEXT: cvt.f32.bf16 %f3, %rs1;
; CHECK-NEXT: cvt.f32.bf16 %f3, %rs2;
; CHECK-NEXT: sin.approx.f32 %f4, %f3;
; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4;
; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
; CHECK-NEXT: cvt.rn.bf16x2.f32 %r2, %f4, %f2;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%r = call <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a)
Expand All @@ -33,20 +31,18 @@ define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 {
; CHECK-LABEL: test_cos(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_cos_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2;
; CHECK-NEXT: cvt.f32.bf16 %f1, %rs1;
; CHECK-NEXT: cos.approx.f32 %f2, %f1;
; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2;
; CHECK-NEXT: cvt.f32.bf16 %f3, %rs1;
; CHECK-NEXT: cvt.f32.bf16 %f3, %rs2;
; CHECK-NEXT: cos.approx.f32 %f4, %f3;
; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4;
; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
; CHECK-NEXT: cvt.rn.bf16x2.f32 %r2, %f4, %f2;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%r = call <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a)
Expand Down
Loading
Loading