@@ -77,6 +77,7 @@ int main() {
 // CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1f32(float addrspace(1)* %_arg_accC, i32 16) #{{.*}}
 joint_matrix_load(sg, sub_c, accC.get_pointer(), N);
 
+// CHECK: tail call i32 @llvm.nvvm.f2tf32.rna(float {{.*}}
 // Round a, b to tf32
 for (auto i = 0; i < 4; ++i)
   sub_a.data[i] = float_to_tf32(sub_a.data[i]);
@@ -120,14 +121,15 @@ int main() {
 joint_matrix_load(sg, sub_b, accB.get_pointer(), N);
 // CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1f32(float addrspace(1)* {{.*}}, i32 {{.*}}) #{{.*}}
 joint_matrix_load(sg, sub_c, accC.get_pointer(), N);
-
+
+// CHECK: tail call i32 @llvm.nvvm.f2tf32.rna(float {{.*}}
 // Round a, b to tf32
 for (auto i = 0; i < 4; ++i)
   sub_a.data[i] = float_to_tf32(sub_a.data[i]);
 
 for (auto i = 0; i < 4; ++i)
   sub_b.data[i] = float_to_tf32(sub_b.data[i]);
-
+
 // CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.col.col.tf32(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) #{{.*}}
 sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
 // CHECK: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1f32(float addrspace(1)* {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) #{{.*}}
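The CHECK lines added in both hunks pin the lowering of `float_to_tf32` to the `llvm.nvvm.f2tf32.rna` NVVM intrinsic (which corresponds to PTX's `cvt.rna.tf32.f32` and, per the CHECK pattern, returns the converted value's raw bits as an `i32`). As context for what that rounding does, here is a minimal software sketch, under the assumption that tf32 keeps f32's sign bit and 8-bit exponent but only the top 10 of the 23 mantissa bits; the helper name `float_to_tf32_sw` is hypothetical and this is not the SYCL runtime's implementation:

#include <cstdint>
#include <cstring>

// Hypothetical software model of "rna" float->tf32 rounding (not the
// SYCL runtime's float_to_tf32). tf32 drops the low 13 mantissa bits.
// NaN/Inf inputs are not handled in this sketch.
inline float float_to_tf32_sw(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits)); // raw f32 bits, no UB
  bits += 1u << 12;          // add half the dropped range: round to
                             // nearest, ties away from zero ("rna")
  bits &= ~((1u << 13) - 1); // clear the 13 low mantissa bits
  float r;
  std::memcpy(&r, &bits, sizeof(r));
  return r; // still a 32-bit float, now exactly representable in tf32
}

Adding half of the discarded range before masking is what makes this round-to-nearest with ties away from zero; the value stays in a 32-bit container, since tf32 narrows the precision rather than the storage type.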