@@ -47,7 +47,7 @@ func.func @m16n8k32_int8_row_row_row(%arg0: memref<128x128xi8, #gpu.address_spac
   // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi8, #gpu.address_space<workgroup>> -> vector<4x4xi8>

   // Verify that the operandB load is lowered to scalar load to be able
-  // to transpose at 8-bit granularity. ldmatrix can only transpose at 
+  // to transpose at 8-bit granularity. ldmatrix can only transpose at
   // 16-bit granularity.
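+  // (PTX ldmatrix loads 8x8 tiles of 16-bit elements, so its transpose form
+  // cannot reorder individual i8 values; hence the scalar loads checked below.)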

   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB0_map]]()[{{%.+}}]
@@ -282,7 +282,7 @@ func.func @multi_dim_m16n8k16_fp16_row_row_row(%arg0: memref<4x32x1x32xf16, #gpu
   // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
   // CHECK-DAG: [[fragmentB:%.+]] = nvgpu.ldmatrix %arg1[[[c0]], [[c0]], [[k_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = true}
   %B = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map_b} : memref<4x1x32x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
-
+
   // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
   // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
   // CHECK-DAG: [[fragmentC:%.*]] = nvgpu.ldmatrix %arg2[[[c0]], [[m_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = false}
@@ -713,3 +713,125 @@ func.func @m16n8k32_int8_row_col_row(%arg0: memref<128x128xi8, #gpu.address_spac
   vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xi32>, memref<128x128xi32>
   return
 }
+
+// -----
+
+
+#map0 = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
+!smem_type = memref<20x20xf16, strided<[?, 1], offset: ?>, #gpu.address_space<workgroup>>
+
+// This test case is identical to the m16n8k16 test case, but it checks that a
+// row dimension with an unknown stride is handled correctly.
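+// The leading stride below is dynamic, but the innermost stride is the static
+// unit stride that the ldmatrix lowering needs, so the same lowering is still
+// expected to apply.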
+
+// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
+// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>
+// CHECK-LABEL: func @strided_memref_read_write
+func.func @strided_memref_read_write(%arg0: !smem_type,
+                                     %arg1: !smem_type,
+                                     %arg2: !smem_type) {
+  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+
+  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
+  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
+  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false}
+  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
+  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
+  // CHECK: nvgpu.ldmatrix %arg1[[[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true}
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x16xf16>
+  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : !smem_type, vector<8x16xf16>
+  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x8xf16>
+  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>}
+    %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
+  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, !smem_type
+  return
+}
+
+// -----
+
+
+#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d0, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+!smem_type = memref<20x20x20xf16, strided<[?, ?, 1], offset: ?>, #gpu.address_space<workgroup>>
+
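+// The mma.sync path only handles 2-D transfers; the 3-D reads and writes in
+// this function are expected to be left untouched (see the CHECK-NOTs below).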
+// CHECK-LABEL: func @unsupported_non_2d_load_store
+func.func @unsupported_non_2d_load_store(%arg0: !smem_type,
+                                         %arg1: !smem_type,
+                                         %arg2: !smem_type) {
+  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+
+  // CHECK-NOT: nvgpu.ldmatrix
+  // CHECK-NOT: nvgpu.mma
+  %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : !smem_type, vector<1x16x16xf16>
+  %B = vector.transfer_read %arg1[%c0, %c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true, true]} : !smem_type, vector<8x1x16xf16>
+  %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : !smem_type, vector<1x16x8xf16>
+  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>}
+    %A, %B, %C : vector<1x16x16xf16>, vector<8x1x16xf16> into vector<1x16x8xf16>
+  vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x16x8xf16>, !smem_type
+  return
+}
+
+// -----
+
+#map0 = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+!smem_type = memref<20x20xf16, strided<[?, ?], offset: ?>, #gpu.address_space<workgroup>>
+
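+// Here both strides are dynamic, so the innermost dimension cannot be proven
+// contiguous and no nvgpu ops should be produced.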
+// CHECK-LABEL: func @unsupported_fully_dynamic_strides
+func.func @unsupported_fully_dynamic_strides(%arg0: !smem_type,
+                                             %arg1: !smem_type,
+                                             %arg2: !smem_type) {
+  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+
+  // CHECK-NOT: nvgpu.ldmatrix
+  // CHECK-NOT: nvgpu.mma
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x16xf16>
+  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : !smem_type, vector<8x16xf16>
+  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x8xf16>
+  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>}
+    %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
+  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, !smem_type
+  return
+}
+
+// -----
+
+#map0 = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+
+!smem_type = memref<20x20xf16, strided<[?, 1], offset: ?>, #gpu.address_space<workgroup>>
+
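+// The result is written back with a transposed permutation_map, which the
+// lowering does not support, so no nvgpu ops should be produced.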
+// CHECK-LABEL: func @unsupported_transposed_store
+func.func @unsupported_transposed_store(%arg0: !smem_type,
+                                        %arg1: !smem_type,
+                                        %arg2: !smem_type) {
+  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+
+  // CHECK-NOT: nvgpu.ldmatrix
+  // CHECK-NOT: nvgpu.mma
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x16xf16>
+  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : !smem_type, vector<8x16xf16>
+  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x8xf16>
+  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>}
+    %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
+  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d1, d0)>} : vector<16x8xf16>, !smem_type
+  return
+}