@@ -9,55 +9,72 @@ func.func @transfer_to_maskedload_fatrawbuffer(%mem : memref<8x8xf32, #amdgpu.ad
  %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
  return %res : vector<4xf32>
}
- // CHECK: %[[CST:.*]] = arith.constant 0.0
- // CHECK: %[[C0:.*]] = arith.constant 0
- // CHECK: %[[C1:.*]] = arith.constant 1
- // CHECK: %[[MUL0:.*]] = arith.muli %[[ARG1]], %[[C1]]
- // CHECK: %[[ADD0:.*]] = arith.addi %[[C0]], %[[MUL0]]
- // CHECK: %[[C8:.*]] = arith.constant 8
- // CHECK: %[[MUL1:.*]] = arith.muli %[[C1]], %[[C8]]
- // CHECK: %[[MUL2:.*]] = arith.muli %[[ARG1]], %[[MUL1]]
- // CHECK: %[[ADD1:.*]] = arith.addi %[[ADD0]], %[[MUL2]]
- // CHECK: %[[C4:.*]] = arith.constant 4
- // CHECK: %[[ADD2:.*]] = arith.addi %[[ADD1]], %[[C4]]
-
- // CHECK: %[[MUL3:.*]] = arith.muli %[[C1]], %[[C8]]
- // CHECK: %[[MUL4:.*]] = arith.muli
-
- // CHECK: %[[CMP:.*]] = arith.cmpi ule, %[[ADD2]], %[[MUL4]]
- // CHECK: %[[IF:.*]] = scf.if %[[CMP]] -> (vector<4xf32>) {
-
- // CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]]
- // CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
- // CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]]
+
+ // CHECK: %[[FALSE:.*]] = arith.constant false
+ // CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<4xf32>) {
+ // CHECK: vector.maskedload %[[ARG0]][%[[ARG1]], %[[ARG1]]], %[[ARG2]]
// CHECK: } else {
- // CHECK: %[[LOAD:.*]] = vector.transfer_read %arg0[%arg1, %arg1], %[[CST]], %arg2 {amdgpu.transformed, in_bounds = [true]} : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
+ // CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
+ // CHECK: %[[SELECT:.*]] = arith.select %[[ARG2]], %[[LOAD]]
// CHECK: return %[[IF]] : vector<4xf32>
// -----
- // CHECK-LABEL: func @transfer_to_maskedload_fatrawbuffer_dynamic(
- // CHECK-SAME: %[[ARG0:.*]]: memref<?x?xf32, #amdgpu.address_space<fat_raw_buffer>>
- // CHECK-SAME: %[[ARG1:.*]]: index
- // CHECK-SAME: %[[ARG2:.*]]: vector<4xi1>
- func.func @transfer_to_maskedload_fatrawbuffer_dynamic(%mem : memref<?x?xf32, #amdgpu.address_space<fat_raw_buffer>>, %idx : index, %mask : vector<4xi1>) -> vector<4xf32> {
- %cf0 = arith.constant 0.0 : f32
- %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<?x?xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
- return %res : vector<4xf32>
+ // CHECK: #map = affine_map<()[s0, s1] -> (s0 * 8 + s1)>
+ // CHECK-LABEL: func @transfer_to_maskedload_fatrawbuffer_f16(
+ // CHECK-SAME: %[[ARG0:.+]]: memref<8x8xf16, #amdgpu.address_space<fat_raw_buffer>>,
+ // CHECK-SAME: %[[ARG1:.+]]: index, %[[ARG2:.+]]: index,
+ // CHECK-SAME: %[[ARG3:.+]]: vector<4xi1>)
+ func.func @transfer_to_maskedload_fatrawbuffer_f16(%mem : memref<8x8xf16, #amdgpu.address_space<fat_raw_buffer>>, %idx0 : index, %idx1 : index, %mask : vector<4xi1>) -> vector<4xf16> {
+ %cf0 = arith.constant 0.0 : f16
+ %res = vector.transfer_read %mem[%idx0, %idx1], %cf0, %mask {in_bounds = [true]} : memref<8x8xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
+ return %res : vector<4xf16>
}
+ // CHECK-DAG: %[[C0:.*]] = arith.constant 0
+ // CHECK-DAG: %[[SIZE:.*]] = arith.constant 64
+ // CHECK-DAG: %[[BYTES:.*]] = arith.constant 2
+ // CHECK-DAG: %[[VECTORSIZE:.*]] = arith.constant 4
+
+ // CHECK: %[[LINEAR:.*]] = affine.apply #map()[%[[ARG1]], %[[ARG2]]]
+ // CHECK: %[[DELTA:.*]] = arith.subi %[[SIZE]], %[[LINEAR]]
+ // CHECK: %[[COND1:.*]] = arith.cmpi ule, %[[DELTA]], %[[VECTORSIZE]]
+
+ // CHECK: %[[DELTABYTES:.*]] = arith.muli %[[DELTA]], %[[BYTES]]
+ // CHECK: %[[REM:.*]] = arith.remui %[[DELTABYTES]], %[[BYTES]]
+ // CHECK: %[[COND2:.*]] = arith.cmpi ne, %[[REM]], %[[C0]]
+
+ // CHECK: %[[COND:.*]] = arith.andi %[[COND1]], %[[COND2]]
+ // CHECK: %[[IF:.*]] = scf.if %[[COND]] -> (vector<4xf16>) {
+ // CHECK: vector.maskedload %[[ARG0]][%[[ARG1]], %[[ARG2]]], %[[ARG3]]
+ // CHECK: } else {
+ // CHECK: %[[LOAD:.*]] = vector.load %[[ARG0]][%[[ARG1]], %[[ARG2]]]
+ // CHECK: return %[[IF]] : vector<4xf16>
+
+ // -----
- // CHECK: %[[C1:.*]] = arith.constant 1
- // CHECK: %[[DIM1:.*]] = memref.dim %[[ARG0]], %[[C1]]
- // CHECK: %[[MUL0:.*]] = arith.muli %{{.*}}, %[[DIM1]]
- // CHECK: %[[C0:.*]] = arith.constant 0
- // CHECK: %[[DIM0:.*]] = memref.dim %[[ARG0]], %[[C0]]
- // CHECK: %[[MUL1:.*]] = arith.muli %{{.*}}, %[[DIM0]]
+ // CHECK: #map = affine_map<()[s0, s1, s2] -> (s0 * s1 + s2)>
+ // CHECK: #map1 = affine_map<()[s0, s1, s2, s3] -> (s0 * s1, s2 * s3)>
+ // CHECK-LABEL: func @transfer_to_maskedload_fatrawbuffer_dynamic_i8(
+ // CHECK-SAME: %[[ARG0:.*]]: memref<?x?xi8, #amdgpu.address_space<fat_raw_buffer>>
+ // CHECK-SAME: %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
+ // CHECK-SAME: %[[ARG3:.*]]: vector<4xi1>
+ func.func @transfer_to_maskedload_fatrawbuffer_dynamic_i8(%mem : memref<?x?xi8, #amdgpu.address_space<fat_raw_buffer>>, %idx0 : index, %idx1 : index, %mask : vector<4xi1>) -> vector<4xi8> {
+ %cf0 = arith.constant 0 : i8
+ %res = vector.transfer_read %mem[%idx0, %idx1], %cf0, %mask {in_bounds = [true]} : memref<?x?xi8, #amdgpu.address_space<fat_raw_buffer>>, vector<4xi8>
+ return %res : vector<4xi8>
+ }
- // CHECK: %[[C1_1:.*]] = arith.constant 1
- // CHECK: %[[DIM1_1:.*]] = memref.dim %[[ARG0]], %[[C1_1]]
- // CHECK: %[[MUL2:.*]] = arith.muli %{{.*}}, %[[DIM1_1]]
+ // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<4xi8>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[C4:.*]] = arith.constant 4 : index
+ // CHECK: %[[C1:.*]] = arith.constant 1 : index
+ // CHECK: %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[ARG0]]
+ // CHECK: %[[LINEAR:.*]] = affine.apply #map()[%[[ARG1]], %[[STRIDES]]#0, %[[ARG2]]]
+ // CHECK: %[[SIZE:.*]] = affine.max #map1()[%[[STRIDES]]#0, %[[SIZES]]#0, %[[C1]], %[[SIZES]]#1]
+ // CHECK: %[[IF:.*]] = scf.if
+ // CHECK: return
// -----
@@ -70,8 +87,8 @@ func.func @transfer_to_maskedload_regular(%mem : memref<8x8xf32>, %idx : index,
  %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32>, vector<4xf32>
  return %res : vector<4xf32>
}
- // CHECK: %[[CST:.*]] = arith.constant 0.0
- // CHECK: %[[RES:.*]] = vector.transfer_read %arg0[%arg1, %arg1], %[[CST]], %arg2 {in_bounds = [true]} : memref<8x8xf32>, vector<4xf32>
+ // CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32>
+ // CHECK: %[[RES:.*]] = vector.maskedload %[[ARG0]][%[[ARG1]], %[[ARG1]]], %[[ARG2]], %[[CST]]
// CHECK: return %[[RES]] : vector<4xf32>
// -----
@@ -85,8 +102,8 @@ func.func @transfer_to_maskedload_addrspace(%mem : memref<8x8xf32, #gpu.address_
  %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32, #gpu.address_space<workgroup>>, vector<4xf32>
  return %res : vector<4xf32>
}
- // CHECK: %[[CST:.*]] = arith.constant 0.0
- // CHECK: %[[RES:.*]] = vector.transfer_read %arg0[%arg1, %arg1], %[[CST]], %arg2 {in_bounds = [true]} : memref<8x8xf32, #gpu.address_space<workgroup>>, vector<4xf32>
+ // CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32>
+ // CHECK: %[[RES:.*]] = vector.maskedload %[[ARG0]][%[[ARG1]], %[[ARG1]]], %[[ARG2]], %[[CST]]
// CHECK: return %[[RES]] : vector<4xf32>
// -----
@@ -103,10 +120,11 @@ func.func @transfer_broadcasting(%mem : memref<8x8xf32, #amdgpu.address_space<fa
    : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
  return %res : vector<4xf32>
}
- // CHECK: %[[CST:.*]] = arith.constant 0.0
- // CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]]
+ // CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
+ // CHECK: %[[FALSE:.*]] = arith.constant false
+ // CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<4xf32>) {
// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
- // CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]]
+ // CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[SELECT]] : vector<1xf32> to vector<4xf32>
// -----
@@ -122,7 +140,8 @@ func.func @transfer_scalar(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_
    : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<1xf32>
  return %res : vector<1xf32>
}
- // CHECK: %[[CST:.*]] = arith.constant 0.0
- // CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]]
- // CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
- // CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]]
+ // CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
+ // CHECK: %[[FALSE:.*]] = arith.constant false
+ // CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<1xf32>) {
+ // CHECK: %[[LOAD:.*]] = vector.load %[[ARG0]][%[[ARG1]], %[[ARG1]]]
+ // CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]