 
 #include "indexing_utils.h"
 
+// This macro is needed to convince the SPIR-V compiler to unroll the loops
+// optimally.
+#define FOUR 4
+
+#ifdef TILE_ROW_2
+#define TILE_ROWS 2
+#else
+#define TILE_ROWS 4
+#endif
+
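+// Per-invocation accumulator tiles: a TILE_ROWS x 4 output tile for the 2d
+// case, plus a trailing dimension of up to FOUR batch entries for the 3d
+// case.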
+struct FloatMatrix_2d {
+  float data[TILE_ROWS][FOUR];
+};
+
+struct FloatMatrix_3d {
+  float data[TILE_ROWS][FOUR][FOUR];
+};
+
 // The functions in this file assume that some variables have been defined as
 // descriptors, such as t_mat1, t_qmat2, t_scales, etc.
 
@@ -77,6 +95,101 @@ VEC4_T q_8w_linear(const ivec3 out_pos, const int K) {
   return outtex;
 }
 
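+// Computes a TILE_ROWS x 4 tile of the output starting at out_pos: each
+// output element is a dot product along the K dimension between a row of
+// t_mat1 and a column of the 8-bit weight tensor t_qmat2, scaled by the
+// per-output-channel dequantization scales in t_scales.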
+FloatMatrix_2d q_8w_linear_optimized_2d(const ivec3 out_pos, const int K) {
+  FloatMatrix_2d results;
+
+  VEC4_T im_mat1_partial_load[TILE_ROWS];
+  ivec4 im_mat2_partial_load[FOUR];
+
+  // Zero-initialize the accumulator tile.
+  [[unroll]] for (int i = 0; i < TILE_ROWS; i++) {
+    [[unroll]] for (int j = 0; j < FOUR; j++) {
+      results.data[i][j] = 0.0f;
+    }
+  }
+
+  for (int mat1_x = 0; mat1_x < K; mat1_x++) {
+    [[unroll]] for (int offset = 0; offset < TILE_ROWS; offset++) {
+      // read and cache 2x4 (or 4x4) tile of im_mat1
+      const int mat1_y = (TILE_ROWS * out_pos.y) + offset;
+      const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, 0);
+      im_mat1_partial_load[offset] = load_texel(t_mat1, mat1_pos);
+    }
+    [[unroll]] for (int offset = 0; offset < FOUR; offset++) {
+      // read and cache 4x4 tile of im_mat2
+      const int mat2_y = (FOUR * out_pos.x) + offset;
+      const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0);
+      im_mat2_partial_load[offset] = load_texel(t_qmat2, mat2_pos);
+    }
+
+    [[unroll]] for (int out_row = 0; out_row < TILE_ROWS; out_row++) {
+      [[unroll]] for (int out_col = 0; out_col < FOUR; out_col++) {
+        results.data[out_row][out_col] +=
+            dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]);
+      }
+    }
+  }
+
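+  // Apply the per-output-channel dequantization scales to the accumulated
+  // tile.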
+  const VEC4_T scales = load_texel(t_scales, ivec3(out_pos.x, 0, 0));
+  [[unroll]] for (int i = 0; i < TILE_ROWS; i++) {
+    [[unroll]] for (int j = 0; j < FOUR; j++) {
+      results.data[i][j] *= scales[j];
+    }
+  }
+  return results;
+}
+
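+// Batched variant: out_pos.z selects a group of up to FOUR batch entries, and
+// the trailing dimension of the result tile indexes the entry within that
+// group.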
+FloatMatrix_3d q_8w_linear_optimized_3d(
+    const ivec3 out_pos,
+    const int K,
+    const int batch_size) {
+  FloatMatrix_3d results;
+
+  [[unroll]] for (int i = 0; i < TILE_ROWS; i++) {
+    [[unroll]] for (int j = 0; j < FOUR; j++) {
+      [[unroll]] for (int k = 0; k < FOUR; k++) {
+        results.data[i][j][k] = 0.0f;
+      }
+    }
+  }
+
+  VEC4_T im_mat1_partial_load[TILE_ROWS];
+  ivec4 im_mat2_partial_load[FOUR];
+
+  const VEC4_T scales = load_texel(t_scales, ivec3(out_pos.x, 0, 0));
+
+  for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) {
+    // Each invocation handles at most FOUR batch entries; stop early on the
+    // tail group when batch_size is not a multiple of FOUR. Bounding the loop
+    // by FOUR (not batch_size) also keeps batch_idx inside results.data.
+    if (FOUR * out_pos.z + batch_idx >= batch_size) {
+      break;
+    }
+    int mat_z = FOUR * out_pos.z + batch_idx;
+    for (int mat1_x = 0; mat1_x < K; mat1_x++) {
+      [[unroll]] for (int offset = 0; offset < TILE_ROWS; offset++) {
+        // read and cache 2x4 (or 4x4) tile of im_mat1
+        const int mat1_y = (TILE_ROWS * out_pos.y) + offset;
+        const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, mat_z);
+        im_mat1_partial_load[offset] = load_texel(t_mat1, mat1_pos);
+      }
+
+      [[unroll]] for (int offset = 0; offset < FOUR; offset++) {
+        // read and cache 4x4 tile of im_mat2
+        const int mat2_y = (FOUR * out_pos.x) + offset;
+        const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0);
+        im_mat2_partial_load[offset] = load_texel(t_qmat2, mat2_pos);
+      }
+
+      [[unroll]] for (int out_row = 0; out_row < TILE_ROWS; out_row++) {
+        [[unroll]] for (int out_col = 0; out_col < FOUR; out_col++) {
+          results.data[out_row][out_col][batch_idx] +=
+              dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]);
+        }
+      }
+    }
+
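+    // Apply the per-output-channel dequantization scales to this batch
+    // entry's tile.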
+    [[unroll]] for (int i = 0; i < TILE_ROWS; i++) {
+      [[unroll]] for (int j = 0; j < FOUR; j++) {
+        results.data[i][j][batch_idx] *= scales[j];
+      }
+    }
+  }
+  return results;
+}
+
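+// Note: these functions only return the accumulated tiles; writing each row
+// of results.data back to the output image is left to the calling shader's
+// main() and is not part of this header.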
 #endif // USING_BUFFER
 
 #endif // Q_LINEAR_H