@@ -57,35 +57,36 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {

    const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_v);

    T sum = 0.0f;

#pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
+    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
        const int k_KQ = k_KQ_0 + threadIdx.x;

        const int ib    = k_KQ / QI8_1;
        const int iqs4  = k_KQ % QI4_0;
        const int shift = k_KQ & (QI8_1/2);

        const int v = (get_int_b2(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
-        const int u = Q_q8[k_KQ_0/WARP_SIZE];
+        const int u = Q_q8[k_KQ_0/warp_size];

        const int sumi = ggml_cuda_dp4a(v, u, 0);

#ifdef FP16_AVAILABLE
        if (std::is_same<T, half>::value) {
            const half2 * Q_ds = (const half2 *) Q_ds_v;

-            const half2 sum2 = __half2half2(K_q4_0[ib].d) * Q_ds[k_KQ_0/WARP_SIZE];
+            const half2 sum2 = __half2half2(K_q4_0[ib].d) * Q_ds[k_KQ_0/warp_size];
            sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2) /* *8/QI8_1 == 1 */);
        } else
#endif // FP16_AVAILABLE
        {
            const float2 * Q_ds = (const float2 *) Q_ds_v;

-            sum += (T) (__half2float(K_q4_0[ib].d) * (sumi*Q_ds[k_KQ_0/WARP_SIZE].x - (8/QI8_1)*Q_ds[k_KQ_0/WARP_SIZE].y));
+            sum += (T) (__half2float(K_q4_0[ib].d) * (sumi*Q_ds[k_KQ_0/warp_size].x - (8/QI8_1)*Q_ds[k_KQ_0/warp_size].y));
        }
    }

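The same substitution of the compile-time WARP_SIZE macro by a per-target warp_size constant repeats in the remaining KQ dot-product kernels below. For orientation only (not part of this diff), a device-side helper like ggml_cuda_get_physical_warp_size() can be expected to reduce to a compile-time constant per compilation target; a minimal sketch under that assumption:

// Hypothetical sketch of the helper used above; the real definition lives in
// ggml-cuda/common.cuh and the exact preprocessor guards here are assumptions.
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
#if defined(GGML_USE_HIP) && (defined(__GFX8__) || defined(__GFX9__))
    return 64;   // GCN/CDNA wavefronts execute 64 lanes
#else
    return 32;   // NVIDIA warps (and RDNA wavefronts) execute 32 lanes
#endif
}

Because the result is a compile-time constant, the #pragma unroll loops above can still fully unroll after the switch from WARP_SIZE to warp_size.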
@@ -97,37 +98,38 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {

    const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_v);

    T sum = 0.0f;

#pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
+    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
        const int k_KQ = k_KQ_0 + threadIdx.x;

        const int ib    = k_KQ / QI8_1;
        const int iqs4  = k_KQ % QI4_1;
        const int shift = k_KQ & (QI8_1/2);

        const int v = (get_int_b4(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
-        const int u = Q_q8[k_KQ_0/WARP_SIZE];
+        const int u = Q_q8[k_KQ_0/warp_size];

        const int sumi = ggml_cuda_dp4a(v, u, 0);

#ifdef FP16_AVAILABLE
        if (std::is_same<T, half>::value) {
            const half2 * Q_ds = (const half2 *) Q_ds_v;

-            const half2 d4d8_m4s8 = K_q4_1[ib].dm * Q_ds[k_KQ_0/WARP_SIZE];
+            const half2 d4d8_m4s8 = K_q4_1[ib].dm * Q_ds[k_KQ_0/warp_size];
            const half2 sumid4d8_m4s8scaled = d4d8_m4s8 * make_half2(sumi, 1.0f/QI8_1);
            sum += (T) (__low2half(sumid4d8_m4s8scaled) + __high2half(sumid4d8_m4s8scaled));
        } else
#endif // FP16_AVAILABLE
        {
            const float2 * Q_ds = (const float2 *) Q_ds_v;

-            const float sumid4d8   =  __low2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].x * sumi;
-            const float m4s8scaled = __high2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].y / QI8_1;
+            const float sumid4d8   =  __low2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/warp_size].x * sumi;
+            const float m4s8scaled = __high2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/warp_size].y / QI8_1;

            sum += (T) (sumid4d8 + m4s8scaled);
        }
@@ -141,12 +143,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {

    const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_v);

    T sum = 0.0f;

#pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
+    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
        const int k_KQ = k_KQ_0 + threadIdx.x;

        const int ib = k_KQ / QI8_1;
@@ -161,22 +164,22 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
        v |= (vh << 18) & 0x00100000; // 2 -> 20
        v |= (vh << 25) & 0x10000000; // 3 -> 28

-        const int u = Q_q8[k_KQ_0/WARP_SIZE];
+        const int u = Q_q8[k_KQ_0/warp_size];

        const int sumi = ggml_cuda_dp4a(v, u, 0);

#ifdef FP16_AVAILABLE
        if (std::is_same<T, half>::value) {
            const half2 * Q_ds = (const half2 *) Q_ds_v;

-            const half2 sum2 = __half2half2(K_q5_0[ib].d) * Q_ds[k_KQ_0/WARP_SIZE];
+            const half2 sum2 = __half2half2(K_q5_0[ib].d) * Q_ds[k_KQ_0/warp_size];
            sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2)*__float2half(2.0f)) /* *16/QI8_1 == 2 */;
        } else
#endif // FP16_AVAILABLE
        {
            const float2 * Q_ds = (const float2 *) Q_ds_v;

-            sum += (T) (__half2float(K_q5_0[ib].d) * (sumi*Q_ds[k_KQ_0/WARP_SIZE].x - (16/QI8_1)*Q_ds[k_KQ_0/WARP_SIZE].y));
+            sum += (T) (__half2float(K_q5_0[ib].d) * (sumi*Q_ds[k_KQ_0/warp_size].x - (16/QI8_1)*Q_ds[k_KQ_0/warp_size].y));
        }
    }

@@ -188,12 +191,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {

    const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_v);

    T sum = 0.0f;

#pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
+    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
        const int k_KQ = k_KQ_0 + threadIdx.x;

        const int ib = k_KQ / QI8_1;
@@ -208,24 +212,24 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
        v |= (vh << 18) & 0x00100000; // 2 -> 20
        v |= (vh << 25) & 0x10000000; // 3 -> 28

-        const int u = Q_q8[k_KQ_0/WARP_SIZE];
+        const int u = Q_q8[k_KQ_0/warp_size];

        const int sumi = ggml_cuda_dp4a(v, u, 0);

#ifdef FP16_AVAILABLE
        if (std::is_same<T, half>::value) {
            const half2 * Q_ds = (const half2 *) Q_ds_v;

-            const half2 d5d8_m5s8 = K_q5_1[ib].dm * Q_ds[k_KQ_0/WARP_SIZE];
+            const half2 d5d8_m5s8 = K_q5_1[ib].dm * Q_ds[k_KQ_0/warp_size];
            const half2 sumid5d8_m5s8scaled = d5d8_m5s8 * make_half2(sumi, 1.0f/QI8_1);
            sum += (T) (__low2half(sumid5d8_m5s8scaled) + __high2half(sumid5d8_m5s8scaled));
        } else
#endif // FP16_AVAILABLE
        {
            const float2 * Q_ds = (const float2 *) Q_ds_v;

-            const float sumid5d8   =  __low2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].x * sumi;
-            const float m5s8scaled = __high2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].y / QI8_1;
+            const float sumid5d8   =  __low2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/warp_size].x * sumi;
+            const float m5s8scaled = __high2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/warp_size].y / QI8_1;

            sum += (T) (sumid5d8 + m5s8scaled);
        }
@@ -239,12 +243,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {

    const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_v);

    T sum = 0.0f;

#pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
+    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += warp_size) {
        const int k_KQ = k_KQ_0 + threadIdx.x;

        const int ib = k_KQ / QI8_0;
@@ -255,13 +260,13 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
        T Q_d;
        if (std::is_same<T, half>::value) {
            const half2 * Q_ds = (const half2 *) Q_ds_v;
-            Q_d = __low2half(Q_ds[k_KQ_0/WARP_SIZE]);
+            Q_d = __low2half(Q_ds[k_KQ_0/warp_size]);
        } else {
            const float2 * Q_ds = (const float2 *) Q_ds_v;
-            Q_d = Q_ds[k_KQ_0/WARP_SIZE].x;
+            Q_d = Q_ds[k_KQ_0/warp_size].x;
        }

-        sum += vec_dot_q8_0_q8_1_impl<T, 1>(&v, &Q_q8[k_KQ_0/WARP_SIZE], K_q8_0[ib].d, Q_d);
+        sum += vec_dot_q8_0_q8_1_impl<T, 1>(&v, &Q_q8[k_KQ_0/warp_size], K_q8_0[ib].d, Q_d);
    }

    return sum;
@@ -272,6 +277,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {

    const half2 * K_h2 = (const half2 *) K_c;
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);

@@ -282,11 +288,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
        half2 sum2 = make_half2(0.0f, 0.0f);

#pragma unroll
-        for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
+        for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += warp_size) {
            const int k_KQ = k_KQ_0 + threadIdx.x;

            const half2 K_ik = K_h2[k_KQ];
-            sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
+            sum2 += K_ik * Q_h2[k_KQ_0/warp_size];
        }

        return __low2half(sum2) + __high2half(sum2);
@@ -298,12 +304,12 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
    float sum = 0.0f;

#pragma unroll
-    for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
+    for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += warp_size) {
        const int k_KQ = k_KQ_0 + threadIdx.x;

        const half2 K_ik = K_h2[k_KQ];
-        sum += __low2float(K_ik) * Q_f2[k_KQ_0/WARP_SIZE].x;
-        sum += __high2float(K_ik) * Q_f2[k_KQ_0/WARP_SIZE].y;
+        sum += __low2float(K_ik) * Q_f2[k_KQ_0/warp_size].x;
+        sum += __high2float(K_ik) * Q_f2[k_KQ_0/warp_size].y;
    }

    return sum;
@@ -698,6 +704,8 @@ void launch_fattn(

    GGML_ASSERT(Q->ne[3] == 1);

+    const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
+
    ggml_cuda_pool & pool = ctx.pool();
    cudaStream_t main_stream = ctx.stream();
    const int id = ggml_cuda_get_device();
@@ -750,7 +758,7 @@
    const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1);
    const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3];

-    const dim3 block_dim(WARP_SIZE, nwarps, 1);
+    const dim3 block_dim(warp_size, nwarps, 1);
    dim3 blocks_num;
    if (parallel_blocks == 0) {
        // For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
@@ -796,6 +804,8 @@
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

+    GGML_ASSERT(block_dim.x % warp_size == 0);
+    GGML_ASSERT(!GGML_CUDA_CC_IS_AMD(cc) || block_dim.x * block_dim.y <= 4 * (unsigned int)warp_size);
    fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
        (const char *) Q->data,
        K_data,
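On the host side, warp_size is read from the per-device info table rather than from the compile-time constant used in the kernels. For illustration only (an assumption, not part of this diff), such a per-device value can be obtained from the CUDA/HIP runtime roughly like this:

// Hypothetical sketch of how a per-device warp size, such as the value behind
// ggml_cuda_info().devices[ctx.device].warp_size, might be queried.
#include <cuda_runtime.h>

static int get_device_warp_size(int device) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    return prop.warpSize; // 32 on NVIDIA; 64 on AMD GCN/CDNA when built with HIP
}

Together with the two assertions added above, every launched block is a whole number of warps, and on AMD a block is capped at 4 * warp_size threads (four wavefronts).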