@@ -80,192 +80,227 @@ typedef struct {
80
80
} block_q8_0;
81
81
static_assert (sizeof (block_q8_0) == sizeof(float ) + QK8_0, "wrong q8_0 block size/padding");
82
82
83
// Dequantize q4_0 blocks (two 4-bit values per byte, shared scale d,
// values offset by -8). One thread expands one quantized block into
// QK4_0 floats. k is the number of float elements in y (a multiple of
// QK4_0); the grid may be rounded up past the last block.
static __global__ void dequantize_block_q4_0(const void * vx, float * y, int k) {
    const block_q4_0 * x = (const block_q4_0 *) vx;

    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    // Guard against the number of quantized blocks, not raw elements:
    // comparing the block index i to the element count k lets the
    // rounded-up grid tail read and write out of bounds.
    if (i < k/QK4_0) {
        const float d = x[i].d;

        const uint8_t * pp = x[i].qs;

        for (int l = 0; l < QK4_0; l += 2) {
            const uint8_t vi = pp[l/2];

            const int8_t vi0 = vi & 0xf; // low nibble
            const int8_t vi1 = vi >> 4;  // high nibble

            const float v0 = (vi0 - 8)*d;
            const float v1 = (vi1 - 8)*d;

            y[i*QK4_0 + l + 0] = v0;
            y[i*QK4_0 + l + 1] = v1;
        }
    }
}
105
107
106
// Dequantize q4_1 blocks (two 4-bit values per byte, shared scale d and
// minimum m: value = q*d + m). One thread expands one quantized block
// into QK4_1 floats. k is the number of float elements in y (a multiple
// of QK4_1); the grid may be rounded up past the last block.
static __global__ void dequantize_block_q4_1(const void * vx, float * y, int k) {
    const block_q4_1 * x = (const block_q4_1 *) vx;

    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    // Guard against the number of quantized blocks, not raw elements:
    // comparing the block index i to the element count k lets the
    // rounded-up grid tail read and write out of bounds.
    if (i < k/QK4_1) {
        const float d = x[i].d;
        const float m = x[i].m;

        const uint8_t * pp = x[i].qs;

        for (int l = 0; l < QK4_1; l += 2) {
            const uint8_t vi = pp[l/2];

            const int8_t vi0 = vi & 0xf; // low nibble
            const int8_t vi1 = vi >> 4;  // high nibble

            const float v0 = vi0*d + m;
            const float v1 = vi1*d + m;

            y[i*QK4_1 + l + 0] = v0;
            y[i*QK4_1 + l + 1] = v1;
        }
    }
}
129
133
130
// Dequantize q4_2 blocks (same layout as q4_0 but smaller block size
// QK4_2 and fp16 scale). One thread expands one quantized block into
// QK4_2 floats. k is the number of float elements in y (a multiple of
// QK4_2); the grid may be rounded up past the last block.
static __global__ void dequantize_block_q4_2(const void * vx, float * y, int k) {
    const block_q4_2 * x = (const block_q4_2 *) vx;

    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    // Guard against the number of quantized blocks, not raw elements:
    // comparing the block index i to the element count k lets the
    // rounded-up grid tail read and write out of bounds.
    if (i < k/QK4_2) {
        const float d = x[i].d;

        const uint8_t * pp = x[i].qs;

        for (int l = 0; l < QK4_2; l += 2) {
            const uint8_t vi = pp[l/2];

            const int8_t vi0 = vi & 0xf; // low nibble
            const int8_t vi1 = vi >> 4;  // high nibble

            const float v0 = (vi0 - 8)*d;
            const float v1 = (vi1 - 8)*d;

            y[i*QK4_2 + l + 0] = v0;
            y[i*QK4_2 + l + 1] = v1;
        }
    }
}
152
158
153
// Dequantize q5_0 blocks (4-bit low nibbles plus one high bit per value
// packed in qh, shared scale d, values offset by -16). One thread
// expands one quantized block into QK5_0 floats. k is the number of
// float elements in y (a multiple of QK5_0); the grid may be rounded up
// past the last block.
static __global__ void dequantize_block_q5_0(const void * vx, float * y, int k) {
    const block_q5_0 * x = (const block_q5_0 *) vx;

    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    // Guard against the number of quantized blocks, not raw elements:
    // comparing the block index i to the element count k lets the
    // rounded-up grid tail read and write out of bounds.
    if (i < k/QK5_0) {
        const float d = x[i].d;

        const uint8_t * pp = x[i].qs;

        // qh may be unaligned inside the packed block; copy it out.
        uint32_t qh;
        memcpy(&qh, x[i].qh, sizeof(qh));

        for (int l = 0; l < QK5_0; l += 2) {
            const uint8_t vi = pp[l/2];

            // Fifth (high) bit of each of the two values, moved to bit 4.
            const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
            const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;

            const int8_t vi0 = ((vi & 0xf) | vh0);
            const int8_t vi1 = ((vi >>  4) | vh1);

            const float v0 = (vi0 - 16)*d;
            const float v1 = (vi1 - 16)*d;

            y[i*QK5_0 + l + 0] = v0;
            y[i*QK5_0 + l + 1] = v1;
        }
    }
}
181
189
182
// Dequantize q5_1 blocks (4-bit low nibbles plus one high bit per value
// packed in qh, shared scale d and minimum m: value = q*d + m). One
// thread expands one quantized block into QK5_1 floats. k is the number
// of float elements in y (a multiple of QK5_1); the grid may be rounded
// up past the last block.
static __global__ void dequantize_block_q5_1(const void * vx, float * y, int k) {
    const block_q5_1 * x = (const block_q5_1 *) vx;

    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    // Guard against the number of quantized blocks, not raw elements:
    // comparing the block index i to the element count k lets the
    // rounded-up grid tail read and write out of bounds.
    if (i < k/QK5_1) {
        const float d = x[i].d;
        const float m = x[i].m;

        const uint8_t * pp = x[i].qs;

        // qh may be unaligned inside the packed block; copy it out.
        uint32_t qh;
        memcpy(&qh, x[i].qh, sizeof(qh));

        for (int l = 0; l < QK5_1; l += 2) {
            const uint8_t vi = pp[l/2];

            // Fifth (high) bit of each of the two values, moved to bit 4.
            const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
            const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;

            const int8_t vi0 = (vi & 0xf) | vh0;
            const int8_t vi1 = (vi >>  4) | vh1;

            const float v0 = vi0*d + m;
            const float v1 = vi1*d + m;

            y[i*QK5_1 + l + 0] = v0;
            y[i*QK5_1 + l + 1] = v1;
        }
    }
}
211
221
212
// Dequantize q8_0 blocks (one signed 8-bit value per weight, shared
// scale d). One thread expands one quantized block into QK8_0 floats.
// k is the number of float elements in y (a multiple of QK8_0); the
// grid may be rounded up past the last block.
static __global__ void dequantize_block_q8_0(const void * vx, float * y, int k) {
    const block_q8_0 * x = (const block_q8_0 *) vx;

    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    // Guard against the number of quantized blocks, not raw elements:
    // comparing the block index i to the element count k lets the
    // rounded-up grid tail read and write out of bounds.
    if (i < k/QK8_0) {
        const float d = x[i].d;

        const int8_t * pp = x[i].qs;

        for (int l = 0; l < QK8_0; l++) {
            const int8_t vi = pp[l];

            y[i*QK8_0 + l] = vi*d;
        }
    }
}
227
239
228
240
// Dequantize k q4_0-quantized values from vx into y on the given stream.
// k must be a multiple of QK4_0. The block size is chosen with the
// occupancy API; the kernel bounds-checks the rounded-up grid tail.
static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    const int nb = k / QK4_0;
    int min_grid_size = 0, block_size = 1; // Initialized to suppress compiler warnings.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_0, 0, 0));
    const int grid_size = (nb + block_size - 1) / block_size; // Round up.
    dequantize_block_q4_0<<<grid_size, block_size, 0, stream>>>(vx, y, k);
    CUDA_CHECK(cudaGetLastError()); // Surface launch-configuration errors immediately.
}
232
247
233
248
// Dequantize k q4_1-quantized values from vx into y on the given stream.
// k must be a multiple of QK4_1. The block size is chosen with the
// occupancy API; the kernel bounds-checks the rounded-up grid tail.
static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    const int nb = k / QK4_1;
    int min_grid_size = 0, block_size = 1; // Initialized to suppress compiler warnings.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_1, 0, 0));
    const int grid_size = (nb + block_size - 1) / block_size; // Round up.
    dequantize_block_q4_1<<<grid_size, block_size, 0, stream>>>(vx, y, k);
    CUDA_CHECK(cudaGetLastError()); // Surface launch-configuration errors immediately.
}
237
255
238
256
// Dequantize k q4_2-quantized values from vx into y on the given stream.
// k must be a multiple of QK4_2. The block size is chosen with the
// occupancy API; the kernel bounds-checks the rounded-up grid tail.
static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    const int nb = k / QK4_2;
    int min_grid_size = 0, block_size = 1; // Initialized to suppress compiler warnings.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_2, 0, 0));
    const int grid_size = (nb + block_size - 1) / block_size; // Round up.
    dequantize_block_q4_2<<<grid_size, block_size, 0, stream>>>(vx, y, k);
    CUDA_CHECK(cudaGetLastError()); // Surface launch-configuration errors immediately.
}
242
263
243
264
// Dequantize k q5_0-quantized values from vx into y on the given stream.
// k must be a multiple of QK5_0. The block size is chosen with the
// occupancy API; the kernel bounds-checks the rounded-up grid tail.
static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    const int nb = k / QK5_0;
    int min_grid_size = 0, block_size = 1; // Initialized to suppress compiler warnings.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q5_0, 0, 0));
    const int grid_size = (nb + block_size - 1) / block_size; // Round up.
    dequantize_block_q5_0<<<grid_size, block_size, 0, stream>>>(vx, y, k);
    CUDA_CHECK(cudaGetLastError()); // Surface launch-configuration errors immediately.
}
247
271
248
272
// Dequantize k q5_1-quantized values from vx into y on the given stream.
// k must be a multiple of QK5_1. The block size is chosen with the
// occupancy API; the kernel bounds-checks the rounded-up grid tail.
static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    const int nb = k / QK5_1;
    int min_grid_size = 0, block_size = 1; // Initialized to suppress compiler warnings.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q5_1, 0, 0));
    const int grid_size = (nb + block_size - 1) / block_size; // Round up.
    dequantize_block_q5_1<<<grid_size, block_size, 0, stream>>>(vx, y, k);
    CUDA_CHECK(cudaGetLastError()); // Surface launch-configuration errors immediately.
}
252
279
253
280
// Dequantize k q8_0-quantized values from vx into y on the given stream.
// k must be a multiple of QK8_0. The block size is chosen with the
// occupancy API; the kernel bounds-checks the rounded-up grid tail.
static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    const int nb = k / QK8_0;
    int min_grid_size = 0, block_size = 1; // Initialized to suppress compiler warnings.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q8_0, 0, 0));
    const int grid_size = (nb + block_size - 1) / block_size; // Round up.
    dequantize_block_q8_0<<<grid_size, block_size, 0, stream>>>(vx, y, k);
    CUDA_CHECK(cudaGetLastError()); // Surface launch-configuration errors immediately.
}
257
287
258
288
// TODO: optimize
// Convert k half-precision values from vx into float values in y,
// one element per thread. The launch rounds the grid up, so the
// bounds check is required.
static __global__ void convert_fp16_to_fp32(const void * vx, float * y, int k) {
    const half * x = (const half *) vx;

    // BUG FIX: this kernel indexed by blockIdx.x alone while its host
    // wrapper launches blockDim.x threads per block, so only the first
    // grid_size elements were converted and the rest of y was left
    // unwritten. Use the standard flat global thread index instead.
    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    if (i < k) {
        y[i] = __half2float(x[i]);
    }
}
266
298
267
299
// Convert k fp16 values from x into fp32 values in y on the given
// stream. The block size is chosen with the occupancy API; the kernel
// bounds-checks the rounded-up grid tail.
static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) {
    int min_grid_size = 0, block_size = 1; // Initialized to suppress compiler warnings.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, convert_fp16_to_fp32, 0, 0));
    const int grid_size = (k + block_size - 1) / block_size; // Round up.
    convert_fp16_to_fp32<<<grid_size, block_size, 0, stream>>>(x, y, k);
    CUDA_CHECK(cudaGetLastError()); // Surface launch-configuration errors immediately.
}
270
305
271
306
static to_fp32_cuda_t ggml_get_to_fp32_cuda (ggml_type type) {
0 commit comments