-
Notifications
You must be signed in to change notification settings - Fork 12.2k
cuda: Add Q5_1, Q5_0, Q4_1 and Q4_0 to F32 conversion support. (#10976) #12000
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
ggml/src/ggml-cuda/cpy.cu
Outdated
static void ggml_cpy_q5_1_f32_cuda( | ||
const char * cx, char * cdst, const int ne, | ||
const int ne00, const int ne01, const int ne02, | ||
const int nb00, const int nb01, const int nb02, | ||
const int nb03, const int ne10, const int ne11, const int ne12, | ||
const int nb10, const int nb11, const int nb12, const int nb13, | ||
cudaStream_t stream) { | ||
const int num_blocks = ne; | ||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>( | ||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, | ||
ne10, ne11, ne12, nb10, nb11, nb12, nb13); | ||
} | ||
|
||
static void ggml_cpy_q5_0_f32_cuda( | ||
const char * cx, char * cdst, const int ne, | ||
const int ne00, const int ne01, const int ne02, | ||
const int nb00, const int nb01, const int nb02, | ||
const int nb03, const int ne10, const int ne11, const int ne12, | ||
const int nb10, const int nb11, const int nb12, const int nb13, | ||
cudaStream_t stream) { | ||
const int num_blocks = ne; | ||
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>( | ||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, | ||
ne10, ne11, ne12, nb10, nb11, nb12, nb13); | ||
} | ||
|
||
static void ggml_cpy_q4_1_f32_cuda( | ||
const char * cx, char * cdst, const int ne, | ||
const int ne00, const int ne01, const int ne02, | ||
const int nb00, const int nb01, const int nb02, | ||
const int nb03, const int ne10, const int ne11, const int ne12, | ||
const int nb10, const int nb11, const int nb12, const int nb13, | ||
cudaStream_t stream) { | ||
const int num_blocks = ne; | ||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>( | ||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, | ||
ne10, ne11, ne12, nb10, nb11, nb12, nb13); | ||
} | ||
|
||
static void ggml_cpy_q4_0_f32_cuda( | ||
const char * cx, char * cdst, const int ne, | ||
const int ne00, const int ne01, const int ne02, | ||
const int nb00, const int nb01, const int nb02, | ||
const int nb03, const int ne10, const int ne11, const int ne12, | ||
const int nb10, const int nb11, const int nb12, const int nb13, | ||
cudaStream_t stream) { | ||
const int num_blocks = ne; | ||
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>( | ||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, | ||
ne10, ne11, ne12, nb10, nb11, nb12, nb13); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please keep the order of quants consistent. The order that I usually use for CUDA code is q4_0, q4_1, q5_0, q5_1, q8_0.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some of the existing code doesn't respect this order. I think it's better to not clean that (the existing code) up in the same patch though, as it would just add noise for reviewing. Can be done in a follow-up if you want.
@JohannesGaessler Incorporated your comments, plus reorder the newly added functions as requested. The order in the file is not totally consistent now, but as said above I think it's better to address that in a follow-up. |
Using templates and reusing the
dequant_qX_Y
functions.