
Commit 1ec2080

llava: add quantization for the visual projector LLAVA, Qwen2VL (#11644)
* Added quantization for the visual projector
* Added README
* Fixed the clip quantize implementation in the file
* Fixed the gcc warning regarding minor linting
* Removed trailing whitespace
1 parent 9f4cc8f commit 1ec2080

File tree: 4 files changed, +113 −5 lines

examples/llava/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
```diff
@@ -50,3 +50,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-llava-clip-quantize-cli)
+add_executable(${TARGET} clip-quantize-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
```
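With these additions, the new tool should build through the usual llama.cpp CMake flow, e.g. `cmake --build build --target llama-llava-clip-quantize-cli` (assuming the project was configured into a `build` directory with the examples enabled); the resulting binary is the one invoked in the README below.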

examples/llava/README-quantize.md

Lines changed: 44 additions & 0 deletions
# Quantizing CLIP Visual Projector

This is the tool for quantizing the CLIP visual projector model. Quantization reduces the precision of the model's weights, which can significantly decrease model size and improve inference speed, often with minimal impact on quality.

## Usage

To quantize a CLIP visual projector model, use the following command:

```sh
./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf <type>
```

After quantization, the visual projector can be used freely with the existing LLAVA CLI tools (LLAVA, Qwen2VL, etc.).

### Arguments

- `/path/to/ggml-model-f32.gguf`: The path to the input model file in FP32 or FP16 format.
- `/path/to/ggml-model-quantized.gguf`: The path where the quantized model will be saved.
- `<type>`: The quantization type to apply, given as the integer corresponding to one of the quantization types defined in `enum ggml_type`.

### Quantization Types

The following quantization types are supported, based on the `enum ggml_type` definition:

- `2` - `q4_0`: 4-bit quantization with a single scale per block.
- `3` - `q4_1`: 4-bit quantization with a scale and a minimum value per block.
- `6` - `q5_0`: 5-bit quantization with a single scale per block.
- `7` - `q5_1`: 5-bit quantization with a scale and a minimum value per block.
- `8` - `q8_0`: 8-bit quantization with a single scale per block.

### Example

To quantize a model using the `q4_0` quantization type, run:

```sh
./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf 2
```

This command will generate a quantized model at `/path/to/ggml-model-quantized.gguf` using the `q4_0` quantization method.

## Notes

- Quantization can lead to a loss in model accuracy, depending on the chosen quantization type. It is recommended to evaluate the quantized model's performance on your specific task to ensure it meets your requirements.
- The quantized model will typically be smaller and faster to run, making it better suited for deployment in resource-constrained environments.
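For programmatic use, the same entry point the CLI wraps, `clip_model_quantize`, can be called directly. A minimal sketch, assuming `clip.h` is on the include path, the program is linked like the CLI target above, and the placeholder file names `mmproj-f32.gguf` / `mmproj-q4_0.gguf`:

```cpp
// Minimal sketch: quantize a CLIP visual projector through the library
// API rather than the CLI. The .gguf file names here are placeholders.
#include <cstdio>

#include "clip.h"

int main() {
    const int itype = 2; // 2 == q4_0 in enum ggml_type (see the table above)

    if (!clip_model_quantize("mmproj-f32.gguf", "mmproj-q4_0.gguf", itype)) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
```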

examples/llava/clip-quantize-cli.cpp

Lines changed: 59 additions & 0 deletions
```cpp
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"

static void print_usage(int argc, char ** argv) {
    (void) argc;

    fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]);
    fprintf(stderr, "  type = 2 - q4_0\n");
    fprintf(stderr, "  type = 3 - q4_1\n");
    fprintf(stderr, "  type = 6 - q5_0\n");
    fprintf(stderr, "  type = 7 - q5_1\n");
    fprintf(stderr, "  type = 8 - q8_0\n");
}

int main(int argc, char ** argv) {
    if (argc != 4) {
        print_usage(argc, argv);
        return 1;
    }

    const std::string fname_inp = argv[1];
    const std::string fname_out = argv[2];

    const int itype = atoi(argv[3]);

    const int64_t t_main_start_us = ggml_time_us();

    int64_t t_quantize_us = 0;

    // load and quantize the model
    {
        const int64_t t_start_us = ggml_time_us();

        if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }

        t_quantize_us = ggml_time_us() - t_start_us;
    }

    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();

        printf("\n");
        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f);
        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
    }

    return 0;
}
```
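The tool forwards whatever integer the user supplies; inside the library, `clip_model_quantize` only asserts `itype < GGML_TYPE_COUNT`. A hedged sketch of a stricter front-end check (a hypothetical helper, not part of this commit) that accepts only the types the usage text advertises:

```cpp
// Hypothetical stricter validation (not part of this commit): accept only
// the quantization types that print_usage advertises.
static bool is_supported_itype(int itype) {
    switch (itype) {
        case 2: // q4_0
        case 3: // q4_1
        case 6: // q5_0
        case 7: // q5_1
        case 8: // q8_0
            return true;
        default:
            return false;
    }
}
```

`main` could then call `print_usage` and exit early when `is_supported_itype(itype)` is false, rather than relying on the assertion inside the library.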

examples/llava/clip.cpp

Lines changed: 3 additions & 5 deletions
```diff
@@ -2745,10 +2745,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 }
 
 bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
-    ggml_type type = GGML_TYPE_Q4_1;
-
     assert(itype < GGML_TYPE_COUNT);
-    type = static_cast<ggml_type>(itype);
+    ggml_type type = static_cast<ggml_type>(itype);
 
     auto * ctx_clip = clip_model_load(fname_inp, 2);
 
@@ -2801,8 +2799,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             }
         }
 
-        // quantize only 2D tensors
-        quantize &= (ggml_n_dims(cur) == 2);
+        // quantize only 2D tensors whose rows are larger than the block size
+        quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);
 
         if (quantize) {
             new_type = type;
```
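The new guard exists because ggml's quantized types pack weights into fixed-size blocks along a tensor's first dimension (for example, 32 elements per block for `q4_0`); a row shorter than one block cannot be quantized. A sketch of the same check in isolation, assuming a loaded `ggml_tensor * cur`:

```cpp
// Sketch of the guard above in isolation. ggml packs quantized weights
// into fixed-size blocks along ne[0], so only 2D tensors whose rows
// exceed one block are quantized; everything else keeps its type.
#include "ggml.h"

static bool can_quantize(const struct ggml_tensor * cur, enum ggml_type type) {
    const bool is_2d         = ggml_n_dims(cur) == 2;
    const bool row_fits_blck = cur->ne[0] > ggml_blck_size(type); // 32 for q4_0
    return is_2d && row_fits_blck;
}
```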
