Skip to content

[gguf] Add descriptions to quantization types #615

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/gguf/src/gguf.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { GGUFValueType } from "./types";

export type { MetadataBaseValue, MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types";
export { GGUFValueType, GGMLQuantizationType } from "./types";
export { QUANT_DESCRIPTIONS } from "./quant-descriptions";

/** Matches any string that ends with the `.gguf` extension. */
export const RE_GGUF_FILE = /\.gguf$/;
/**
 * Matches any string that ends with `-<5 digits>-of-<5 digits>.gguf`.
 * Both five-digit groups are captured (presumably the shard index and the total shard count — confirm against callers).
 */
export const RE_GGUF_SHARD_FILE = /-(\d{5})-of-(\d{5})\.gguf$/;
Expand Down
98 changes: 98 additions & 0 deletions packages/gguf/src/quant-descriptions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import { GGMLQuantizationType } from "./types";

/**
 * Human-readable descriptions of the GGML quantization types.
 *
 * Each entry is keyed by a `GGMLQuantizationType` member and carries:
 * - `txt`: a short plain-text explanation of the format (bit width, block layout, weight formula);
 * - `src_url`: an optional link to the source the description was derived from
 *   (omitted when no reference is available, e.g. `IQ4_NL`).
 *
 * The `Record<GGMLQuantizationType, ...>` annotation makes the compiler reject this object
 * if a quantization type is missing from it.
 */
export const QUANT_DESCRIPTIONS: Record<GGMLQuantizationType, { txt: string; src_url?: string }> = {
	// Plain floating-point formats.
	[GGMLQuantizationType.F32]: {
		txt: "32-bit standard IEEE 754 single-precision floating-point number.",
		src_url: "https://en.wikipedia.org/wiki/Single-precision_floating-point_format",
	},
	[GGMLQuantizationType.F16]: {
		txt: "16-bit standard IEEE 754 half-precision floating-point number.",
		src_url: "https://en.wikipedia.org/wiki/Half-precision_floating-point_format",
	},

	// Legacy round-to-nearest formats: `_0` variants use a scale only, `_1` variants add a block minimum.
	[GGMLQuantizationType.Q4_0]: {
		txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249",
	},
	[GGMLQuantizationType.Q4_1]: {
		txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290",
	},
	[GGMLQuantizationType.Q5_0]: {
		txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249",
	},
	[GGMLQuantizationType.Q5_1]: {
		txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290",
	},
	[GGMLQuantizationType.Q8_0]: {
		txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249",
	},
	[GGMLQuantizationType.Q8_1]: {
		txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
		src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290",
	},

	// K-quants: weights are grouped into blocks, and blocks into super-blocks.
	[GGMLQuantizationType.Q2_K]: {
		txt: "2-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(4-bit) + block_min(4-bit), resulting in 2.5625 bits-per-weight.",
		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
	},
	[GGMLQuantizationType.Q3_K]: {
		txt: "3-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(6-bit), resulting in 3.4375 bits-per-weight.",
		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
	},
	[GGMLQuantizationType.Q4_K]: {
		txt: "4-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 4.5 bits-per-weight.",
		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
	},
	[GGMLQuantizationType.Q5_K]: {
		txt: "5-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 5.5 bits-per-weight.",
		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
	},
	[GGMLQuantizationType.Q6_K]: {
		txt: "6-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(8-bit), resulting in 6.5625 bits-per-weight.",
		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
	},
	[GGMLQuantizationType.Q8_K]: {
		txt: "8-bit quantization (q). Each block has 256 weights. Only used for quantizing intermediate results. All 2-6 bit dot products are implemented for this quantization type. Weight formula: w = q * block_scale.",
		src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
	},

	// I-quants: super-blocks of 256 weights reconstructed with a super-block scale and an importance matrix.
	[GGMLQuantizationType.IQ2_XXS]: {
		txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.06 bits-per-weight.",
		src_url:
			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
	},
	[GGMLQuantizationType.IQ2_XS]: {
		txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.31 bits-per-weight.",
		src_url:
			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
	},
	[GGMLQuantizationType.IQ3_XXS]: {
		txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.06 bits-per-weight.",
		src_url:
			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
	},
	[GGMLQuantizationType.IQ1_S]: {
		txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.56 bits-per-weight.",
		src_url:
			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
	},
	// No bits-per-weight figure or source link is recorded for IQ4_NL.
	[GGMLQuantizationType.IQ4_NL]: {
		txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix.",
	},
	[GGMLQuantizationType.IQ3_S]: {
		txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.44 bits-per-weight.",
		src_url:
			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
	},
	[GGMLQuantizationType.IQ2_S]: {
		txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.5 bits-per-weight.",
		src_url:
			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
	},
	[GGMLQuantizationType.IQ4_XS]: {
		txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 4.25 bits-per-weight.",
		src_url:
			"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
	},
};