gguf : add findNearestQuantType #1421


Merged (4 commits) on May 6, 2025
47 changes: 42 additions & 5 deletions packages/gguf/src/gguf.spec.ts
@@ -7,6 +7,8 @@ import {
ggufAllShards,
parseGgufShardFilename,
parseGGUFQuantLabel,
GGUF_QUANT_ORDER,
findNearestQuantType,
} from "./gguf";
import fs from "node:fs";

@@ -46,7 +48,7 @@ describe("gguf", () => {
tensor_count: 291n,
kv_count: 19n,
"general.architecture": "llama",
"general.file_type": GGMLFileQuantizationType.MOSTLY_Q2_K,
"general.file_type": GGMLFileQuantizationType.Q2_K,
"general.name": "LLaMA v2",
"general.quantization_version": 2,
"llama.attention.head_count": 32,
@@ -105,7 +107,7 @@ describe("gguf", () => {
tensor_count: 291n,
kv_count: 24n,
"general.architecture": "llama",
"general.file_type": GGMLFileQuantizationType.MOSTLY_Q5_K_M,
"general.file_type": GGMLFileQuantizationType.Q5_K_M,
"general.name": "mistralai_mistral-7b-instruct-v0.2",
"general.quantization_version": 2,
"llama.attention.head_count": 32,
@@ -143,7 +145,7 @@ describe("gguf", () => {
tensor_count: 164n,
kv_count: 21n,
"general.architecture": "gemma",
"general.file_type": GGMLFileQuantizationType.MOSTLY_Q4_K_M,
"general.file_type": GGMLFileQuantizationType.Q4_K_M,
"general.name": "gemma-2b-it",
"general.quantization_version": 2,
"gemma.attention.head_count": 8,
@@ -180,7 +182,7 @@ describe("gguf", () => {
tensor_count: 197n,
kv_count: 23n,
"general.architecture": "bert",
"general.file_type": GGMLFileQuantizationType.MOSTLY_F16,
"general.file_type": GGMLFileQuantizationType.F16,
"general.name": "bge-small-en-v1.5",
"bert.attention.causal": false,
"bert.attention.head_count": 12,
@@ -280,12 +282,47 @@ describe("gguf", () => {
expect(parseGGUFQuantLabel("Codestral-22B-v0.1-Q2_K.gguf")).toEqual("Q2_K");
expect(parseGGUFQuantLabel("Codestral-22B-v0.1.gguf")).toEqual(undefined);
expect(parseGGUFQuantLabel("Codestral-22B-v0.1-F32-Q2_K.gguf")).toEqual("Q2_K"); // gguf name with two quant labels [F32, Q2_K]
expect(parseGGUFQuantLabel("Codestral-22B-v0.1-IQ3_XS.gguf")).toEqual(undefined); // TODO: investigate IQ3_XS
expect(parseGGUFQuantLabel("Codestral-22B-v0.1-IQ3_XS.gguf")).toEqual("IQ3_XS");
expect(parseGGUFQuantLabel("Codestral-22B-v0.1-Q4_0_4_4.gguf")).toEqual("Q4_0"); // TODO: investigate Q4_0_4_4
});

it("calculate tensor data offset", async () => {
const { tensorDataOffset } = await gguf(URL_LLAMA);
expect(tensorDataOffset).toEqual(741056n);
});

// Quantization handler

it("should have GGUF_QUANT_ORDER in sync with GGMLQuantizationType enum", () => {
const enumValues = Object.values(GGMLQuantizationType).filter((value) => typeof value === "number") as number[];
const checkValues = new Set(GGUF_QUANT_ORDER);
for (const value of enumValues) {
expect(checkValues).toContain(value);
}
});

it("should find the nearest quant", () => {
const quant = GGMLFileQuantizationType.IQ2_M;
const availableQuants = [
GGMLFileQuantizationType.Q2_K,
GGMLFileQuantizationType.Q4_K_M,
GGMLFileQuantizationType.Q8_0,
];
const nearestQuant = findNearestQuantType(quant, availableQuants);
expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q2_K);
});

it("should find the nearest quant (vision model)", () => {
const visionQuants = [GGMLFileQuantizationType.Q8_0, GGMLFileQuantizationType.F16, GGMLFileQuantizationType.BF16];
let nearestQuant;
// text = Q4_K_M
nearestQuant = findNearestQuantType(GGMLFileQuantizationType.Q4_K_M, visionQuants);
expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q8_0);
// text = Q8_0
nearestQuant = findNearestQuantType(GGMLFileQuantizationType.Q8_0, visionQuants);
expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q8_0);
// text = F16
nearestQuant = findNearestQuantType(GGMLFileQuantizationType.F16, visionQuants);
expect(nearestQuant).toEqual(GGMLFileQuantizationType.F16);
});
Comment on lines +315 to +327

@ngxson (Member Author) commented:
Btw @bartowski1182, this test case is inspired by a real-world scenario where we have vision quantized to F16/BF16/Q8_0, and the text can be anything else.

Feel free to suggest other test cases if you can think of any!

});
11 changes: 9 additions & 2 deletions packages/gguf/src/gguf.ts
@@ -4,9 +4,16 @@ import { isBackend } from "./utils/isBackend";
import { promisesQueue } from "./utils/promisesQueue";

export type { MetadataBaseValue, MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types";
export { GGUFValueType, GGMLFileQuantizationType, GGMLQuantizationType, Architecture } from "./types";
export { GGUFValueType, GGMLQuantizationType, Architecture } from "./types";
export { GGUF_QUANT_DESCRIPTIONS } from "./quant-descriptions";
export { parseGGUFQuantLabel, GGUF_QUANT_RE, GGUF_QUANT_RE_GLOBAL } from "@huggingface/tasks";
export {
parseGGUFQuantLabel,
GGUF_QUANT_RE,
GGUF_QUANT_RE_GLOBAL,
GGUF_QUANT_ORDER,
findNearestQuantType,
GGMLFileQuantizationType,
} from "@huggingface/tasks";

export const RE_GGUF_FILE = /\.gguf$/;
export const RE_GGUF_SHARD_FILE = /^(?<prefix>.*?)-(?<shard>\d{5})-of-(?<total>\d{5})\.gguf$/;
12 changes: 10 additions & 2 deletions packages/gguf/src/quant-descriptions.ts
@@ -124,6 +124,14 @@ export const GGUF_QUANT_DESCRIPTIONS: Record<GGMLQuantizationType, { txt: string
txt: "16-bit shortened version of the 32-bit IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format",
},
[GGMLQuantizationType.TQ1_0]: {
txt: "Ternary quantization.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151",
},
[GGMLQuantizationType.TQ2_0]: {
txt: "Ternary quantization.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151",
},
};

const QK_K = 256;
@@ -163,6 +171,6 @@ export const GGML_QUANT_SIZES = {
[GGMLQuantizationType.F64]: calcBPW(1, 8),
[GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
[GGMLQuantizationType.BF16]: calcBPW(1, 2),
// [GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
// [GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
[GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
[GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
};
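For reference, the two ternary entries enabled above work out to 1.6875 and 2.0625 bits per weight. A minimal sanity check, assuming calcBPW(blockSize, bytesPerBlock) converts a block layout into bits per weight (the helper itself is outside this diff, so the stand-in below is an assumption):

// Stand-in for the calcBPW helper used above (assumed signature and behavior).
const calcBPW = (blockSize: number, bytesPerBlock: number): number => (bytesPerBlock * 8) / blockSize;

const QK_K = 256; // super-block size used by the k-quants, as defined above

console.log(calcBPW(QK_K, 2 + 4 * 13)); // TQ1_0: 54 bytes per 256 weights = 1.6875 bpw
console.log(calcBPW(QK_K, 2 + 64)); // TQ2_0: 66 bytes per 256 weights = 2.0625 bpw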
41 changes: 1 addition & 40 deletions packages/gguf/src/types.ts
@@ -1,52 +1,13 @@
import type { TransformerLLM } from "./transformer-llm";
import { LLM_ARCHITECTURES } from "./transformer-llm";
import type { GGMLQuantizationType } from "@huggingface/tasks";
import type { GGMLQuantizationType, GGMLFileQuantizationType } from "@huggingface/tasks";
export { GGMLQuantizationType } from "@huggingface/tasks";

export type MetadataBaseValue = string | number | bigint | boolean;
export type MetadataValue = MetadataBaseValue | MetadataBaseValue[] | MetadataValue[]; /// recursive as arrays can be nested.

export type Version = 1 | 2 | 3;

export enum GGMLFileQuantizationType {
MOSTLY_F32 = 0,
MOSTLY_F16 = 1,
MOSTLY_Q4_0 = 2,
MOSTLY_Q4_1 = 3,
MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// MOSTLY_Q4_2 = 5, // support has been removed
// MOSTLY_Q4_3 = 6, // support has been removed
MOSTLY_Q8_0 = 7,
MOSTLY_Q5_0 = 8,
MOSTLY_Q5_1 = 9,
MOSTLY_Q2_K = 10,
MOSTLY_Q3_K_S = 11,
MOSTLY_Q3_K_M = 12,
MOSTLY_Q3_K_L = 13,
MOSTLY_Q4_K_S = 14,
MOSTLY_Q4_K_M = 15,
MOSTLY_Q5_K_S = 16,
MOSTLY_Q5_K_M = 17,
MOSTLY_Q6_K = 18,
MOSTLY_IQ2_XXS = 19,
MOSTLY_IQ2_XS = 20,
MOSTLY_Q2_K_S = 21,
MOSTLY_IQ3_XS = 22,
MOSTLY_IQ3_XXS = 23,
MOSTLY_IQ1_S = 24,
MOSTLY_IQ4_NL = 25,
MOSTLY_IQ3_S = 26,
MOSTLY_IQ3_M = 27,
MOSTLY_IQ2_S = 28,
MOSTLY_IQ2_M = 29,
MOSTLY_IQ4_XS = 30,
MOSTLY_IQ1_M = 31,
MOSTLY_BF16 = 32,
MOSTLY_Q4_0_4_4 = 33,
MOSTLY_Q4_0_4_8 = 34,
MOSTLY_Q4_0_8_8 = 35,
}

export enum GGUFValueType {
UINT8 = 0,
INT8 = 1,
163 changes: 154 additions & 9 deletions packages/tasks/src/gguf.ts
@@ -1,3 +1,155 @@
// This list is copied from gguf/types.ts, but with all types available (for backward compatibility)
// NOT to be confused with GGMLQuantizationType: a file quantization can contain multiple GGMLQuantizationType values
// For example, a Q4_K_M model can contain Q4_K and Q6_K tensors (see the sketch after the enum)
export enum GGMLFileQuantizationType {
F32 = 0,
F16 = 1,
Q4_0 = 2,
Q4_1 = 3,
Q4_1_SOME_F16 = 4,
Q4_2 = 5,
Q4_3 = 6,
Q8_0 = 7,
Q5_0 = 8,
Q5_1 = 9,
Q2_K = 10,
Q3_K_S = 11,
Q3_K_M = 12,
Q3_K_L = 13,
Q4_K_S = 14,
Q4_K_M = 15,
Q5_K_S = 16,
Q5_K_M = 17,
Q6_K = 18,
IQ2_XXS = 19,
IQ2_XS = 20,
Q2_K_S = 21,
IQ3_XS = 22,
IQ3_XXS = 23,
IQ1_S = 24,
IQ4_NL = 25,
IQ3_S = 26,
IQ3_M = 27,
IQ2_S = 28,
IQ2_M = 29,
IQ4_XS = 30,
IQ1_M = 31,
BF16 = 32,
Q4_0_4_4 = 33,
Q4_0_4_8 = 34,
Q4_0_8_8 = 35,
TQ1_0 = 36,
TQ2_0 = 37,
}
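To make the file-type vs. tensor-type distinction above concrete, here is a small sketch using the @huggingface/gguf parser. The model URL is only illustrative, and the exact tensor mix depends on the file:

import { gguf, GGMLFileQuantizationType, GGMLQuantizationType } from "@huggingface/gguf";

// Illustrative Q4_K_M file; any GGUF URL works here.
const url = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf";
const { metadata, tensorInfos } = await gguf(url);

// One file-level quantization for the whole model...
console.log(GGMLFileQuantizationType[metadata["general.file_type"] as number]); // e.g. "Q4_K_M"

// ...but several tensor-level quantizations inside it, e.g. Q4_K, Q6_K, F32.
const tensorTypes = new Set(tensorInfos.map((t) => GGMLQuantizationType[t.dtype]));
console.log([...tensorTypes]);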

const ggufQuants = Object.values(GGMLFileQuantizationType).filter((v): v is string => typeof v === "string");
export const GGUF_QUANT_RE = new RegExp(`(?<quant>${ggufQuants.join("|")})` + "(_(?<sizeVariation>[A-Z]+))?");
export const GGUF_QUANT_RE_GLOBAL = new RegExp(GGUF_QUANT_RE, "g");

export function parseGGUFQuantLabel(fname: string): string | undefined {
const quantLabel = fname.toUpperCase().match(GGUF_QUANT_RE_GLOBAL)?.at(-1); // if there are multiple quant substrings in a name, we prefer the last one
return quantLabel;
}
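A quick usage sketch of the label parser, mirroring the cases exercised in gguf.spec.ts above:

import { parseGGUFQuantLabel } from "@huggingface/tasks";

parseGGUFQuantLabel("Codestral-22B-v0.1-Q2_K.gguf"); // "Q2_K"
parseGGUFQuantLabel("Codestral-22B-v0.1-F32-Q2_K.gguf"); // "Q2_K" (two labels: the last one wins)
parseGGUFQuantLabel("Codestral-22B-v0.1-IQ3_XS.gguf"); // "IQ3_XS" (recognized now that IQ3_XS is in the enum)
parseGGUFQuantLabel("Codestral-22B-v0.1.gguf"); // undefined (no quant label in the name)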

// order of quantization, from biggest to smallest
// this list must be in sync with the order in GGMLFileQuantizationType
// the gguf.spec.ts tests verify that the order is correct
export const GGUF_QUANT_ORDER: GGMLFileQuantizationType[] = [
@gary149 (Collaborator) commented on May 5, 2025:
btw interested in improving the ordering in the quant selector here: https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF?local-app=llama.cpp

@ngxson (Member Author) replied:
Yeah, that's a good idea. This list is already exported and ready to be used in the Hub UI. Can you think of any other improvements?

@gary149 (Collaborator) replied:
No, I think we can start with it (maybe there are a few variants missing).

@ngxson (Member Author) replied:
I already synced this list with the latest llama.cpp code, so it should be good.

GGMLFileQuantizationType.F32,
GGMLFileQuantizationType.BF16,
GGMLFileQuantizationType.F16,
GGMLFileQuantizationType.Q8_0,

// 6-bit quantizations
GGMLFileQuantizationType.Q6_K,

// 5-bit quantizations
GGMLFileQuantizationType.Q5_0,
GGMLFileQuantizationType.Q5_1,
GGMLFileQuantizationType.Q5_K_M,
GGMLFileQuantizationType.Q5_K_S,

// 4-bit quantizations
GGMLFileQuantizationType.Q4_K_M,
GGMLFileQuantizationType.Q4_K_S,
GGMLFileQuantizationType.IQ4_NL,
GGMLFileQuantizationType.IQ4_XS,
GGMLFileQuantizationType.Q4_0_4_4,
GGMLFileQuantizationType.Q4_0_4_8,
GGMLFileQuantizationType.Q4_0_8_8,
GGMLFileQuantizationType.Q4_0,
GGMLFileQuantizationType.Q4_1_SOME_F16,
GGMLFileQuantizationType.Q4_1,
GGMLFileQuantizationType.Q4_2,
GGMLFileQuantizationType.Q4_3,

// 3-bit quantizations
GGMLFileQuantizationType.Q3_K_L,
GGMLFileQuantizationType.Q3_K_M,
GGMLFileQuantizationType.Q3_K_S,
GGMLFileQuantizationType.IQ3_M,
GGMLFileQuantizationType.IQ3_S,
GGMLFileQuantizationType.IQ3_XS,
GGMLFileQuantizationType.IQ3_XXS,

// 2-bit quantizations
GGMLFileQuantizationType.Q2_K,
GGMLFileQuantizationType.Q2_K_S,
GGMLFileQuantizationType.IQ2_M,
GGMLFileQuantizationType.IQ2_S,
GGMLFileQuantizationType.IQ2_XS,
GGMLFileQuantizationType.IQ2_XXS,

// 1-bit quantizations
GGMLFileQuantizationType.IQ1_S,
GGMLFileQuantizationType.IQ1_M,
GGMLFileQuantizationType.TQ1_0,
GGMLFileQuantizationType.TQ2_0,
];
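Following up on the quant-selector discussion above, a minimal sketch of how a UI could sort available quants from largest to smallest with GGUF_QUANT_ORDER. The sortByQuantOrder helper is hypothetical, not part of this PR:

import { GGUF_QUANT_ORDER, GGMLFileQuantizationType } from "@huggingface/tasks";

// Hypothetical helper: rank quants by their position in GGUF_QUANT_ORDER.
function sortByQuantOrder(quants: GGMLFileQuantizationType[]): GGMLFileQuantizationType[] {
	const rank = new Map(GGUF_QUANT_ORDER.map((q, i) => [q, i] as const));
	// Unknown quants sink to the end; rank 0 (largest) comes first.
	return [...quants].sort((a, b) => (rank.get(a) ?? Infinity) - (rank.get(b) ?? Infinity));
}

sortByQuantOrder([
	GGMLFileQuantizationType.Q2_K,
	GGMLFileQuantizationType.F16,
	GGMLFileQuantizationType.Q4_K_M,
]); // => [F16, Q4_K_M, Q2_K]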

@ngxson (Member Author) commented on May 2, 2025:
Full disclosure: this function was written by Gemini 2.5 Pro 😂

// This function finds the largest available quantization type that is smaller than or equal to the given one.
// If all available quants are larger than the target, it falls back to the smallest available quant.
// It returns undefined only when none of the available quants are recognized.
export function findNearestQuantType(
quant: GGMLFileQuantizationType,
availableQuants: GGMLFileQuantizationType[]
): GGMLFileQuantizationType | undefined {
// Create a map for quick index lookup from the defined order
const orderMap = new Map<GGMLFileQuantizationType, number>();
GGUF_QUANT_ORDER.forEach((q, index) => {
orderMap.set(q, index);
});

const targetIndex = orderMap.get(quant) ?? 0; // the 0 case should never happen

// Filter the available quantizations to include only those defined in the order map,
// then sort them according to the GGUF_QUANT_ORDER (from largest/index 0 to smallest/highest index).
const sortedAvailable = availableQuants
.filter((q) => orderMap.has(q))
.sort((a, b) => (orderMap.get(a) ?? Infinity) - (orderMap.get(b) ?? Infinity));

// If no valid quantizations are available after filtering
if (sortedAvailable.length === 0) {
return undefined;
}

// Iterate through the sorted available quantizations (largest to smallest).
// Find the first one whose order index is >= the target index.
// This means finding the largest quantization that is smaller than or equal to the target.
for (const availableQuant of sortedAvailable) {
// We know the key exists due to the filter above.
const availableIndex = orderMap.get(availableQuant)!;
if (availableIndex >= targetIndex) {
return availableQuant;
}
}

// If the loop completes, it means all available quantizations are larger (have a smaller index)
// than the target quantization. In this case, return the "smallest" available quantization,
// which is the last element in the sorted list (highest index among available).
return sortedAvailable[sortedAvailable.length - 1];
}
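One subtlety worth illustrating: when every available quant is larger than the target, the function falls back to the smallest available one instead of returning undefined. A short usage sketch:

import { findNearestQuantType, GGMLFileQuantizationType } from "@huggingface/tasks";

const available = [GGMLFileQuantizationType.F16, GGMLFileQuantizationType.Q8_0];

// Exact match available: returned as-is.
findNearestQuantType(GGMLFileQuantizationType.Q8_0, available); // => Q8_0

// Target (Q2_K) is smaller than everything available: fall back to the
// smallest available quant, which is Q8_0 here.
findNearestQuantType(GGMLFileQuantizationType.Q2_K, available); // => Q8_0

// No recognized quants at all: undefined.
findNearestQuantType(GGMLFileQuantizationType.Q2_K, []); // => undefined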

// This list is only used to calculate the size of the model, NOT to be confused with the quantization FILE type
export enum GGMLQuantizationType {
F32 = 0,
F16 = 1,
@@ -28,13 +180,6 @@ export enum GGMLQuantizationType {
F64 = 28,
IQ1_M = 29,
BF16 = 30,
}

const ggufQuants = Object.values(GGMLQuantizationType).filter((v): v is string => typeof v === "string");
export const GGUF_QUANT_RE = new RegExp(`(?<quant>${ggufQuants.join("|")})` + "(_(?<sizeVariation>[A-Z]+))?");
export const GGUF_QUANT_RE_GLOBAL = new RegExp(GGUF_QUANT_RE, "g");

export function parseGGUFQuantLabel(fname: string): string | undefined {
const quantLabel = fname.toUpperCase().match(GGUF_QUANT_RE_GLOBAL)?.at(-1); // if there is multiple quant substrings in a name, we prefer the last one
return quantLabel;
TQ1_0 = 34,
TQ2_0 = 35,
}