#!/usr/bin/env node

- import { GGMLQuantizationType, gguf } from ".";
+ import { GGMLQuantizationType, gguf, ggufAllShards, GGUFParseOutput } from ".";
+ import { GGML_QUANT_SIZES } from "./quant-descriptions";

interface PrintColumnHeader {
	name: string;
@@ -10,11 +11,44 @@ interface PrintColumnHeader {
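// map numeric dtype values back to their GGMLQuantizationType names for display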
const mapDtypeToName = Object.fromEntries(Object.entries(GGMLQuantizationType).map(([name, value]) => [value, name]));

+ function showHelp(exitCode: number) {
+ 	console.error("Usage: gguf-view [--help|-h] [--show-tensor] [--context|-c N] <path/to/gguf>");
+ 	console.error("  --help, -h       Show this help message");
+ 	console.error("  --show-tensor    Show tensor information");
+ 	console.error("  --context, -c N  Number of tokens in context (default: 4096)");
+ 	process.exit(exitCode);
+ }
+
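+ // Example: gguf-view --show-tensor -c 8192 path/to/model.gguf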
async function main() {
- 	const ggufPath = process.argv[2];
- 	const { metadata, tensorInfos } = await gguf(ggufPath, {
+ 	let ggufPath = "";
+ 	let showTensors = false;
+ 	let nCtx = 4096;
+ 	for (let i = 2; i < process.argv.length; i++) {
+ 		if (process.argv[i] === "--help" || process.argv[i] === "-h") {
+ 			showHelp(0);
+ 		} else if (process.argv[i] === "--show-tensor") {
+ 			showTensors = true;
+ 		} else if (process.argv[i] === "--context" || process.argv[i] === "-c") {
+ 			nCtx = Number(process.argv[++i]);
+ 		} else {
+ 			ggufPath = process.argv[i];
+ 		}
+ 	}
+
+ 	if (!ggufPath.length) {
+ 		console.error("Error: Missing path to gguf file");
+ 		showHelp(1);
+ 	}
+
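+ 	// parse every shard of a (possibly split) GGUF file; a single-file model yields one shard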
+ 	const { shards } = await ggufAllShards(ggufPath, {
		allowLocalFile: true,
	});
+ 	const { metadata, tensorInfos } = shards[0];
+
+ 	// merge tensor infos from all shards (metadata is read from the first shard)
+ 	for (let i = 1; i < shards.length; i++) {
+ 		tensorInfos.push(...shards[i].tensorInfos);
+ 	}

	// TODO: print info about endianness
	console.log(`* Dumping ${Object.keys(metadata).length} key/value pair(s)`);
@@ -43,29 +77,110 @@ async function main() {
	);

	console.log();
- 	console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
- 	printTable(
- 		[
- 			{ name: "Idx", alignRight: true },
- 			{ name: "Num Elements", alignRight: true },
- 			{ name: "Shape" },
- 			{ name: "Data Type" },
- 			{ name: "Name" },
- 		],
- 		tensorInfos.map((tensorInfo, i) => {
- 			const shape = [1n, 1n, 1n, 1n];
- 			tensorInfo.shape.forEach((dim, i) => {
- 				shape[i] = dim;
- 			});
- 			return [
- 				(i + 1).toString(),
- 				shape.reduce((acc, n) => acc * n, 1n).toString(),
- 				shape.map((n) => n.toString().padStart(6)).join(", "),
- 				mapDtypeToName[tensorInfo.dtype],
- 				tensorInfo.name,
- 			];
- 		})
- 	);
+ 	console.log(`* Memory usage estimation (with context length of ${nCtx} tokens)`);
+ 	try {
+ 		const kvUsage = calcMemoryUsage(metadata as GGUFParseOutput<{ strict: false }>["metadata"], nCtx);
+ 		let modelWeightInBytes = 0;
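+ 		// tensor size in bytes = element count × (GGML_QUANT_SIZES bits per element / 8)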
+ 		for (const tensorInfo of tensorInfos) {
+ 			const nElem = Number(tensorInfo.shape.reduce((a, b) => a * b, 1n));
+ 			const tensorSizeInBytes = nElem * (GGML_QUANT_SIZES[tensorInfo.dtype] / 8);
+ 			modelWeightInBytes += tensorSizeInBytes;
+ 		}
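+ 		// rough overhead estimate: KV usage at a 256-token context plus 5% of the model weights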
+ 		const overhead =
+ 			calcMemoryUsage(metadata as GGUFParseOutput<{ strict: false }>["metadata"], 256).totalBytes +
+ 			modelWeightInBytes * 0.05;
+ 		const totalMemoryUsage = kvUsage.totalBytes + overhead + modelWeightInBytes;
+ 		printTable(
+ 			[{ name: "Item" }, { name: "Memory usage", alignRight: true }],
+ 			[
+ 				["K cache", (kvUsage.totalBytesK / 1e9).toFixed(2) + " GB"],
+ 				["V cache", (kvUsage.totalBytesV / 1e9).toFixed(2) + " GB"],
+ 				["Weight", (modelWeightInBytes / 1e9).toFixed(2) + " GB"],
+ 				["Overhead", (overhead / 1e9).toFixed(2) + " GB"],
+ 				["", "---"],
+ 				["TOTAL", (totalMemoryUsage / 1e9).toFixed(2) + " GB"],
+ 			]
+ 		);
+ 	} catch (e) {
+ 		console.error(`Error: ${(e as Error).message}`);
+ 	}
+
+ 	if (showTensors) {
+ 		console.log();
+ 		console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
+ 		printTable(
+ 			[
+ 				{ name: "Idx", alignRight: true },
+ 				{ name: "Num Elements", alignRight: true },
+ 				{ name: "Shape" },
+ 				{ name: "Data Type" },
+ 				{ name: "Name" },
+ 			],
+ 			tensorInfos.map((tensorInfo, i) => {
+ 				const shape = [1n, 1n, 1n, 1n];
+ 				tensorInfo.shape.forEach((dim, i) => {
+ 					shape[i] = dim;
+ 				});
+ 				return [
+ 					(i + 1).toString(),
+ 					shape.reduce((acc, n) => acc * n, 1n).toString(),
+ 					shape.map((n) => n.toString().padStart(6)).join(", "),
+ 					mapDtypeToName[tensorInfo.dtype],
+ 					tensorInfo.name,
+ 				];
+ 			})
+ 		);
+ 	} else {
+ 		console.log();
+ 		console.log(`* Use --show-tensor to display tensor information`);
+ 	}
+ }
+
+ function calcMemoryUsage(
+ 	metadata: GGUFParseOutput<{ strict: false }>["metadata"],
+ 	kvSize: number,
+ 	kvTypeK: GGMLQuantizationType = GGMLQuantizationType.F16,
+ 	kvTypeV: GGMLQuantizationType = GGMLQuantizationType.F16
+ ) {
+ 	const arch = metadata["general.architecture"] ?? "unknown";
+ 	const n_embd = (metadata[`${arch}.embedding_length`] as number) ?? 0;
+ 	const n_head = (metadata[`${arch}.attention.head_count`] as number) ?? 0;
+ 	const n_embd_head_k = (metadata[`${arch}.attention.key_length`] as number) ?? n_embd / n_head;
+ 	const n_embd_head_v = (metadata[`${arch}.attention.value_length`] as number) ?? n_embd / n_head;
+ 	const n_head_kv = (metadata[`${arch}.attention.head_count_kv`] as number[] | number) ?? [];
+ 	const n_layer = (metadata[`${arch}.block_count`] as number) ?? 0;
+
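+ 	// mamba/rwkv keep a fixed-size recurrent state rather than a per-token KV cache, so this estimate does not apply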
+ 	if (arch.startsWith("mamba") || arch.startsWith("rwkv")) {
+ 		throw new Error(`Memory usage estimation for arch "${arch}" is not supported`);
+ 	}
+
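+ 	// head_count_kv may be a scalar or a per-layer array; normalize to one value per layer, defaulting to n_head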
+ 	const n_head_kv_arr = Array(n_layer).fill(n_head);
+ 	if (Array.isArray(n_head_kv)) {
+ 		for (let i = 0; i < n_layer; i++) {
+ 			if (n_head_kv[i]) {
+ 				n_head_kv_arr[i] = n_head_kv[i];
+ 			}
+ 		}
+ 	} else {
+ 		for (let i = 0; i < n_layer; i++) {
+ 			n_head_kv_arr[i] = n_head_kv;
+ 		}
+ 	}
+
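+ 	// per layer, the K cache holds n_embd_head_k × n_head_kv[i] elements per token of context (likewise for V)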
+ 	let totalElemsK = 0;
+ 	let totalElemsV = 0;
+ 	for (let i = 0; i < n_layer; i++) {
+ 		const n_embd_k_gqa = n_embd_head_k * n_head_kv_arr[i];
+ 		const n_embd_v_gqa = n_embd_head_v * n_head_kv_arr[i];
+ 		totalElemsK += n_embd_k_gqa * kvSize;
+ 		totalElemsV += n_embd_v_gqa * kvSize;
+ 	}
+
+ 	return {
+ 		totalBytesK: totalElemsK * (GGML_QUANT_SIZES[kvTypeK] / 8),
+ 		totalBytesV: totalElemsV * (GGML_QUANT_SIZES[kvTypeV] / 8),
+ 		// total uses each cache's own dtype, since kvTypeK and kvTypeV may differ
+ 		totalBytes: totalElemsK * (GGML_QUANT_SIZES[kvTypeK] / 8) + totalElemsV * (GGML_QUANT_SIZES[kvTypeV] / 8),
+ 	};
}

function printTable(header: PrintColumnHeader[], rows: string[][], leftPad = 2) {