Skip to content

Hub: Move hash-wasm into vendor (emscripten) #682

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/hub/.eslintignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
dist
sha256.js
3 changes: 2 additions & 1 deletion packages/hub/.prettierignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pnpm-lock.yaml
# Ignored so that code samples are not reformatted with tabs, which don't display well on npm
README.md
dist
dist
sha256.js
3 changes: 1 addition & 2 deletions packages/hub/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@
"@types/node": "^20.11.28"
},
"dependencies": {
"@huggingface/tasks": "workspace:^",
"hash-wasm": "^4.9.0"
"@huggingface/tasks": "workspace:^"
}
}
7 changes: 0 additions & 7 deletions packages/hub/pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

50 changes: 50 additions & 0 deletions packages/hub/src/utils/sha256.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import { describe, it, expect } from "vitest";
import { sha256 } from "./sha256";

// Test fixtures: each content string is paired with the SHA-256 hex digest the tests expect for it.
const smallContent = "hello world";
const smallContentSHA256 = "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9";
// NOTE(review): this string starts with the letter "O", whereas biggerContent below starts with the
// digit "0". Possibly unintentional, but the expected digest is tied to this exact string, so do not
// "fix" the character without recomputing bigContentSHA256.
const bigContent = "O123456789".repeat(100_000);
const bigContentSHA256 = "a3bbce7ee1df7233d85b5f4d60faa3755f93f537804f8b540c72b0739239ddf8";
// 10 characters repeated 1,000,000 times.
const biggerContent = "0123456789".repeat(1_000_000);
const biggerContentSHA256 = "d52fcc26b48dbd4d79b125eb0a29b803ade07613c67ac7c6f2751aefef008486";

describe("sha256", () => {
	/**
	 * Drains the `sha256` generator for `content` and resolves with the value it
	 * returns once done. Intermediate yielded values are ignored.
	 */
	async function calcSHA256(content: string, useWebWorker: boolean) {
		const hashing = sha256(new Blob([content]), { useWebWorker });
		let step: IteratorResult<number, string> = await hashing.next();
		while (!step.done) {
			step = await hashing.next();
		}
		return step.value;
	}

	// One row per fixture; every row is exercised without and then with a web worker.
	const fixtures = [
		{ size: "small", content: smallContent, expected: smallContentSHA256 },
		{ size: "big", content: bigContent, expected: bigContentSHA256 },
		{ size: "bigger", content: biggerContent, expected: biggerContentSHA256 },
	];

	for (const useWebWorker of [false, true]) {
		const suffix = useWebWorker ? " (+ web worker)" : "";

		for (const { size, content, expected } of fixtures) {
			it(`Calculate hash of a ${size} file${suffix}`, async () => {
				const sha = await calcSHA256(content, useWebWorker);
				expect(sha).toBe(expected);
			});
		}
	}
});
37 changes: 8 additions & 29 deletions packages/hub/src/utils/sha256.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,10 @@ import { eventToGenerator } from "./eventToGenerator";
import { hexFromBytes } from "./hexFromBytes";
import { isFrontend } from "./isFrontend";

/**
 * Source code of the hashing web worker, kept as a string so it can be wrapped
 * in a Blob and handed to `new Worker(...)`.
 *
 * The worker script loads the hash-wasm UMD build from the jsDelivr CDN. For
 * each `message` event it reads `event.data.file` as a stream, feeds every
 * chunk to the SHA-256 hasher, posts `{ progress }` (bytes done / file size)
 * after each chunk, and finally posts `{ sha256 }` with the hex digest.
 *
 * Everything between the backticks is runtime text executed inside the worker.
 */
const webWorkerCode = `
// Would prefer no CDN, but need a clever way to not burden the main file of the bundle
importScripts("https://cdn.jsdelivr.net/npm/hash-wasm@4/dist/sha256.umd.min.js");

const createSHA256 = hashwasm.createSHA256;

self.addEventListener('message', async (event) => {
const { file } = event.data;
const sha256 = await createSHA256();
sha256.init();
const reader = file.stream().getReader();
const total = file.size;
let bytesDone = 0;
while (true) {
const { done, value } = await reader.read();
if (done) {
break;
}
sha256.update(value);
bytesDone += value.length;
postMessage({ progress: bytesDone / total });
}
postMessage({ sha256: sha256.digest('hex') });
});
`;
/**
 * Memoized object URL of the worker script. It holds the in-flight or settled
 * promise, so concurrent callers share a single Blob instead of each creating
 * (and leaking) their own.
 */
let webWorkerCodeUrl: Promise<string> | undefined;

/**
 * Resolves with a `blob:` URL pointing at the SHA-256 worker script.
 *
 * The vendored wrapper is loaded lazily through a dynamic import. The URL is
 * created once and reused for every worker: object URLs stay alive until they
 * are revoked, so creating a fresh one per worker would keep one extra copy of
 * the worker source in memory for each worker ever spawned.
 *
 * @returns The object URL to pass to `new Worker(...)`.
 * @throws Whatever the dynamic import or `URL.createObjectURL` throws. The
 * cache is cleared in that case so that a later call can retry.
 */
async function getWebWorkerCode(): Promise<string> {
	if (!webWorkerCodeUrl) {
		webWorkerCodeUrl = (async () => {
			try {
				const sha256Module = await import("../vendor/hash-wasm/sha256-wrapper");
				return URL.createObjectURL(new Blob([sha256Module.createSHA256WorkerCode()]));
			} catch (err) {
				// Do not cache a failure: let the next caller try again.
				webWorkerCodeUrl = undefined;
				throw err;
			}
		})();
	}
	return webWorkerCodeUrl;
}

// Module-level bookkeeping for the workers handed out by getWorker().
// NOTE(review): presumably holds idle workers waiting to be reused — confirm against getWorker().
const pendingWorkers: Worker[] = [];
// Workers currently handed out; getWorker() adds each worker it creates to this set.
const runningWorkers: Set<Worker> = new Set();
Expand All @@ -45,7 +24,7 @@ async function getWorker(poolSize?: number): Promise<Worker> {
}
}
if (!poolSize) {
const worker = new Worker(URL.createObjectURL(new Blob([webWorkerCode])));
const worker = new Worker(await getWebWorkerCode());
runningWorkers.add(worker);
return worker;
}
Expand All @@ -58,7 +37,7 @@ async function getWorker(poolSize?: number): Promise<Worker> {
await waitPromise;
}

const worker = new Worker(URL.createObjectURL(new Blob([webWorkerCode])));
const worker = new Worker(await getWebWorkerCode());
runningWorkers.add(worker);
return worker;
}
Expand Down Expand Up @@ -147,7 +126,7 @@ export async function* sha256(
}
}
if (!wasmModule) {
wasmModule = await import("hash-wasm");
wasmModule = await import("../vendor/hash-wasm/sha256-wrapper");
}

const sha256 = await wasmModule.createSHA256();
Expand Down Expand Up @@ -184,4 +163,4 @@ export async function* sha256(
// eslint-disable-next-line @typescript-eslint/consistent-type-imports
let cryptoModule: typeof import("./sha256-node");
// eslint-disable-next-line @typescript-eslint/consistent-type-imports
let wasmModule: typeof import("hash-wasm");
let wasmModule: typeof import("../vendor/hash-wasm/sha256-wrapper");
33 changes: 33 additions & 0 deletions packages/hub/src/vendor/hash-wasm/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
#
# Compiles sha256.c to sha256.js with Emscripten inside a throwaway Docker
# container, then copies the generated file next to this script.
#
# Requires: docker (uses the emscripten/emsdk:3.1.55 image).
# Usage:    ./build.sh

# Stop at the first failing step so a broken compile is never reported as a
# success and a stale sha256.js is never silently kept.
set -euo pipefail

CURRENT_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Quoted so that a checkout path containing spaces still works.
cd "$CURRENT_PATH"

CONTAINER_NAME="hash-wasm-builder"

# Force-removes the builder container (kill + rm); a no-op when it does not exist.
cleanup() {
	docker rm -f "$CONTAINER_NAME" &> /dev/null || true
}

# Clean up leftovers from a previous run, and always clean up when this script exits
cleanup
trap cleanup EXIT

# Start container
docker run -it -d --name "$CONTAINER_NAME" emscripten/emsdk:3.1.55 bash

# Copy & compile
docker exec "$CONTAINER_NAME" bash -c "mkdir /source"
docker cp ./sha256.c "$CONTAINER_NAME":/source
docker exec "$CONTAINER_NAME" bash -c "\
cd /source && \
emcc sha256.c -o sha256.js -msimd128 -sSINGLE_FILE -sMODULARIZE=1 -sENVIRONMENT=web,worker -sEXPORTED_FUNCTIONS=_Hash_Init,_Hash_Update,_Hash_Final,_GetBufferPtr -sFILESYSTEM=0 -fno-rtti -fno-exceptions -O1 -sEXPORT_ES6=1 \
"
# Patch "_scriptDir" variable: rename its declaration, then replace every
# remaining use with the literal false. "|" is used as the sed delimiter.
docker exec "$CONTAINER_NAME" bash -c "\
cd /source && \
sed -i 's|var _scriptDir|var _unused|g' ./sha256.js && \
sed -i 's|_scriptDir|false|g' ./sha256.js \
"

# Copy back compiled file
docker cp "$CONTAINER_NAME":/source/sha256.js .

# The container itself is removed by the EXIT trap registered above.
62 changes: 62 additions & 0 deletions packages/hub/src/vendor/hash-wasm/sha256-wrapper.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import WasmModule from "./sha256";

/**
 * Instantiates the vendored SHA-256 wasm module and wraps it in a small
 * init / update / digest interface.
 *
 * This function is also serialized with `toString()` into the web worker
 * script, so its body must stay self-contained: apart from `WasmModule` (used
 * only outside a worker) it may not reference anything from module scope.
 *
 * @param isInsideWorker When true, the wasm factory is read from
 * `self.SHA256WasmModule` instead of the imported `WasmModule`.
 * @returns A hasher. `digest` only supports `"hex"` and throws otherwise.
 */
export async function createSHA256(isInsideWorker = false): Promise<{
	init(): void;
	update(data: Uint8Array): void;
	digest(method: "hex"): string;
}> {
	// Upper bound on the number of bytes copied into the wasm buffer per _Hash_Update call.
	const BUFFER_MAX_SIZE = 8 * 1024 * 1024;

	let wasm: Awaited<ReturnType<typeof WasmModule>>;
	if (isInsideWorker) {
		// @ts-expect-error WasmModule will be populated inside self object
		wasm = await self["SHA256WasmModule"]();
	} else {
		wasm = await WasmModule();
	}

	// View over the wasm memory starting at the buffer shared with the hasher.
	const bufferStart = wasm._GetBufferPtr();
	const heap = wasm.HEAPU8.subarray(bufferStart);

	function init(): void {
		wasm._Hash_Init(256);
	}

	function update(data: Uint8Array): void {
		// Feed the input through the shared buffer, at most BUFFER_MAX_SIZE bytes at a time.
		for (let offset = 0; offset < data.byteLength; ) {
			const chunkSize = Math.min(data.byteLength - offset, BUFFER_MAX_SIZE);
			heap.set(data.subarray(offset, offset + chunkSize));
			wasm._Hash_Update(chunkSize);
			offset += chunkSize;
		}
	}

	function digest(method: "hex"): string {
		if (method !== "hex") {
			throw new Error("Only digest hex is supported");
		}
		wasm._Hash_Final();
		// The 32 digest bytes are read from the start of the shared buffer.
		let hex = "";
		for (const byte of heap.slice(0, 32)) {
			hex += byte.toString(16).padStart(2, "0");
		}
		return hex;
	}

	return { init, update, digest };
}

/**
 * Builds the source text of the hashing web worker.
 *
 * The script registers a `message` handler that streams `event.data.file`
 * through a SHA-256 hasher, posting `{ progress }` after each chunk and
 * `{ sha256 }` with the hex digest at the end. The wasm factory and
 * `createSHA256` are embedded by serializing them with `toString()` and
 * attaching them to `self`, so the worker needs no imports.
 *
 * @returns JavaScript source to wrap in a Blob and load as a Worker.
 */
export function createSHA256WorkerCode(): string {
	// Serialized in the same order in which the original template evaluated them.
	const wasmFactorySource = WasmModule.toString();
	const hasherFactorySource = createSHA256.toString();

	return `
self.addEventListener('message', async (event) => {
const { file } = event.data;
const sha256 = await self.createSHA256(true);
sha256.init();
const reader = file.stream().getReader();
const total = file.size;
let bytesDone = 0;
while (true) {
const { done, value } = await reader.read();
if (done) {
break;
}
sha256.update(value);
bytesDone += value.length;
postMessage({ progress: bytesDone / total });
}
postMessage({ sha256: sha256.digest('hex') });
});
self.SHA256WasmModule = ${wasmFactorySource};
self.createSHA256 = ${hasherFactorySource};
`;
}
Loading
Loading