Skip to content

feat(@huggingface/hub): adding scanCacheDir #908

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions packages/hub/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,19 @@ console.log(oauthResult);

Checkout the demo: https://huggingface.co/spaces/huggingfacejs/client-side-oauth

## Hugging face cache

The `@huggingface/hub` package provide basic capabilities to scan the cache directory. Learn more about [Manage huggingface_hub cache-system](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache).

```ts
import { scanCacheDir } from "@huggingface/hub";

const result = await scanCacheDir();

console.log(result);
```
Note that the cache directory is created and used only by the Python and Rust libraries. Downloading files using the `@huggingface/hub` package won't use the cache directory.

## Performance considerations

When uploading large files, you may want to run the `commit` calls inside a worker, to offload the sha256 computations.
Expand Down
1 change: 1 addition & 0 deletions packages/hub/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"browser": {
"./src/utils/sha256-node.ts": false,
"./src/utils/FileBlob.ts": false,
"./src/lib/cache-management.ts": false,
"./dist/index.js": "./dist/browser/index.js",
"./dist/index.mjs": "./dist/browser/index.mjs"
},
Expand Down
137 changes: 137 additions & 0 deletions packages/hub/src/lib/cache-management.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import { describe, test, expect, vi, beforeEach } from "vitest";
import {
scanCacheDir,
scanCachedRepo,
scanSnapshotDir,
parseRepoType,
getBlobStat,
type CachedFileInfo,
} from "./cache-management";
import { stat, readdir, realpath, lstat } from "node:fs/promises";
import type { Dirent, Stats } from "node:fs";
import { join } from "node:path";

// Mocks
vi.mock("node:fs/promises");

beforeEach(() => {
vi.resetAllMocks();
vi.restoreAllMocks();
});

describe("scanCacheDir", () => {
test("should throw an error if cacheDir is not a directory", async () => {
vi.mocked(stat).mockResolvedValueOnce({
isDirectory: () => false,
} as Stats);

await expect(scanCacheDir("/fake/dir")).rejects.toThrow("Scan cache expects a directory");
});

test("empty directory should return an empty set of repository and no warnings", async () => {
vi.mocked(stat).mockResolvedValueOnce({
isDirectory: () => true,
} as Stats);

// mock empty cache folder
vi.mocked(readdir).mockResolvedValue([]);

const result = await scanCacheDir("/fake/dir");

// cacheDir must have been read
expect(readdir).toHaveBeenCalledWith("/fake/dir");

expect(result.warnings.length).toBe(0);
expect(result.repos).toHaveLength(0);
expect(result.size).toBe(0);
});
});

describe("scanCachedRepo", () => {
test("should throw an error for invalid repo path", async () => {
await expect(() => {
return scanCachedRepo("/fake/repo_path");
}).rejects.toThrow("Repo path is not a valid HuggingFace cache directory");
});

test("should throw an error if the snapshot folder does not exist", async () => {
vi.mocked(readdir).mockResolvedValue([]);
vi.mocked(stat).mockResolvedValue({
isDirectory: () => false,
} as Stats);

await expect(() => {
return scanCachedRepo("/fake/cacheDir/models--hello-world--name");
}).rejects.toThrow("Snapshots dir doesn't exist in cached repo");
});

test("should properly parse the repository name", async () => {
const repoPath = "/fake/cacheDir/models--hello-world--name";
vi.mocked(readdir).mockResolvedValue([]);
vi.mocked(stat).mockResolvedValue({
isDirectory: () => true,
} as Stats);

const result = await scanCachedRepo(repoPath);
expect(readdir).toHaveBeenCalledWith(join(repoPath, "refs"), {
withFileTypes: true,
});

expect(result.id.name).toBe("hello-world/name");
expect(result.id.type).toBe("model");
});
});

describe("scanSnapshotDir", () => {
test("should scan a valid snapshot directory", async () => {
const cachedFiles: CachedFileInfo[] = [];
const blobStats = new Map<string, Stats>();
vi.mocked(readdir).mockResolvedValueOnce([{ name: "file1", isDirectory: () => false } as Dirent]);

vi.mocked(realpath).mockResolvedValueOnce("/fake/realpath");
vi.mocked(lstat).mockResolvedValueOnce({ size: 1024, atimeMs: Date.now(), mtimeMs: Date.now() } as Stats);

await scanSnapshotDir("/fake/revision", cachedFiles, blobStats);

expect(cachedFiles).toHaveLength(1);
expect(blobStats.size).toBe(1);
});
});

describe("getBlobStat", () => {
test("should retrieve blob stat if already cached", async () => {
const blobStats = new Map<string, Stats>([["/fake/blob", { size: 1024 } as Stats]]);
const result = await getBlobStat("/fake/blob", blobStats);

expect(lstat).not.toHaveBeenCalled();
expect(result.size).toBe(1024);
});

test("should fetch and cache blob stat if not cached", async () => {
const blobStats = new Map();
vi.mocked(lstat).mockResolvedValueOnce({ size: 2048 } as Stats);

const result = await getBlobStat("/fake/blob", blobStats);

expect(result.size).toBe(2048);
expect(blobStats.size).toBe(1);
});
});

describe("parseRepoType", () => {
test("should parse models repo type", () => {
expect(parseRepoType("models")).toBe("model");
});

test("should parse dataset repo type", () => {
expect(parseRepoType("datasets")).toBe("dataset");
});

test("should parse space repo type", () => {
expect(parseRepoType("spaces")).toBe("space");
});

test("should throw an error for invalid repo type", () => {
expect(() => parseRepoType("invalid")).toThrowError("Invalid repo type: invalid");
});
});
Loading
Loading