
Commit b6f6385
python: List models on HF Hub (#68)

* rm __pycache__
* gitignore __pycache__
* gitignore dist
* fix: use find_library instead of assuming library extension
* Output all libraries in the root build dir
* Add scripts
* Install with cmake
* python: list models
* python: fix find_library
* python: list repos and models in a repo

Co-authored-by: denis-ismailaj <[email protected]>

1 parent 8b667eb commit b6f6385

4 files changed, +123 -56 lines changed

examples/python_bindings/README.md

Lines changed: 3 additions & 2 deletions
```diff
@@ -29,8 +29,9 @@ All you need to do is to compile with the `-DBUILD_SHARED_LIBS=ON` option and co
 from clip_cpp import Clip
 
 ## you can either pass repo_id or .bin file
-## incase you pass repo_id and it has more than .bin file
-## it's recommended to spacify which file to download with `model_file`
+## you can type `clip-cpp-models` in your terminal to see what models are available for download
+## in case you pass repo_id and it has more than .bin file
+## it's recommended to specify which file to download with `model_file`
 repo_id = 'Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k'
 model_file = 'laion_clip-vit-b-32-laion2b-s34b-b79k.ggmlv0.f16.bin'
```
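To complete the README snippet above, here is a minimal load sketch; only the final `Clip(...)` call is our addition, and its signature matches the `Clip.__init__` shown in `clip.py` below:

```python
from clip_cpp import Clip

## values from the README snippet above
repo_id = 'Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k'
model_file = 'laion_clip-vit-b-32-laion2b-s34b-b79k.ggmlv0.f16.bin'

## downloads the file from the Hub on first use, then loads it
model = Clip(repo_id, model_file=model_file, verbosity=1)
```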

examples/python_bindings/clip_cpp/clip.py

Lines changed: 29 additions & 27 deletions
```diff
@@ -1,6 +1,6 @@
 import ctypes
-from ctypes.util import find_library
 import os
+import platform
 from glob import glob
 from pathlib import Path
 from typing import List, Dict, Any, Optional
@@ -9,18 +9,28 @@
 
 # Note: Pass -DBUILD_SHARED_LIBS=ON to cmake to create the shared library file
 
+
+def find_library(name):
+    os_name = platform.system()
+    if os_name == "Linux":
+        return f"./lib{name}.so"
+    elif os_name == "Windows":
+        return f"{name}.dll"
+    elif os_name == "Mac":
+        return f"lib{name}.dylib"
+
+
 cur_dir = os.getcwd()
 this_dir = os.path.abspath(os.path.dirname(__file__))
+os.chdir(this_dir)
 
 # Load the shared library
 ggml_lib_path, clip_lib_path = find_library("ggml"), find_library("clip")
-if ggml_lib_path is None or clip_lib_path is None:
-    raise RuntimeError("Could not find shared libraries. Please copy to the current working directory or supply the "
-                       "correct LD_LIBRARY_PATH/DYLD_LIBRARY_PATH.")
-
 ggml_lib = ctypes.CDLL(ggml_lib_path)
 clip_lib = ctypes.CDLL(clip_lib_path)
 
+os.chdir(cur_dir)
+
 
 # Define the ctypes structures
 class ClipTextHparams(ctypes.Structure):
```
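A note on the new `find_library` helper: `platform.system()` reports "Darwin" on macOS rather than "Mac", so the `.dylib` branch above will never match there and the function falls through to return `None`. A more defensive sketch (ours, not part of this commit) could be:

```python
import platform


def find_library(name: str) -> str:
    """Map a base library name to the platform's shared-library filename."""
    os_name = platform.system()
    if os_name == "Windows":
        return f"{name}.dll"
    if os_name == "Darwin":  # platform.system() returns "Darwin" on macOS
        return f"lib{name}.dylib"
    # Fall back to the ELF convention used by Linux and most Unixes
    return f"./lib{name}.so"
```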
```diff
@@ -108,8 +118,7 @@ class ClipContext(ctypes.Structure):
 clip_tokenize.restype = ClipTokens
 
 clip_image_load_from_file = clip_lib.clip_image_load_from_file
-clip_image_load_from_file.argtypes = [
-    ctypes.c_char_p, ctypes.POINTER(ClipImageU8)]
+clip_image_load_from_file.argtypes = [ctypes.c_char_p, ctypes.POINTER(ClipImageU8)]
 clip_image_load_from_file.restype = ctypes.c_bool
 
 clip_image_preprocess = clip_lib.clip_image_preprocess
@@ -191,21 +200,22 @@ def _struct_to_dict(struct):
 
 class Clip:
     def __init__(
-            self,
-            model_path_or_repo_id: str,
-            model_file: Optional[str] = None,
-            verbosity: int = 0):
+        self,
+        model_path_or_repo_id: str,
+        model_file: Optional[str] = None,
+        verbosity: int = 0,
+    ):
         """
         Loads the language model from a local file or remote repo.
 
         Args:
         ---
         :param model_path_or_repo_id: str
-        The path to a model file
+            The path to a model file
         or the name of a Hugging Face model repo.
 
         :param model_file: str | None
-        The name of the model file in Hugging Face repo,
+            The name of the model file in Hugging Face repo,
         if not specified the first bin file from the repo is choosen.
 
         :param verbosity: int { 0, 1, 2 } Default = 0
@@ -239,7 +249,6 @@ def _find_model_path_from_repo(
         repo_id: str,
         filename: Optional[str] = None,
     ) -> str:
-
         repo_info = model_info(
             repo_id=repo_id,
             files_metadata=True,
@@ -255,10 +264,7 @@ def _find_model_path_from_repo(
         return cls._find_model_path_from_dir(path, filename=filename)
 
     @classmethod
-    def _find_model_file_from_repo(
-        cls,
-        repo_info: ModelInfo
-    ) -> Optional[str]:
+    def _find_model_file_from_repo(cls, repo_info: ModelInfo) -> Optional[str]:
         """return the smallest ggml file"""
         files = [
             (f.size, f.rfilename)
@@ -273,13 +279,11 @@ def _find_model_path_from_dir(
         path: str,
         filename: Optional[str] = None,
     ) -> str:
-
         path = Path(path).resolve()
         if filename:
             file = path.joinpath(filename).resolve()
             if not file.is_file():
-                raise ValueError(
-                    f"Model file '{filename}' not found in '{path}'")
+                raise ValueError(f"Model file '{filename}' not found in '{path}'")
             return str(file)
         files = glob(path.joinpath("*.bin"))  # TODO add ".gguf"
         file = min(files, key=lambda x: x[0])[1]
@@ -304,17 +308,16 @@ def encode_text(
         normalize: bool = True,
     ) -> List[float]:
         """
-        Takes Text Converted Tokens and generate the corresponding embeddings.
+        Takes Text Converted Tokens and generate the corresponding embeddings.
         """
-
+
         tokens_array = (ClipVocabId * len(tokens))(*tokens)
         clip_tokens = ClipTokens(data=tokens_array, size=len(tokens))
 
         txt_vec = (ctypes.c_float * self.vec_dim)()
 
         if not clip_text_encode(
-            self.ctx, n_threads, ctypes.pointer(
-                clip_tokens), txt_vec, normalize
+            self.ctx, n_threads, ctypes.pointer(clip_tokens), txt_vec, normalize
        ):
             raise RuntimeError("Could not encode text")
 
```
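As a usage sketch for `encode_text` above; the `tokenize` call is assumed from the project README rather than shown in this diff, and the prompt text is illustrative:

```python
from clip_cpp import Clip

model = Clip('Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k')

# tokenize() produces the token ids that encode_text() consumes
tokens = model.tokenize("a photo of a cat")
embedding = model.encode_text(tokens)  # List[float] of length model.vec_dim
```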

```diff
@@ -360,8 +363,7 @@ def compare_text_and_image(
 
         score = ctypes.c_float()
         if not clip_compare_text_and_image(
-            self.ctx, n_threads, text.encode(
-                "utf8"), image_ptr, ctypes.pointer(score)
+            self.ctx, n_threads, text.encode("utf8"), image_ptr, ctypes.pointer(score)
         ):
             raise RuntimeError("Could not compare text and image")
 
```

examples/python_bindings/clip_cpp/file_download.py

Lines changed: 87 additions & 26 deletions
```diff
@@ -7,6 +7,29 @@
 from .exceptions import RepositoryNotFoundError, RepositoryFileNameNotFound
 
 
+class Model:
+    def __init__(
+        self,
+        _id: str,
+        id: str,
+        likes: int,
+        private: bool,
+        downloads: int,
+        tags: List[str],
+        modelId: str,
+    ):
+        self._id = _id
+        self.id = id
+        self.likes = likes
+        self.private = private
+        self.downloads = downloads
+        self.tags = tags
+        self.modelId = modelId
+
+    def __str__(self):
+        return f"repo_id: {self.modelId}"
+
+
 class BlobLfsInfo(TypedDict, total=False):
     size: int
     sha256: str
```
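The new `Model` class mirrors the JSON objects returned by the Hub's `/api/models` endpoint, so each item can be splatted straight into the constructor. A hedged construction sketch with illustrative field values:

```python
# Shaped like one element of the Hub's /api/models response (values illustrative)
item = {
    "_id": "656f0a1b2c3d4e5f6a7b8c9d",
    "id": "Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k",
    "likes": 3,
    "private": False,
    "downloads": 120,
    "tags": ["clip.cpp"],
    "modelId": "Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k",
}

model = Model(**item)
print(model)  # -> repo_id: Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k
```

Note that, unlike `ModelInfo` below, `Model` takes no `**kwargs`, so it accepts exactly these fields and no others.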
```diff
@@ -33,7 +56,6 @@ def __init__(
 
 
 class ModelInfo:
-
     def __init__(
         self,
         *,
@@ -49,14 +71,14 @@ def __init__(
         securityStatus: Optional[Dict] = None,
         **kwargs,
     ):
-
         self.modelId = modelId
         self.sha = sha
         self.lastModified = lastModified
         self.tags = tags
         self.pipeline_tag = pipeline_tag
-        self.siblings = [RepoFile(**x)
-                         for x in siblings] if siblings is not None else []
+        self.siblings = (
+            [RepoFile(**x) for x in siblings] if siblings is not None else []
+        )
         self.private = private
         self.author = author
         self.config = config
@@ -75,9 +97,9 @@ def model_info(
     repo_id: str,
     files_metadata: bool = False,
 ) -> ModelInfo:
-
-    get_files_metadata = urllib.parse.urlencode(
-        {'blobs': files_metadata}) if files_metadata else ""
+    get_files_metadata = (
+        urllib.parse.urlencode({"blobs": files_metadata}) if files_metadata else ""
+    )
     url = f"https://huggingface.co/api/models/{repo_id}/?" + get_files_metadata
     try:
         response = urllib.request.urlopen(url)
@@ -86,7 +108,7 @@ def model_info(
     except urllib.error.HTTPError as e:
         if e.code == 401:
             raise RepositoryNotFoundError
-        elif e.peek().decode('utf-8', errors='ignore') == "Entry not found":
+        elif e.peek().decode("utf-8", errors="ignore") == "Entry not found":
             raise RepositoryFileNameNotFound
         else:
             print(f"\nError getting Info about the repo_id: {e}")
@@ -95,54 +117,93 @@ def model_info(
     print(f"\nError getting Info about the repo_id: {e}")
 
 
-def model_download(
-    repo_id: str,
-    file_name: Union[str, List[str]]
-) -> str:
-    ''' Download HF model and returns the Path of the Downloaded file'''
+def model_download(repo_id: str, file_name: Union[str, List[str]]) -> str:
+    """Download HF model and returns the Path of the Downloaded file"""
 
-    # create a model dirictory to save the files neatly in one folder
-    models_dir = Path('./models/')
+    # create a model directory to save the files neatly in one folder
+    models_dir = Path("./models/")
     models_dir.mkdir(exist_ok=True)
     destination_path = models_dir.joinpath(file_name)
 
-    url = f'https://huggingface.co/{repo_id}/resolve/main/{file_name}'
+    url = f"https://huggingface.co/{repo_id}/resolve/main/{file_name}"
 
     def reporthook(count, block_size, total_size):
         # Calculate the progress
         downloaded_chunk = count * block_size
         progress = (downloaded_chunk / total_size) * 100
 
         # print(downloaded_chunk // total_size)
-        bar = ''.join(['=' if i <= progress/2 else ' ' for i in range(50)])
+        bar = "".join(["=" if i <= progress / 2 else " " for i in range(50)])
         sys.stdout.write(
-            f"\r[{bar}] {progress:.1f}% ({downloaded_chunk/1024**2:.2f} MB/{total_size/1024**2:.0f} MB)")
+            f"\r[{bar}] {progress:.1f}% ({downloaded_chunk/1024**2:.2f} MB/{total_size/1024**2:.0f} MB)"
+        )
         sys.stdout.flush()
 
     try:
         print(f"[File Info] {destination_path}")
         # check if the file exists and matches the size of that in the network
-        network_file_size = (urllib.request
-                             .urlopen(url)
-                             .info().get('Content-Length', 0))
-        if destination_path.is_file() \
-                and destination_path.stat().st_size == int(network_file_size):
+        network_file_size = urllib.request.urlopen(url).info().get("Content-Length", 0)
+        if destination_path.is_file() and destination_path.stat().st_size == int(
+            network_file_size
+        ):
             # raise FileNameAlreadyExists
             return models_dir
 
-        urllib.request.urlretrieve(
-            url, destination_path, reporthook=reporthook)
+        urllib.request.urlretrieve(url, destination_path, reporthook=reporthook)
         sys.stdout.write("\n")
         print(f"File downloaded to {destination_path}")
         return models_dir
 
     except urllib.error.HTTPError as e:
         if e.code == 401:
             raise RepositoryNotFoundError
-        elif e.peek().decode('utf-8', errors='ignore') == "Entry not found":
+        elif e.peek().decode("utf-8", errors="ignore") == "Entry not found":
             raise RepositoryFileNameNotFound
         else:
             print(f"\nError downloading file: {e}")
 
     except Exception as e:
         print(f"\nError downloading file: {e}")
+
+
+def get_models() -> List[Model]:
+    url = f"https://huggingface.co/api/models?filter=clip.cpp"
+    try:
+        response = urllib.request.urlopen(url)
+        data = json.load(response)
+        return [Model(**item) for item in data]
+    except urllib.error.HTTPError as e:
+        print(f"\nError listing available models: {e}")
+
+    except Exception as e:
+        print(f"\nError listing available models: {e}")
+
+
+def available_models():
+    if len(sys.argv) > 1 and sys.argv[-1] != "clip-cpp-models":
+        repo_id = sys.argv[-1]
+        repo = model_info(repo_id=repo_id, files_metadata=True)
+        print(f"Available models in repo {repo_id}:")
+        for model in repo.siblings:
+            if model.rfilename.endswith(".bin"):
+                name = model.rfilename
+                size = model.size / (1024 * 1024)
+                print(f"  model: {name} ({size:.2f} MB)")
+
+        return
+
+    models = get_models()
+    print(
+        "Below are available models on HuggingFace Hub that you can use with the clip-cpp package.\n"
+    )
+    print("Available models:")
+    for model in models:
+        print(model)
+
+    print(
+        "\nYou can pass one of the repo IDs above directly to `Clip()`, and it will download the smallest model in that repo automatically."
+    )
+    print(
+        "Alternatively, you can choose which to load from that repo by passing a value to `model_file` argument."
+    )
+    print("\nTo see model files in a repo, type `clip-cpp-models <repo_id>`")
```

examples/python_bindings/pyproject.toml

Lines changed: 4 additions & 1 deletion
```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "clip_cpp"
-version = "0.4.0"
+version = "0.4.1"
 description = "CLIP inference with no big dependencies as PyTorch, TensorFlow, Numpy"
 authors = ["Yusuf Sarıgöz <[email protected]>"]
 packages = [{ include = "clip_cpp" }]
@@ -25,3 +25,6 @@ pytest = "^5.2"
 [build-system]
 requires = ["poetry-core>=1.7.0"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.scripts]
+clip-cpp-models = 'clip_cpp.file_download:available_models'
```
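The new `[tool.poetry.scripts]` entry installs a `clip-cpp-models` console command that dispatches to `available_models()`. A hedged way to exercise the same entry point from a plain Python session, without installing the script, is to shim `sys.argv` the way the CLI would set it:

```python
import sys

from clip_cpp.file_download import available_models

# Mimic `clip-cpp-models <repo_id>`; available_models() reads sys.argv[-1]
sys.argv = ["clip-cpp-models", "Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k"]
available_models()

# With no extra argument it lists every clip.cpp repo on the Hub instead
sys.argv = ["clip-cpp-models"]
available_models()
```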
