Skip to content

python: List models on HF Hub #68

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Sep 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions examples/python_bindings/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ All you need to do is to compile with the `-DBUILD_SHARED_LIBS=ON` option and co
from clip_cpp import Clip

## you can either pass repo_id or .bin file
## incase you pass repo_id and it has more than .bin file
## it's recommended to spacify which file to download with `model_file`
## you can type `clip-cpp-models` in your terminal to see what models are available for download
## in case you pass a repo_id and the repo contains more than one .bin file,
## it's recommended to specify which file to download with `model_file`
repo_id = 'Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k'
model_file = 'laion_clip-vit-b-32-laion2b-s34b-b79k.ggmlv0.f16.bin'

Expand Down
56 changes: 29 additions & 27 deletions examples/python_bindings/clip_cpp/clip.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import ctypes
from ctypes.util import find_library
import os
import platform
from glob import glob
from pathlib import Path
from typing import List, Dict, Any, Optional
Expand All @@ -9,18 +9,28 @@

# Note: Pass -DBUILD_SHARED_LIBS=ON to cmake to create the shared library file


def find_library(name):
    """Return the shared-library file name expected for ``name`` on this OS.

    :param name: bare library name without prefix or extension, e.g. ``"clip"``.
    :return: ``./lib<name>.so`` on Linux, ``<name>.dll`` on Windows,
        ``lib<name>.dylib`` on macOS, or ``None`` on any other platform.
    """
    os_name = platform.system()
    if os_name == "Linux":
        return f"./lib{name}.so"
    if os_name == "Windows":
        return f"{name}.dll"
    # platform.system() reports macOS as "Darwin"; it never returns "Mac",
    # so comparing against "Mac" made this branch unreachable.
    if os_name == "Darwin":
        return f"lib{name}.dylib"
    # Unknown platform: callers check for None before loading the library.
    return None


cur_dir = os.getcwd()
this_dir = os.path.abspath(os.path.dirname(__file__))
os.chdir(this_dir)

# Load the shared library
ggml_lib_path, clip_lib_path = find_library("ggml"), find_library("clip")
if ggml_lib_path is None or clip_lib_path is None:
raise RuntimeError("Could not find shared libraries. Please copy to the current working directory or supply the "
"correct LD_LIBRARY_PATH/DYLD_LIBRARY_PATH.")

ggml_lib = ctypes.CDLL(ggml_lib_path)
clip_lib = ctypes.CDLL(clip_lib_path)

os.chdir(cur_dir)


# Define the ctypes structures
class ClipTextHparams(ctypes.Structure):
Expand Down Expand Up @@ -108,8 +118,7 @@ class ClipContext(ctypes.Structure):
clip_tokenize.restype = ClipTokens

clip_image_load_from_file = clip_lib.clip_image_load_from_file
clip_image_load_from_file.argtypes = [
ctypes.c_char_p, ctypes.POINTER(ClipImageU8)]
clip_image_load_from_file.argtypes = [ctypes.c_char_p, ctypes.POINTER(ClipImageU8)]
clip_image_load_from_file.restype = ctypes.c_bool

clip_image_preprocess = clip_lib.clip_image_preprocess
Expand Down Expand Up @@ -191,21 +200,22 @@ def _struct_to_dict(struct):

class Clip:
def __init__(
self,
model_path_or_repo_id: str,
model_file: Optional[str] = None,
verbosity: int = 0):
self,
model_path_or_repo_id: str,
model_file: Optional[str] = None,
verbosity: int = 0,
):
"""
Loads the language model from a local file or remote repo.

Args:
---
:param model_path_or_repo_id: str
The path to a model file
The path to a model file
or the name of a Hugging Face model repo.

:param model_file: str | None
The name of the model file in Hugging Face repo,
The name of the model file in Hugging Face repo,
if not specified, the smallest model file in the repo is chosen.

:param verbosity: int { 0, 1, 2 } Default = 0
Expand Down Expand Up @@ -239,7 +249,6 @@ def _find_model_path_from_repo(
repo_id: str,
filename: Optional[str] = None,
) -> str:

repo_info = model_info(
repo_id=repo_id,
files_metadata=True,
Expand All @@ -255,10 +264,7 @@ def _find_model_path_from_repo(
return cls._find_model_path_from_dir(path, filename=filename)

@classmethod
def _find_model_file_from_repo(
cls,
repo_info: ModelInfo
) -> Optional[str]:
def _find_model_file_from_repo(cls, repo_info: ModelInfo) -> Optional[str]:
"""return the smallest ggml file"""
files = [
(f.size, f.rfilename)
Expand All @@ -273,13 +279,11 @@ def _find_model_path_from_dir(
path: str,
filename: Optional[str] = None,
) -> str:

path = Path(path).resolve()
if filename:
file = path.joinpath(filename).resolve()
if not file.is_file():
raise ValueError(
f"Model file '{filename}' not found in '{path}'")
raise ValueError(f"Model file '{filename}' not found in '{path}'")
return str(file)
files = glob(path.joinpath("*.bin")) # TODO add ".gguf"
file = min(files, key=lambda x: x[0])[1]
Expand All @@ -304,17 +308,16 @@ def encode_text(
normalize: bool = True,
) -> List[float]:
"""
Takes Text Converted Tokens and generate the corresponding embeddings.
Takes Text Converted Tokens and generate the corresponding embeddings.
"""

tokens_array = (ClipVocabId * len(tokens))(*tokens)
clip_tokens = ClipTokens(data=tokens_array, size=len(tokens))

txt_vec = (ctypes.c_float * self.vec_dim)()

if not clip_text_encode(
self.ctx, n_threads, ctypes.pointer(
clip_tokens), txt_vec, normalize
self.ctx, n_threads, ctypes.pointer(clip_tokens), txt_vec, normalize
):
raise RuntimeError("Could not encode text")

Expand Down Expand Up @@ -360,8 +363,7 @@ def compare_text_and_image(

score = ctypes.c_float()
if not clip_compare_text_and_image(
self.ctx, n_threads, text.encode(
"utf8"), image_ptr, ctypes.pointer(score)
self.ctx, n_threads, text.encode("utf8"), image_ptr, ctypes.pointer(score)
):
raise RuntimeError("Could not compare text and image")

Expand Down
113 changes: 87 additions & 26 deletions examples/python_bindings/clip_cpp/file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,29 @@
from .exceptions import RepositoryNotFoundError, RepositoryFileNameNotFound


class Model:
    """One model repository entry from the Hugging Face Hub model listing.

    Instances are created with ``Model(**item)`` from decoded JSON objects,
    so the parameter names deliberately mirror the JSON keys (including
    ``id`` and ``_id``) and must not be renamed.
    """

    def __init__(
        self,
        _id: str,
        id: str,
        likes: int,
        private: bool,
        downloads: int,
        tags: List[str],
        modelId: str,
        **kwargs,
    ):
        """Store the listing fields used by this package.

        :param _id: value of the ``_id`` key of the entry.
        :param id: value of the ``id`` key of the entry.
        :param likes: value of the ``likes`` key.
        :param private: value of the ``private`` key.
        :param downloads: value of the ``downloads`` key.
        :param tags: value of the ``tags`` key.
        :param modelId: value of the ``modelId`` key; shown by ``__str__``.
        :param kwargs: any further keys present in the entry. They are
            accepted and ignored so that additional fields in the response
            do not raise ``TypeError`` during ``Model(**item)``.
        """
        self._id = _id
        self.id = id
        self.likes = likes
        self.private = private
        self.downloads = downloads
        self.tags = tags
        self.modelId = modelId

    def __str__(self):
        """Return the one-line form printed by the model listing command."""
        return f"repo_id: {self.modelId}"


class BlobLfsInfo(TypedDict, total=False):
size: int
sha256: str
Expand All @@ -33,7 +56,6 @@ def __init__(


class ModelInfo:

def __init__(
self,
*,
Expand All @@ -49,14 +71,14 @@ def __init__(
securityStatus: Optional[Dict] = None,
**kwargs,
):

self.modelId = modelId
self.sha = sha
self.lastModified = lastModified
self.tags = tags
self.pipeline_tag = pipeline_tag
self.siblings = [RepoFile(**x)
for x in siblings] if siblings is not None else []
self.siblings = (
[RepoFile(**x) for x in siblings] if siblings is not None else []
)
self.private = private
self.author = author
self.config = config
Expand All @@ -75,9 +97,9 @@ def model_info(
repo_id: str,
files_metadata: bool = False,
) -> ModelInfo:

get_files_metadata = urllib.parse.urlencode(
{'blobs': files_metadata}) if files_metadata else ""
get_files_metadata = (
urllib.parse.urlencode({"blobs": files_metadata}) if files_metadata else ""
)
url = f"https://huggingface.co/api/models/{repo_id}/?" + get_files_metadata
try:
response = urllib.request.urlopen(url)
Expand All @@ -86,7 +108,7 @@ def model_info(
except urllib.error.HTTPError as e:
if e.code == 401:
raise RepositoryNotFoundError
elif e.peek().decode('utf-8', errors='ignore') == "Entry not found":
elif e.peek().decode("utf-8", errors="ignore") == "Entry not found":
raise RepositoryFileNameNotFound
else:
print(f"\nError getting Info about the repo_id: {e}")
Expand All @@ -95,54 +117,93 @@ def model_info(
print(f"\nError getting Info about the repo_id: {e}")


def model_download(repo_id: str, file_name: Union[str, List[str]]) -> str:
    """Download ``file_name`` from the Hugging Face repo ``repo_id`` into ``./models/``.

    The download is skipped when a local file of the same name already exists
    and its size equals the ``Content-Length`` reported by the server.

    :param repo_id: Hugging Face repository id used to build the download URL.
    :param file_name: name of the file inside the repo.
        NOTE(review): the annotation also allows a list, but the value is
        passed straight to ``Path.joinpath`` and the URL, so only a single
        string appears to be supported - confirm.
    :return: the ``./models/`` directory object that holds the file (not the
        file path itself), or ``None`` when an error was caught and printed.
    :raises RepositoryNotFoundError: the server answered with HTTP 401.
    :raises RepositoryFileNameNotFound: the HTTP error body is exactly
        ``"Entry not found"``.
    """
    # create a model directory to save the files neatly in one folder
    models_dir = Path("./models/")
    models_dir.mkdir(exist_ok=True)
    destination_path = models_dir.joinpath(file_name)

    url = f"https://huggingface.co/{repo_id}/resolve/main/{file_name}"

    def reporthook(count, block_size, total_size):
        """Draw a 50-character progress bar on stdout for ``urlretrieve``."""
        downloaded_chunk = count * block_size
        if total_size > 0:
            # The last block is usually only partly filled; clamp so the
            # display never exceeds 100%.
            downloaded_chunk = min(downloaded_chunk, total_size)
            progress = (downloaded_chunk / total_size) * 100
        else:
            # Size unknown (urlretrieve passes -1) or zero: avoid a negative
            # percentage or a division by zero.
            total_size = 0
            progress = 0.0

        bar = "".join(["=" if i <= progress / 2 else " " for i in range(50)])
        sys.stdout.write(
            f"\r[{bar}] {progress:.1f}% ({downloaded_chunk/1024**2:.2f} MB/{total_size/1024**2:.0f} MB)"
        )
        sys.stdout.flush()

    try:
        print(f"[File Info] {destination_path}")
        # check if the file exists and matches the size of that in the network;
        # the probe response is closed right away so the connection is not leaked
        with urllib.request.urlopen(url) as probe:
            network_file_size = probe.info().get("Content-Length", 0)
        if destination_path.is_file() and destination_path.stat().st_size == int(
            network_file_size
        ):
            return models_dir

        urllib.request.urlretrieve(url, destination_path, reporthook=reporthook)
        sys.stdout.write("\n")
        print(f"File downloaded to {destination_path}")
        return models_dir

    except urllib.error.HTTPError as e:
        if e.code == 401:
            raise RepositoryNotFoundError
        elif e.peek().decode("utf-8", errors="ignore") == "Entry not found":
            raise RepositoryFileNameNotFound
        else:
            print(f"\nError downloading file: {e}")

    except Exception as e:
        # Best-effort: any other failure is reported and None is returned.
        print(f"\nError downloading file: {e}")


def get_models() -> List[Model]:
    """Fetch the Hugging Face Hub model listing filtered by ``clip.cpp``.

    :return: one ``Model`` per entry of the JSON response. If the request,
        the JSON decoding or the ``Model`` construction fails, the error is
        printed and an empty list is returned, so callers can always iterate
        over the result.
    """
    url = "https://huggingface.co/api/models?filter=clip.cpp"
    try:
        # Close the HTTP response as soon as the JSON body has been read.
        with urllib.request.urlopen(url) as response:
            data = json.load(response)
        return [Model(**item) for item in data]
    except Exception as e:
        # HTTPError is an Exception subclass and was reported with the same
        # message, so a single handler covers both original branches.
        print(f"\nError listing available models: {e}")
        return []


def available_models():
    """Command-line entry point that lists models usable with this package.

    With a trailing command-line argument other than ``clip-cpp-models``,
    that argument is taken as a repo id and the ``.bin`` files of the repo
    are printed with their sizes in MB. Otherwise the repos returned by
    ``get_models()`` are printed, followed by usage hints.
    """
    if len(sys.argv) > 1 and sys.argv[-1] != "clip-cpp-models":
        repo_id = sys.argv[-1]
        repo = model_info(repo_id=repo_id, files_metadata=True)
        if repo is None:
            # model_info prints the error and returns None on some failures;
            # stop here instead of failing on `repo.siblings`.
            print(f"Could not list model files for repo {repo_id}.")
            return

        print(f"Available models in repo {repo_id}:")
        for model in repo.siblings:
            if model.rfilename.endswith(".bin"):
                name = model.rfilename
                size = model.size / (1024 * 1024)
                print(f"  model: {name} ({size:.2f} MB)")

        return

    models = get_models()
    if models is None:
        # The listing failed and the error has already been printed.
        return

    print(
        "Below are available models on HuggingFace Hub that you can use with the clip-cpp package.\n"
    )
    print("Available models:")
    for model in models:
        print(model)

    print(
        "\nYou can pass one of the repo IDs above directly to `Clip()`, and it will download the smallest model in that repo automatically."
    )
    print(
        "Alternatively, you can choose which to load from that repo by passing a value to `model_file` argument."
    )
    print("\nTo see model files in a repo, type `clip-cpp-models <repo_id>`")
5 changes: 4 additions & 1 deletion examples/python_bindings/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "clip_cpp"
version = "0.4.0"
version = "0.4.1"
description = "CLIP inference with no big dependencies as PyTorch, TensorFlow, Numpy"
authors = ["Yusuf Sarıgöz <[email protected]>"]
packages = [{ include = "clip_cpp" }]
Expand All @@ -25,3 +25,6 @@ pytest = "^5.2"
[build-system]
requires = ["poetry-core>=1.7.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
clip-cpp-models = 'clip_cpp.file_download:available_models'