
Commit b6f6385
python: List models on HF Hub (#68)

* rm __pycache__
* gitignore __pycache__
* gitignore dist
* fix: use find_library instead of assuming library extension
* Output all libraries in the root build dir
* Add scripts
* Install with cmake
* python: list models
* python: fix find_library
* python: list repos and models in a repo

Co-authored-by: denis-ismailaj <[email protected]>

1 parent 8b667eb commit b6f6385

4 files changed, +123 -56 lines changed

examples/python_bindings/README.md

Lines changed: 3 additions & 2 deletions
```diff
@@ -29,8 +29,9 @@ All you need to do is to compile with the `-DBUILD_SHARED_LIBS=ON` option and co
 from clip_cpp import Clip
 
 ## you can either pass repo_id or .bin file
-## incase you pass repo_id and it has more than .bin file
-## it's recommended to spacify which file to download with `model_file`
+## you can type `clip-cpp-models` in your terminal to see what models are available for download
+## in case you pass repo_id and it has more than .bin file
+## it's recommended to specify which file to download with `model_file`
 repo_id = 'Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k'
 model_file = 'laion_clip-vit-b-32-laion2b-s34b-b79k.ggmlv0.f16.bin'
```
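To complete the README snippet above, here is a minimal load sketch; only the final `Clip(...)` call is our addition, and its signature matches the `Clip.__init__` shown in `clip.py` below:

```python
from clip_cpp import Clip

## values from the README snippet above
repo_id = 'Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k'
model_file = 'laion_clip-vit-b-32-laion2b-s34b-b79k.ggmlv0.f16.bin'

## downloads the file from the Hub on first use, then loads it
model = Clip(repo_id, model_file=model_file, verbosity=1)
```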

examples/python_bindings/clip_cpp/clip.py

Lines changed: 29 additions & 27 deletions
```diff
@@ -1,6 +1,6 @@
 import ctypes
-from ctypes.util import find_library
 import os
+import platform
 from glob import glob
 from pathlib import Path
 from typing import List, Dict, Any, Optional
@@ -9,18 +9,28 @@
 
 # Note: Pass -DBUILD_SHARED_LIBS=ON to cmake to create the shared library file
 
+
+def find_library(name):
+    os_name = platform.system()
+    if os_name == "Linux":
+        return f"./lib{name}.so"
+    elif os_name == "Windows":
+        return f"{name}.dll"
+    elif os_name == "Mac":
+        return f"lib{name}.dylib"
+
+
 cur_dir = os.getcwd()
 this_dir = os.path.abspath(os.path.dirname(__file__))
+os.chdir(this_dir)
 
 # Load the shared library
 ggml_lib_path, clip_lib_path = find_library("ggml"), find_library("clip")
-if ggml_lib_path is None or clip_lib_path is None:
-    raise RuntimeError("Could not find shared libraries. Please copy to the current working directory or supply the "
-                       "correct LD_LIBRARY_PATH/DYLD_LIBRARY_PATH.")
-
 ggml_lib = ctypes.CDLL(ggml_lib_path)
 clip_lib = ctypes.CDLL(clip_lib_path)
 
+os.chdir(cur_dir)
+
 
 # Define the ctypes structures
 class ClipTextHparams(ctypes.Structure):
```
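A note on the new `find_library` helper: `platform.system()` reports "Darwin" on macOS rather than "Mac", so the `.dylib` branch above will never match there and the function falls through to return `None`. A more defensive sketch (ours, not part of this commit) could be:

```python
import platform


def find_library(name: str) -> str:
    """Map a base library name to the platform's shared-library filename."""
    os_name = platform.system()
    if os_name == "Windows":
        return f"{name}.dll"
    if os_name == "Darwin":  # platform.system() returns "Darwin" on macOS
        return f"lib{name}.dylib"
    # Fall back to the ELF convention used by Linux and most Unixes
    return f"./lib{name}.so"
```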
```diff
@@ -108,8 +118,7 @@ class ClipContext(ctypes.Structure):
 clip_tokenize.restype = ClipTokens
 
 clip_image_load_from_file = clip_lib.clip_image_load_from_file
-clip_image_load_from_file.argtypes = [
-    ctypes.c_char_p, ctypes.POINTER(ClipImageU8)]
+clip_image_load_from_file.argtypes = [ctypes.c_char_p, ctypes.POINTER(ClipImageU8)]
 clip_image_load_from_file.restype = ctypes.c_bool
 
 clip_image_preprocess = clip_lib.clip_image_preprocess
@@ -191,21 +200,22 @@ def _struct_to_dict(struct):
 
 class Clip:
     def __init__(
-            self,
-            model_path_or_repo_id: str,
-            model_file: Optional[str] = None,
-            verbosity: int = 0):
+        self,
+        model_path_or_repo_id: str,
+        model_file: Optional[str] = None,
+        verbosity: int = 0,
+    ):
         """
         Loads the language model from a local file or remote repo.
 
         Args:
         ---
         :param model_path_or_repo_id: str
-        The path to a model file
+            The path to a model file
         or the name of a Hugging Face model repo.
 
         :param model_file: str | None
-        The name of the model file in Hugging Face repo,
+            The name of the model file in Hugging Face repo,
         if not specified the first bin file from the repo is choosen.
 
         :param verbosity: int { 0, 1, 2 } Default = 0
@@ -239,7 +249,6 @@ def _find_model_path_from_repo(
         repo_id: str,
         filename: Optional[str] = None,
     ) -> str:
-
         repo_info = model_info(
             repo_id=repo_id,
             files_metadata=True,
@@ -255,10 +264,7 @@ def _find_model_path_from_repo(
         return cls._find_model_path_from_dir(path, filename=filename)
 
     @classmethod
-    def _find_model_file_from_repo(
-        cls,
-        repo_info: ModelInfo
-    ) -> Optional[str]:
+    def _find_model_file_from_repo(cls, repo_info: ModelInfo) -> Optional[str]:
         """return the smallest ggml file"""
         files = [
             (f.size, f.rfilename)
@@ -273,13 +279,11 @@ def _find_model_path_from_dir(
         path: str,
         filename: Optional[str] = None,
     ) -> str:
-
         path = Path(path).resolve()
         if filename:
             file = path.joinpath(filename).resolve()
             if not file.is_file():
-                raise ValueError(
-                    f"Model file '{filename}' not found in '{path}'")
+                raise ValueError(f"Model file '{filename}' not found in '{path}'")
             return str(file)
         files = glob(path.joinpath("*.bin"))  # TODO add ".gguf"
         file = min(files, key=lambda x: x[0])[1]
@@ -304,17 +308,16 @@ def encode_text(
         normalize: bool = True,
     ) -> List[float]:
         """
-        Takes Text Converted Tokens and generate the corresponding embeddings.
+        Takes Text Converted Tokens and generate the corresponding embeddings.
         """
-
+
         tokens_array = (ClipVocabId * len(tokens))(*tokens)
         clip_tokens = ClipTokens(data=tokens_array, size=len(tokens))
 
         txt_vec = (ctypes.c_float * self.vec_dim)()
 
         if not clip_text_encode(
-            self.ctx, n_threads, ctypes.pointer(
-                clip_tokens), txt_vec, normalize
+            self.ctx, n_threads, ctypes.pointer(clip_tokens), txt_vec, normalize
        ):
             raise RuntimeError("Could not encode text")
 
```
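As a usage sketch for `encode_text` above; the `tokenize` call is assumed from the project README rather than shown in this diff, and the prompt text is illustrative:

```python
from clip_cpp import Clip

model = Clip('Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k')

# tokenize() produces the token ids that encode_text() consumes
tokens = model.tokenize("a photo of a cat")
embedding = model.encode_text(tokens)  # List[float] of length model.vec_dim
```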

```diff
@@ -360,8 +363,7 @@ def compare_text_and_image(
 
         score = ctypes.c_float()
         if not clip_compare_text_and_image(
-            self.ctx, n_threads, text.encode(
-                "utf8"), image_ptr, ctypes.pointer(score)
+            self.ctx, n_threads, text.encode("utf8"), image_ptr, ctypes.pointer(score)
         ):
             raise RuntimeError("Could not compare text and image")
 
```

examples/python_bindings/clip_cpp/file_download.py

Lines changed: 87 additions & 26 deletions
```diff
@@ -7,6 +7,29 @@
 from .exceptions import RepositoryNotFoundError, RepositoryFileNameNotFound
 
 
+class Model:
+    def __init__(
+        self,
+        _id: str,
+        id: str,
+        likes: int,
+        private: bool,
+        downloads: int,
+        tags: List[str],
+        modelId: str,
+    ):
+        self._id = _id
+        self.id = id
+        self.likes = likes
+        self.private = private
+        self.downloads = downloads
+        self.tags = tags
+        self.modelId = modelId
+
+    def __str__(self):
+        return f"repo_id: {self.modelId}"
+
+
 class BlobLfsInfo(TypedDict, total=False):
     size: int
     sha256: str
```
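The new `Model` class mirrors the JSON objects returned by the Hub's `/api/models` endpoint, so each item can be splatted straight into the constructor. A hedged construction sketch with illustrative field values:

```python
# Shaped like one element of the Hub's /api/models response (values illustrative)
item = {
    "_id": "656f0a1b2c3d4e5f6a7b8c9d",
    "id": "Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k",
    "likes": 3,
    "private": False,
    "downloads": 120,
    "tags": ["clip.cpp"],
    "modelId": "Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k",
}

model = Model(**item)
print(model)  # -> repo_id: Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k
```

Note that, unlike `ModelInfo` below, `Model` takes no `**kwargs`, so it accepts exactly these fields and no others.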
```diff
@@ -33,7 +56,6 @@ def __init__(
 
 
 class ModelInfo:
-
     def __init__(
         self,
         *,
@@ -49,14 +71,14 @@ def __init__(
         securityStatus: Optional[Dict] = None,
         **kwargs,
     ):
-
         self.modelId = modelId
         self.sha = sha
         self.lastModified = lastModified
         self.tags = tags
         self.pipeline_tag = pipeline_tag
-        self.siblings = [RepoFile(**x)
-                         for x in siblings] if siblings is not None else []
+        self.siblings = (
+            [RepoFile(**x) for x in siblings] if siblings is not None else []
+        )
         self.private = private
         self.author = author
         self.config = config
@@ -75,9 +97,9 @@ def model_info(
     repo_id: str,
     files_metadata: bool = False,
 ) -> ModelInfo:
-
-    get_files_metadata = urllib.parse.urlencode(
-        {'blobs': files_metadata}) if files_metadata else ""
+    get_files_metadata = (
+        urllib.parse.urlencode({"blobs": files_metadata}) if files_metadata else ""
+    )
     url = f"https://huggingface.co/api/models/{repo_id}/?" + get_files_metadata
     try:
         response = urllib.request.urlopen(url)
@@ -86,7 +108,7 @@ def model_info(
     except urllib.error.HTTPError as e:
         if e.code == 401:
             raise RepositoryNotFoundError
-        elif e.peek().decode('utf-8', errors='ignore') == "Entry not found":
+        elif e.peek().decode("utf-8", errors="ignore") == "Entry not found":
             raise RepositoryFileNameNotFound
         else:
             print(f"\nError getting Info about the repo_id: {e}")
@@ -95,54 +117,93 @@ def model_info(
     print(f"\nError getting Info about the repo_id: {e}")
 
 
-def model_download(
-    repo_id: str,
-    file_name: Union[str, List[str]]
-) -> str:
-    ''' Download HF model and returns the Path of the Downloaded file'''
+def model_download(repo_id: str, file_name: Union[str, List[str]]) -> str:
+    """Download HF model and returns the Path of the Downloaded file"""
 
-    # create a model dirictory to save the files neatly in one folder
-    models_dir = Path('./models/')
+    # create a model directory to save the files neatly in one folder
+    models_dir = Path("./models/")
     models_dir.mkdir(exist_ok=True)
     destination_path = models_dir.joinpath(file_name)
 
-    url = f'https://huggingface.co/{repo_id}/resolve/main/{file_name}'
+    url = f"https://huggingface.co/{repo_id}/resolve/main/{file_name}"
 
     def reporthook(count, block_size, total_size):
         # Calculate the progress
         downloaded_chunk = count * block_size
         progress = (downloaded_chunk / total_size) * 100
 
         # print(downloaded_chunk // total_size)
-        bar = ''.join(['=' if i <= progress/2 else ' ' for i in range(50)])
+        bar = "".join(["=" if i <= progress / 2 else " " for i in range(50)])
         sys.stdout.write(
-            f"\r[{bar}] {progress:.1f}% ({downloaded_chunk/1024**2:.2f} MB/{total_size/1024**2:.0f} MB)")
+            f"\r[{bar}] {progress:.1f}% ({downloaded_chunk/1024**2:.2f} MB/{total_size/1024**2:.0f} MB)"
+        )
         sys.stdout.flush()
 
     try:
         print(f"[File Info] {destination_path}")
         # check if the file exists and matches the size of that in the network
-        network_file_size = (urllib.request
-                             .urlopen(url)
-                             .info().get('Content-Length', 0))
-        if destination_path.is_file() \
-                and destination_path.stat().st_size == int(network_file_size):
+        network_file_size = urllib.request.urlopen(url).info().get("Content-Length", 0)
+        if destination_path.is_file() and destination_path.stat().st_size == int(
+            network_file_size
+        ):
             # raise FileNameAlreadyExists
             return models_dir
 
-        urllib.request.urlretrieve(
-            url, destination_path, reporthook=reporthook)
+        urllib.request.urlretrieve(url, destination_path, reporthook=reporthook)
         sys.stdout.write("\n")
         print(f"File downloaded to {destination_path}")
         return models_dir
 
     except urllib.error.HTTPError as e:
         if e.code == 401:
             raise RepositoryNotFoundError
-        elif e.peek().decode('utf-8', errors='ignore') == "Entry not found":
+        elif e.peek().decode("utf-8", errors="ignore") == "Entry not found":
             raise RepositoryFileNameNotFound
         else:
             print(f"\nError downloading file: {e}")
 
     except Exception as e:
         print(f"\nError downloading file: {e}")
+
+
+def get_models() -> List[Model]:
+    url = f"https://huggingface.co/api/models?filter=clip.cpp"
+    try:
+        response = urllib.request.urlopen(url)
+        data = json.load(response)
+        return [Model(**item) for item in data]
+    except urllib.error.HTTPError as e:
+        print(f"\nError listing available models: {e}")
+
+    except Exception as e:
+        print(f"\nError listing available models: {e}")
+
+
+def available_models():
+    if len(sys.argv) > 1 and sys.argv[-1] != "clip-cpp-models":
+        repo_id = sys.argv[-1]
+        repo = model_info(repo_id=repo_id, files_metadata=True)
+        print(f"Available models in repo {repo_id}:")
+        for model in repo.siblings:
+            if model.rfilename.endswith(".bin"):
+                name = model.rfilename
+                size = model.size / (1024 * 1024)
+                print(f"  model: {name} ({size:.2f} MB)")
+
+        return
+
+    models = get_models()
+    print(
+        "Below are available models on HuggingFace Hub that you can use with the clip-cpp package.\n"
+    )
+    print("Available models:")
+    for model in models:
+        print(model)
+
+    print(
+        "\nYou can pass one of the repo IDs above directly to `Clip()`, and it will download the smallest model in that repo automatically."
+    )
+    print(
+        "Alternatively, you can choose which to load from that repo by passing a value to `model_file` argument."
+    )
+    print("\nTo see model files in a repo, type `clip-cpp-models <repo_id>`")
```

examples/python_bindings/pyproject.toml

Lines changed: 4 additions & 1 deletion
```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "clip_cpp"
-version = "0.4.0"
+version = "0.4.1"
 description = "CLIP inference with no big dependencies as PyTorch, TensorFlow, Numpy"
 authors = ["Yusuf Sarıgöz <[email protected]>"]
 packages = [{ include = "clip_cpp" }]
@@ -25,3 +25,6 @@ pytest = "^5.2"
 [build-system]
 requires = ["poetry-core>=1.7.0"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.scripts]
+clip-cpp-models = 'clip_cpp.file_download:available_models'
```
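The new `[tool.poetry.scripts]` entry installs a `clip-cpp-models` console command that dispatches to `available_models()`. A hedged way to exercise the same entry point from a plain Python session, without installing the script, is to shim `sys.argv` the way the CLI would set it:

```python
import sys

from clip_cpp.file_download import available_models

# Mimic `clip-cpp-models <repo_id>`; available_models() reads sys.argv[-1]
sys.argv = ["clip-cpp-models", "Green-Sky/ggml_laion_clip-vit-b-32-laion2b-s34b-b79k"]
available_models()

# With no extra argument it lists every clip.cpp repo on the Hub instead
sys.argv = ["clip-cpp-models"]
available_models()
```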
