Merge branch 'abetlen:main' into main

Maximilian-Winter · web-flow · commit c6a965997278 · 2023-05-25T17:09:19.000+02:00
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -0,0 +1,51 @@
+# Define the image argument and provide a default value
+ARG IMAGE=python:3-slim-bullseye
+
+# Use the image as specified
+FROM ${IMAGE}
+
+# Re-declare the ARG after FROM
+ARG IMAGE
+
+# Update and upgrade the existing packages
+RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    ninja-build \
+    build-essential
+
+RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
+
+# Perform the conditional installations based on the image
+RUN echo "Image: ${IMAGE}" && \
+    if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
+    echo "OpenBLAS install:" && \
+    apt-get install -y --no-install-recommends libopenblas-dev && \
+    LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
+else \
+    echo "CuBLAS install:" && \
+    LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
+fi
+
+# Clean up apt cache
+RUN rm -rf /var/lib/apt/lists/*
+
+# Set a working directory for better clarity
+WORKDIR /app
+
+# Copy files to the app directory
+RUN echo "Installing model...this can take some time..."
+COPY ./model.bin /app/model.bin
+COPY ./start_server.sh /app/start_server.sh
+
+# Make the server start script executable
+RUN chmod +x /app/start_server.sh
+
+# Set the host, and persist IMAGE (an ARG exists only at build time) so start_server.sh can test it
+ENV HOST=0.0.0.0 IMAGE=${IMAGE}
+
+# Expose a port for the server
+EXPOSE 8000
+
+# Run the server start script
+CMD ["/bin/sh", "/app/start_server.sh"]
diff --git a/docker/Dockerfile.cuda_simple b/docker/Dockerfile.cuda_simple
diff --git a/docker/Dockerfile.openblas_simple b/docker/Dockerfile.openblas_simple
diff --git a/docker/README.md b/docker/README.md
@@ -0,0 +1,46 @@
+# Dockerfiles for building the llama-cpp-python server
+- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS
+- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS
+- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
+- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
+ 
+# Get model from Hugging Face
+`python3 ./hug_model.py`
+
+You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
+```
+docker $ ls -lh *.bin
+-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>.q5_1.bin
+lrwxrwxrwx 1 user user   24 May 23 18:30 model.bin -> <downloaded-model-file>.q5_1.bin
+```
+**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
+**TWICE** as much disk space as the size of the model:
+
+| Model |  Quantized size |
+|------:|----------------:|
+|    7B |            5 GB |
+|   13B |           10 GB |
+|   30B |           25 GB |
+|   65B |           50 GB |
+
+**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
+
+# Install Docker Server
+
+**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
+
+[Install Docker Engine](https://docs.docker.com/engine/install)
+
+# Use OpenBLAS
+Use this if you don't have an NVIDIA GPU. Defaults to the `python:3-slim-bullseye` Docker base image and OpenBLAS:
+## Build:
+`docker build -t openblas .`
+## Run:
+`docker run --cap-add SYS_RESOURCE -t openblas`
+
+# Use CuBLAS
+Requires an NVIDIA GPU with sufficient VRAM (approximately as much as the quantized model size in the table above) and Docker NVIDIA support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
+## Build:
+`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
+## Run:
+`docker run --cap-add SYS_RESOURCE -t cublas`
diff --git a/docker/hug_model.py b/docker/hug_model.py
@@ -0,0 +1,116 @@
+import requests
+import json
+import os
+import struct
+
+def make_request(url, params=None):
+    print(f"Making request to {url}...")
+    response = requests.get(url, params=params)
+    if response.status_code == 200:
+        return json.loads(response.text)
+    else:
+        print(f"Request failed with status code {response.status_code}")
+        return None
+
+def check_magic_and_version(filename):
+    with open(filename, 'rb') as f:
+        # Read the first 6 bytes from the file
+        data = f.read(6)
+
+    # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int
+    # and the next 2 bytes as a little-endian unsigned short
+    magic, version = struct.unpack('<I H', data)
+
+    print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")
+
+    return magic, version
+
+def download_file(url, destination):
+    print(f"Downloading {url} to {destination}...")
+    response = requests.get(url, stream=True)
+    if response.status_code == 200:
+        with open(destination, 'wb') as f:
+            total_downloaded = 0
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:  # filter out keep-alive new chunks
+                    f.write(chunk)
+                    total_downloaded += len(chunk)
+                    if total_downloaded >= 10485760:  # 10 MB
+                        print('.', end='', flush=True)
+                        total_downloaded = 0
+        print("\nDownload complete.")
+        
+        # Creating a symbolic link from destination to "model.bin"
+        if os.path.isfile("model.bin"):
+            os.remove("model.bin")  # remove the existing link if any
+        os.symlink(destination, "model.bin")
+    else:
+        print(f"Download failed with status code {response.status_code}")
+
+def get_user_choice(model_list):
+    # Print the enumerated list
+    print("\n")
+    for i, (model_id, rfilename) in enumerate(model_list):
+        print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}")
+
+    # Get user's choice
+    choice = input("Choose a model to download by entering the corresponding number: ")
+    try:
+        index = int(choice) - 1
+        if 0 <= index < len(model_list):
+            # Return the chosen model
+            return model_list[index]
+        else:
+            print("Invalid choice.")
+    except ValueError:
+        print("Invalid input. Please enter a number corresponding to a model.")
+    except IndexError:
+        print("Invalid choice. Index out of range.")
+    
+    return None
+
+import argparse
+
+def main():
+    # Create an argument parser
+    parser = argparse.ArgumentParser(description='Process the model version.')
+    parser.add_argument('-v', '--version', type=int, default=0x0003,
+                        help='an integer for the version to be used')
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Define the parameters
+    params = {
+        "author": "TheBloke",  # Filter by author
+        "tags": "llama"
+    }
+
+    models = make_request('https://huggingface.co/api/models', params=params)
+    if models is None:
+        return
+
+    model_list = []
+    # Iterate over the models
+    for model in models:
+        model_id = model['id']
+        model_info = make_request(f'https://huggingface.co/api/models/{model_id}')
+        if model_info is None:
+            continue
+
+        for sibling in model_info.get('siblings', []):
+            rfilename = sibling.get('rfilename')
+            if rfilename and 'q5_1' in rfilename:
+                model_list.append((model_id, rfilename))
+
+    model_choice = get_user_choice(model_list)
+    if model_choice is not None:
+        model_id, rfilename = model_choice
+        url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
+        download_file(url, rfilename)
+        _, version = check_magic_and_version(rfilename)
+        if version != args.version:
+            print(f"Warning: Expected version {args.version}, but found different version in the file.")
+
+if __name__ == '__main__':
+    main()  # run the interactive downloader only when executed as a script
diff --git a/docker/start_server.sh b/docker/start_server.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+# For mmap support
+ulimit -l unlimited
+
+if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
+    python3 -B -m llama_cpp.server --model /app/model.bin
+else
+    # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
+    python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
+fi