Skip to content

Adding support for edsnlp library #424

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions .github/workflows/python-api-edsnlp-cd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Continuous delivery: rebuild the edsnlp inference image and trigger the
# API deployment whenever its sources change on main.
name: edsnlp-docker-cd
on:
  push:
    branches:
      - main
    paths:
      - "docker_images/edsnlp/**"
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: "3.8"
      - name: Checkout
        uses: actions/checkout@v2
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install awscli
      - uses: tailscale/github-action@v1
        with:
          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
      - name: Update upstream
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
          DEFAULT_HOSTNAME: ${{ secrets.DEFAULT_HOSTNAME }}
          REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          python build_docker.py edsnlp --out out.txt
      - name: Deploy on API
        run: |
          # Load the tags into the env
          cat out.txt >> $GITHUB_ENV
          export $(xargs < out.txt)
          echo ${EDSNLP_CPU_TAG}
          # Weird single quote escape mechanism because string interpolation does
          # not work on single quote in bash
          curl -H "Authorization: Bearer ${{ secrets.API_GITHUB_TOKEN }}" https://api.github.com/repos/huggingface/api-inference/actions/workflows/update_community.yaml/dispatches -d '{"ref":"main","inputs":{"framework":"EDSNLP","tag": "'"${EDSNLP_CPU_TAG}"'"}}'

26 changes: 26 additions & 0 deletions .github/workflows/python-api-edsnlp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# CI: build and exercise the edsnlp docker image on every pull request that
# touches it.
name: edsnlp-docker

on:
  pull_request:
    paths:
      - "docker_images/edsnlp/**"
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: "3.8"
      - name: Checkout
        uses: actions/checkout@v2
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install pytest pillow httpx
          pip install -e .
      - run: RUN_DOCKER_TESTS=1 pytest -sv tests/test_dockers.py::DockerImageTests::test_edsnlp
30 changes: 30 additions & 0 deletions docker_images/edsnlp/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
FROM tiangolo/uvicorn-gunicorn:python3.8
LABEL maintainer="Omar Sanseviero [email protected]"

# Add any system dependency here
# RUN apt-get update -y && apt-get install libXXX -y

# Install python dependencies first so this layer stays cached when only
# application code changes.
COPY ./requirements.txt /app
RUN pip install --no-cache-dir -r requirements.txt
# prestart.sh is executed by the base image before the server starts.
COPY ./prestart.sh /app/


# Most DL models are quite large in terms of memory, using workers is a HUGE
# slowdown because of the fork and GIL with python.
# Using multiple pods seems like a better default strategy.
# Feel free to override if it does not make sense for your library.
ARG max_workers=1
ENV MAX_WORKERS=$max_workers
# /data is presumably a mounted volume shared across restarts — TODO confirm.
ENV HUGGINGFACE_HUB_CACHE=/data
ENV PIP_CACHE=/data

# Necessary on GPU environment docker.
# TIMEOUT env variable is used by nvcr.io/nvidia/pytorch:xx for another purpose
# rendering TIMEOUT defined by uvicorn impossible to use correctly
# We're overriding it to be renamed UVICORN_TIMEOUT
# UVICORN_TIMEOUT is a useful variable for very large models that take more
# than 30s (the default) to load in memory.
# If UVICORN_TIMEOUT is too low, uvicorn will simply never loads as it will
# kill workers all the time before they finish.
RUN sed -i 's/TIMEOUT/UVICORN_TIMEOUT/g' /gunicorn_conf.py
# Application code last so code edits do not bust the dependency layers.
COPY ./app /app/app
Empty file.
95 changes: 95 additions & 0 deletions docker_images/edsnlp/app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import functools
import logging
import os
from typing import Dict, Type

from api_inference_community.routes import pipeline_route, status_ok
from app.pipelines import (
Pipeline,
TokenClassificationPipeline,
)
from starlette.applications import Starlette
from starlette.middleware import Middleware
from starlette.middleware.gzip import GZipMiddleware
from starlette.routing import Route


# Task and model to serve; expected to be provided through the environment
# at container start.
TASK = os.getenv("TASK")
MODEL_ID = os.getenv("MODEL_ID")


logger = logging.getLogger(__name__)


# Add the allowed tasks
# Supported tasks are:
# - text-generation
# - text-classification
# - token-classification
# - translation
# - summarization
# - automatic-speech-recognition
# - sentence-similarity
# - ...
# For instance
# from app.pipelines import AutomaticSpeechRecognitionPipeline
# ALLOWED_TASKS = {"automatic-speech-recognition": AutomaticSpeechRecognitionPipeline}
# You can check the requirements and expectations of each pipelines in their respective
# directories. Implement directly within the directories.
# Registry mapping a task name to the Pipeline subclass implementing it;
# get_pipeline below uses it both to validate TASK and to instantiate.
ALLOWED_TASKS: Dict[str, Type[Pipeline]] = {
    "token-classification": TokenClassificationPipeline,
}


@functools.lru_cache()
def get_pipeline(task=None, model_id=None) -> Pipeline:
    """Instantiate (and memoize) the pipeline serving ``task`` on ``model_id``.

    Both arguments fall back to the TASK / MODEL_ID environment variables
    when omitted or falsy.

    Raises:
        EnvironmentError: if ``task`` is not registered in ALLOWED_TASKS.
    """
    if not task:
        task = os.environ["TASK"]
    if not model_id:
        model_id = os.environ["MODEL_ID"]
    if task not in ALLOWED_TASKS:
        raise EnvironmentError(f"{task} is not a valid pipeline for model : {model_id}")
    return ALLOWED_TASKS[task](model_id)


# Order matters: the first route (no explicit methods, so GET-style requests)
# answers health checks with 200 OK on any path; POSTs fall through to the
# actual inference route.
routes = [
    Route("/{whatever:path}", status_ok),
    Route("/{whatever:path}", pipeline_route, methods=["POST"]),
]

# Compress large responses; the threshold avoids gzip overhead on tiny payloads.
middleware = [Middleware(GZipMiddleware, minimum_size=1000)]
if os.environ.get("DEBUG", "") == "1":
    from starlette.middleware.cors import CORSMiddleware

    # Fully permissive CORS, enabled in debug mode only, so a local frontend
    # can call the API directly.
    middleware.append(
        Middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_headers=["*"],
            allow_methods=["*"],
        )
    )

app = Starlette(routes=routes, middleware=middleware)


@app.on_event("startup")
async def startup_event():
    """Configure access logging and eagerly build the pipeline at boot."""
    logger = logging.getLogger("uvicorn.access")
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logger.handlers = [handler]

    # Link between `api-inference-community` and framework code.
    app.get_pipeline = get_pipeline
    try:
        get_pipeline()
    except Exception:
        # Loading may fail (bad TASK/MODEL_ID, download error). Swallow here
        # so the exception is surfaced on the first request instead of
        # crashing the worker at startup.
        pass


if __name__ == "__main__":
    # Invoked by prestart.sh before the web server boots: warms up / downloads
    # the model so the first request is fast.
    try:
        get_pipeline()
    except Exception:
        # Errors are deliberately ignored here; they will be shown when the
        # pipeline is requested at serving time.
        pass
3 changes: 3 additions & 0 deletions docker_images/edsnlp/app/pipelines/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from app.pipelines.base import Pipeline, PipelineException # isort:skip

from app.pipelines.token_classification import TokenClassificationPipeline
44 changes: 44 additions & 0 deletions docker_images/edsnlp/app/pipelines/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import datetime
import re
from abc import ABC, abstractmethod
from typing import Any

import edsnlp


class Pipeline(ABC):
    """Base class for edsnlp inference pipelines."""

    def __init__(self, model_id: str):
        # Load the edsnlp model, updating it and installing its declared
        # dependencies when required.
        self.model = edsnlp.load(model_id, auto_update=True, install_dependencies=True)

    @abstractmethod
    def __call__(self, inputs: Any) -> Any:
        raise NotImplementedError("Pipelines should implement a __call__ method")

    def parse_inputs(self, text: str):
        """
        Parse text with the following format:
        "Hello, my name is [John](PER) and I live in [New York](LOC)"
        into a Doc object with entities.
        """
        pieces = []
        spans = []
        cursor = 0
        length = 0
        for match in re.finditer(r"\[([^\]]*)\] *\(([^\)]*)\)", text):
            # Text between the previous match and this one is kept verbatim.
            prefix = text[cursor : match.start(0)]
            pieces.append(prefix)
            length += len(prefix)
            word = match.group(1)
            pieces.append(word)
            spans.append(
                {
                    "start": length,
                    "end": length + len(word),
                    # Empty labels default to a generic "ent" label.
                    "label": match.group(2) or "ent",
                }
            )
            length += len(word)
            cursor = match.end(0)
        pieces.append(text[cursor:])
        doc = self.model.make_doc("".join(pieces))
        doc._.note_datetime = datetime.datetime.now()
        # NOTE(review): char_span can return None for misaligned offsets in
        # spaCy — presumably make_doc tokenization always aligns here; confirm.
        doc.ents = [
            doc.char_span(span["start"], span["end"], span["label"]) for span in spans
        ]
        return doc


class PipelineException(Exception):
    """Exception type exported for pipeline implementations to raise."""

    pass
36 changes: 36 additions & 0 deletions docker_images/edsnlp/app/pipelines/token_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from typing import Any, Dict, List

from app.pipelines import Pipeline


class TokenClassificationPipeline(Pipeline):
    def __call__(self, inputs: str) -> List[Dict[str, Any]]:
        """
        Args:
            inputs (:obj:`str`):
                a string containing some text
        Return:
            A :obj:`list`:. The object returned should be like [{"entity_group": "XXX", "word": "some word", "start": 3, "end": 6, "score": 0.82}] containing :
            - "entity_group": A string representing what the entity is.
            - "word": A substring of the original string that was detected as an entity.
            - "start": the offset within `input` leading to `answer`. context[start:stop] == word
            - "end": the ending offset within `input` leading to `answer`. context[start:stop] === word
            - "score": A score between 0 and 1 describing how confident the model is for this entity.
            - "value": An optional value for the entity (date, amount, etc.)
        """
        # Parse the markdown-style annotations into a Doc, then run the model.
        annotated = self.model(self.parse_inputs(inputs))

        # The model gives no confidence estimate here, so score is fixed at 1.0.
        return [
            {
                "entity_group": span.label_,
                "word": span.text,
                "start": span.start_char,
                "end": span.end_char,
                "score": 1.0,
                "value": span._.value,
            }
            for span in annotated.ents
        ]
1 change: 1 addition & 0 deletions docker_images/edsnlp/prestart.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Run by the tiangolo/uvicorn-gunicorn base image before the server starts:
# executes main.py's __main__ block, which pre-loads the model so startup
# errors show up in the logs early.
python app/main.py
5 changes: 5 additions & 0 deletions docker_images/edsnlp/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Runtime dependencies for the edsnlp inference image (pinned for
# reproducible docker builds).
starlette==0.27.0
api-inference-community==0.0.32
huggingface_hub==0.23.0
# NOTE(review): the [ml] extra presumably pulls the trained-model components
# of edsnlp — confirm against the edsnlp packaging docs.
edsnlp[ml]>=0.12.0
requests==2.31.0
Empty file.
46 changes: 46 additions & 0 deletions docker_images/edsnlp/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
from typing import Dict
from unittest import TestCase, skipIf

from app.main import ALLOWED_TASKS, get_pipeline


# Must contain at least one example of each implemented pipeline
# Tests do not check the actual values of the model output, so small dummy
# models are recommended for faster tests.
# Must contain at least one example of each implemented pipeline
# Tests do not check the actual values of the model output, so small dummy
# models are recommended for faster tests.
TESTABLE_MODELS: Dict[str, str] = {
    # IMPLEMENT_THIS
    # "automatic-speech-recognition": "mysample-ASR",
    # "text-generation": "mysample-gpt2",
    "token-classification": "AP-HP/dummy-ner",
}


# A sample of tasks this image does NOT implement; get_pipeline must reject
# each of them (exercised by test_unsupported_tasks below).
ALL_TASKS = {
    "automatic-speech-recognition",
    "audio-source-separation",
    "feature-extraction",
    "image-classification",
    "question-answering",
    "sentence-similarity",
    "text-generation",
    "text-to-speech",
}


class PipelineTestCase(TestCase):
    """Sanity checks on the task registry, independent of any real model."""

    @skipIf(
        os.path.dirname(os.path.dirname(__file__)).endswith("common"),
        "common is a special case",
    )
    def test_has_at_least_one_task_enabled(self):
        # The image is useless unless it registers at least one pipeline.
        implemented_count = len(ALLOWED_TASKS.keys())
        self.assertGreater(
            implemented_count, 0, "You need to implement at least one task"
        )

    def test_unsupported_tasks(self):
        # Every task outside the registry must be rejected at pipeline
        # construction time.
        for task in ALL_TASKS - ALLOWED_TASKS.keys():
            with self.subTest(msg=task, task=task):
                with self.assertRaises(EnvironmentError):
                    get_pipeline(task, model_id="XX")
Loading