Skip to content

Adding support for edsnlp library #424

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions .github/workflows/python-api-edsnlp-cd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Continuous delivery: rebuild the edsnlp inference image and trigger the
# API deployment whenever its sources change on main.
name: edsnlp-docker-cd
on:
  push:
    branches:
      - main
    paths:
      - "docker_images/edsnlp/**"
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: "3.8"
      - name: Checkout
        uses: actions/checkout@v2
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install awscli
      - uses: tailscale/github-action@v1
        with:
          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
      - name: Update upstream
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
          DEFAULT_HOSTNAME: ${{ secrets.DEFAULT_HOSTNAME }}
          REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          python build_docker.py edsnlp --out out.txt
      - name: Deploy on API
        run: |
          # Load the tags into the env
          cat out.txt >> $GITHUB_ENV
          export $(xargs < out.txt)
          echo ${EDSNLP_CPU_TAG}
          # Weird single quote escape mechanism because string interpolation does
          # not work on single quote in bash
          curl -H "Authorization: Bearer ${{ secrets.API_GITHUB_TOKEN }}" https://api.github.com/repos/huggingface/api-inference/actions/workflows/update_community.yaml/dispatches -d '{"ref":"main","inputs":{"framework":"EDSNLP","tag": "'"${EDSNLP_CPU_TAG}"'"}}'

26 changes: 26 additions & 0 deletions .github/workflows/python-api-edsnlp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# CI: build and exercise the edsnlp docker image on every pull request that
# touches it.
name: edsnlp-docker

on:
  pull_request:
    paths:
      - "docker_images/edsnlp/**"
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: "3.8"
      - name: Checkout
        uses: actions/checkout@v2
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install pytest pillow httpx
          pip install -e .
      - run: RUN_DOCKER_TESTS=1 pytest -sv tests/test_dockers.py::DockerImageTests::test_edsnlp
30 changes: 30 additions & 0 deletions docker_images/edsnlp/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
FROM tiangolo/uvicorn-gunicorn:python3.8
LABEL maintainer="Omar Sanseviero [email protected]"

# Add any system dependency here
# RUN apt-get update -y && apt-get install libXXX -y

# Install python dependencies first so this layer stays cached when only
# application code changes.
COPY ./requirements.txt /app
RUN pip install --no-cache-dir -r requirements.txt
# prestart.sh is executed by the base image before the server starts.
COPY ./prestart.sh /app/


# Most DL models are quite large in terms of memory, using workers is a HUGE
# slowdown because of the fork and GIL with python.
# Using multiple pods seems like a better default strategy.
# Feel free to override if it does not make sense for your library.
ARG max_workers=1
ENV MAX_WORKERS=$max_workers
# /data is presumably a mounted volume shared across restarts — TODO confirm.
ENV HUGGINGFACE_HUB_CACHE=/data
ENV PIP_CACHE=/data

# Necessary on GPU environment docker.
# TIMEOUT env variable is used by nvcr.io/nvidia/pytorch:xx for another purpose
# rendering TIMEOUT defined by uvicorn impossible to use correctly
# We're overriding it to be renamed UVICORN_TIMEOUT
# UVICORN_TIMEOUT is a useful variable for very large models that take more
# than 30s (the default) to load in memory.
# If UVICORN_TIMEOUT is too low, uvicorn will simply never loads as it will
# kill workers all the time before they finish.
RUN sed -i 's/TIMEOUT/UVICORN_TIMEOUT/g' /gunicorn_conf.py
# Application code last so code edits do not bust the dependency layers.
COPY ./app /app/app
Empty file.
95 changes: 95 additions & 0 deletions docker_images/edsnlp/app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import functools
import logging
import os
from typing import Dict, Type

from api_inference_community.routes import pipeline_route, status_ok
from app.pipelines import (
Pipeline,
TokenClassificationPipeline,
)
from starlette.applications import Starlette
from starlette.middleware import Middleware
from starlette.middleware.gzip import GZipMiddleware
from starlette.routing import Route


# Task and model to serve; expected to be provided through the environment
# at container start.
TASK = os.getenv("TASK")
MODEL_ID = os.getenv("MODEL_ID")


logger = logging.getLogger(__name__)


# Add the allowed tasks
# Supported tasks are:
# - text-generation
# - text-classification
# - token-classification
# - translation
# - summarization
# - automatic-speech-recognition
# - sentence-similarity
# - ...
# For instance
# from app.pipelines import AutomaticSpeechRecognitionPipeline
# ALLOWED_TASKS = {"automatic-speech-recognition": AutomaticSpeechRecognitionPipeline}
# You can check the requirements and expectations of each pipelines in their respective
# directories. Implement directly within the directories.
# Registry mapping a task name to the Pipeline subclass implementing it;
# get_pipeline below uses it both to validate TASK and to instantiate.
ALLOWED_TASKS: Dict[str, Type[Pipeline]] = {
    "token-classification": TokenClassificationPipeline,
}


@functools.lru_cache()
def get_pipeline(task=None, model_id=None) -> Pipeline:
    """Instantiate (and memoize) the pipeline serving ``task`` on ``model_id``.

    Both arguments fall back to the TASK / MODEL_ID environment variables
    when omitted or falsy.

    Raises:
        EnvironmentError: if ``task`` is not registered in ALLOWED_TASKS.
    """
    if not task:
        task = os.environ["TASK"]
    if not model_id:
        model_id = os.environ["MODEL_ID"]
    if task not in ALLOWED_TASKS:
        raise EnvironmentError(f"{task} is not a valid pipeline for model : {model_id}")
    return ALLOWED_TASKS[task](model_id)


# Order matters: the first route (no explicit methods, so GET-style requests)
# answers health checks with 200 OK on any path; POSTs fall through to the
# actual inference route.
routes = [
    Route("/{whatever:path}", status_ok),
    Route("/{whatever:path}", pipeline_route, methods=["POST"]),
]

# Compress large responses; the threshold avoids gzip overhead on tiny payloads.
middleware = [Middleware(GZipMiddleware, minimum_size=1000)]
if os.environ.get("DEBUG", "") == "1":
    from starlette.middleware.cors import CORSMiddleware

    # Fully permissive CORS, enabled in debug mode only, so a local frontend
    # can call the API directly.
    middleware.append(
        Middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_headers=["*"],
            allow_methods=["*"],
        )
    )

app = Starlette(routes=routes, middleware=middleware)


@app.on_event("startup")
async def startup_event():
    """Configure access logging and eagerly build the pipeline at boot."""
    logger = logging.getLogger("uvicorn.access")
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logger.handlers = [handler]

    # Link between `api-inference-community` and framework code.
    app.get_pipeline = get_pipeline
    try:
        get_pipeline()
    except Exception:
        # Loading may fail (bad TASK/MODEL_ID, download error). Swallow here
        # so the exception is surfaced on the first request instead of
        # crashing the worker at startup.
        pass


if __name__ == "__main__":
    # Invoked by prestart.sh before the web server boots: warms up / downloads
    # the model so the first request is fast.
    try:
        get_pipeline()
    except Exception:
        # Errors are deliberately ignored here; they will be shown when the
        # pipeline is requested at serving time.
        pass
3 changes: 3 additions & 0 deletions docker_images/edsnlp/app/pipelines/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from app.pipelines.base import Pipeline, PipelineException # isort:skip

from app.pipelines.token_classification import TokenClassificationPipeline
44 changes: 44 additions & 0 deletions docker_images/edsnlp/app/pipelines/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import datetime
import re
from abc import ABC, abstractmethod
from typing import Any

import edsnlp


class Pipeline(ABC):
    """Base class for edsnlp inference pipelines."""

    def __init__(self, model_id: str):
        # Load the edsnlp model, updating it and installing its declared
        # dependencies when required.
        self.model = edsnlp.load(model_id, auto_update=True, install_dependencies=True)

    @abstractmethod
    def __call__(self, inputs: Any) -> Any:
        raise NotImplementedError("Pipelines should implement a __call__ method")

    def parse_inputs(self, text: str):
        """
        Parse text with the following format:
        "Hello, my name is [John](PER) and I live in [New York](LOC)"
        into a Doc object with entities.
        """
        pieces = []
        spans = []
        cursor = 0
        length = 0
        for match in re.finditer(r"\[([^\]]*)\] *\(([^\)]*)\)", text):
            # Text between the previous match and this one is kept verbatim.
            prefix = text[cursor : match.start(0)]
            pieces.append(prefix)
            length += len(prefix)
            word = match.group(1)
            pieces.append(word)
            spans.append(
                {
                    "start": length,
                    "end": length + len(word),
                    # Empty labels default to a generic "ent" label.
                    "label": match.group(2) or "ent",
                }
            )
            length += len(word)
            cursor = match.end(0)
        pieces.append(text[cursor:])
        doc = self.model.make_doc("".join(pieces))
        doc._.note_datetime = datetime.datetime.now()
        # NOTE(review): char_span can return None for misaligned offsets in
        # spaCy — presumably make_doc tokenization always aligns here; confirm.
        doc.ents = [
            doc.char_span(span["start"], span["end"], span["label"]) for span in spans
        ]
        return doc


class PipelineException(Exception):
    """Exception type exported for pipeline implementations to raise."""

    pass
36 changes: 36 additions & 0 deletions docker_images/edsnlp/app/pipelines/token_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from typing import Any, Dict, List

from app.pipelines import Pipeline


class TokenClassificationPipeline(Pipeline):
    def __call__(self, inputs: str) -> List[Dict[str, Any]]:
        """
        Args:
            inputs (:obj:`str`):
                a string containing some text
        Return:
            A :obj:`list`:. The object returned should be like [{"entity_group": "XXX", "word": "some word", "start": 3, "end": 6, "score": 0.82}] containing :
            - "entity_group": A string representing what the entity is.
            - "word": A substring of the original string that was detected as an entity.
            - "start": the offset within `input` leading to `answer`. context[start:stop] == word
            - "end": the ending offset within `input` leading to `answer`. context[start:stop] === word
            - "score": A score between 0 and 1 describing how confident the model is for this entity.
            - "value": An optional value for the entity (date, amount, etc.)
        """
        # Parse the markdown-style annotations into a Doc, then run the model.
        annotated = self.model(self.parse_inputs(inputs))

        # The model gives no confidence estimate here, so score is fixed at 1.0.
        return [
            {
                "entity_group": span.label_,
                "word": span.text,
                "start": span.start_char,
                "end": span.end_char,
                "score": 1.0,
                "value": span._.value,
            }
            for span in annotated.ents
        ]
1 change: 1 addition & 0 deletions docker_images/edsnlp/prestart.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Run by the tiangolo/uvicorn-gunicorn base image before the server starts:
# executes main.py's __main__ block, which pre-loads the model so startup
# errors show up in the logs early.
python app/main.py
5 changes: 5 additions & 0 deletions docker_images/edsnlp/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Runtime dependencies for the edsnlp inference image (pinned for
# reproducible docker builds).
starlette==0.27.0
api-inference-community==0.0.32
huggingface_hub==0.23.0
# NOTE(review): the [ml] extra presumably pulls the trained-model components
# of edsnlp — confirm against the edsnlp packaging docs.
edsnlp[ml]>=0.12.0
requests==2.31.0
Empty file.
46 changes: 46 additions & 0 deletions docker_images/edsnlp/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
from typing import Dict
from unittest import TestCase, skipIf

from app.main import ALLOWED_TASKS, get_pipeline


# Must contain at least one example of each implemented pipeline
# Tests do not check the actual values of the model output, so small dummy
# models are recommended for faster tests.
# Must contain at least one example of each implemented pipeline
# Tests do not check the actual values of the model output, so small dummy
# models are recommended for faster tests.
TESTABLE_MODELS: Dict[str, str] = {
    # IMPLEMENT_THIS
    # "automatic-speech-recognition": "mysample-ASR",
    # "text-generation": "mysample-gpt2",
    "token-classification": "AP-HP/dummy-ner",
}


# A sample of tasks this image does NOT implement; get_pipeline must reject
# each of them (exercised by test_unsupported_tasks below).
ALL_TASKS = {
    "automatic-speech-recognition",
    "audio-source-separation",
    "feature-extraction",
    "image-classification",
    "question-answering",
    "sentence-similarity",
    "text-generation",
    "text-to-speech",
}


class PipelineTestCase(TestCase):
    """Sanity checks on the task registry, independent of any real model."""

    @skipIf(
        os.path.dirname(os.path.dirname(__file__)).endswith("common"),
        "common is a special case",
    )
    def test_has_at_least_one_task_enabled(self):
        # The image is useless unless it registers at least one pipeline.
        implemented_count = len(ALLOWED_TASKS.keys())
        self.assertGreater(
            implemented_count, 0, "You need to implement at least one task"
        )

    def test_unsupported_tasks(self):
        # Every task outside the registry must be rejected at pipeline
        # construction time.
        for task in ALL_TASKS - ALLOWED_TASKS.keys():
            with self.subTest(msg=task, task=task):
                with self.assertRaises(EnvironmentError):
                    get_pipeline(task, model_id="XX")
Loading